From a10756ee657a4d43a48cca5c166919093930ed6b Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 21 Jan 2010 02:09:26 +0000 Subject: [PATCH] Re-implement the main strength-reduction portion of LoopStrengthReduction. This new version is much more aggressive about doing "full" reduction in cases where it reduces register pressure, and also more aggressive about rewriting induction variables to count down (or up) to zero when doing so reduces register pressure. It currently uses fairly simplistic algorithms for finding reuse opportunities, but it introduces a new framework allows it to combine multiple strategies at once to form hybrid solutions, instead of doing all full-reduction or all base+index. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@94061 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Analysis/ScalarEvolutionExpander.h | 66 +- lib/Analysis/IVUsers.cpp | 6 - lib/Analysis/ScalarEvolution.cpp | 206 +- lib/Analysis/ScalarEvolutionExpander.cpp | 228 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +- lib/Transforms/Scalar/IndVarSimplify.cpp | 14 +- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 4622 +++++++++-------- test/CodeGen/ARM/arm-negative-stride.ll | 26 + test/CodeGen/ARM/lsr-code-insertion.ll | 4 +- test/CodeGen/ARM/remat.ll | 3 +- test/CodeGen/Thumb2/lsr-deficiency.ll | 18 +- test/CodeGen/Thumb2/thumb2-ifcvt1.ll | 12 +- test/CodeGen/X86/2006-05-11-InstrSched.ll | 4 +- test/CodeGen/X86/2007-08-13-SpillerReuse.ll | 102 - .../CodeGen/X86/2007-11-30-LoadFolding-Bug.ll | 2 +- test/CodeGen/X86/2009-09-10-SpillComments.ll | 9 +- test/CodeGen/X86/full-lsr.ll | 12 +- test/CodeGen/X86/iv-users-in-other-loops.ll | 8 +- test/CodeGen/X86/loop-strength-reduce4.ll | 18 +- test/CodeGen/X86/loop-strength-reduce8.ll | 8 +- test/CodeGen/X86/lsr-reuse.ll | 159 + test/CodeGen/X86/masked-iv-safe.ll | 7 +- test/CodeGen/X86/pr3495-2.ll | 1 + test/CodeGen/X86/remat-mov-0.ll | 26 +- test/CodeGen/X86/remat-mov-1.ll | 40 - test/CodeGen/X86/subreg-to-reg-5.ll | 35 - .../IndVarSimplify/gep-with-mul-base.ll | 5 +- .../2008-08-06-CmpStride.ll | 5 +- .../change-compare-stride-trickiness-0.ll | 7 +- .../change-compare-stride-trickiness-1.ll | 8 +- .../LoopStrengthReduce/count-to-zero.ll | 2 +- .../LoopStrengthReduce/icmp_use_postinc.ll | 27 - 32 files changed, 3054 insertions(+), 2638 deletions(-) delete mode 100644 test/CodeGen/X86/2007-08-13-SpillerReuse.ll create mode 100644 test/CodeGen/X86/lsr-reuse.ll delete mode 100644 test/CodeGen/X86/remat-mov-1.ll delete mode 100644 test/CodeGen/X86/subreg-to-reg-5.ll delete mode 100644 test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index bbdd0437a1e..5bec8e445a0 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -26,19 +26,44 @@ namespace llvm { /// Clients should create an instance of this class when rewriting is needed, /// and destroy it when finished to allow the release of the associated /// memory. - struct SCEVExpander : public SCEVVisitor { + class SCEVExpander : public SCEVVisitor { ScalarEvolution &SE; std::map, AssertingVH > InsertedExpressions; std::set InsertedValues; + /// PostIncLoop - When non-null, expanded addrecs referring to the given + /// loop expanded in post-inc mode. For example, expanding {1,+,1} in + /// post-inc mode returns the add instruction that adds one to the phi + /// for {0,+,1}, as opposed to a new phi starting at 1. This is only + /// supported in non-canonical mode. + const Loop *PostIncLoop; + + /// IVIncInsertPos - When this is non-null, addrecs expanded in the + /// loop it indicates should be inserted with increments at + /// IVIncInsertPos. + const Loop *IVIncInsertLoop; + + /// IVIncInsertPos - When expanding addrecs in the IVIncInsertLoop loop, + /// insert the IV increment at this position. + Instruction *IVIncInsertPos; + + /// CanonicalMode - When true, expressions are expanded in "canonical" + /// form. In particular, addrecs are expanded as arithmetic based on + /// a canonical induction variable. When false, expression are expanded + /// in a more literal form. + bool CanonicalMode; + + protected: typedef IRBuilder BuilderType; BuilderType Builder; friend struct SCEVVisitor; public: + /// SCEVExpander - Construct a SCEVExpander in "canonical" mode. explicit SCEVExpander(ScalarEvolution &se) - : SE(se), Builder(se.getContext(), TargetFolder(se.TD)) {} + : SE(se), PostIncLoop(0), IVIncInsertLoop(0), CanonicalMode(true), + Builder(se.getContext(), TargetFolder(se.TD)) {} /// clear - Erase the contents of the InsertedExpressions map so that users /// trying to expand the same expression into multiple BasicBlocks or @@ -54,11 +79,36 @@ namespace llvm { /// expandCodeFor - Insert code to directly compute the specified SCEV /// expression into the program. The inserted code is inserted into the /// specified block. - Value *expandCodeFor(const SCEV *SH, const Type *Ty, Instruction *IP) { + Value *expandCodeFor(const SCEV *SH, const Type *Ty, Instruction *I) { + BasicBlock::iterator IP = I; + while (isInsertedInstruction(IP)) ++IP; Builder.SetInsertPoint(IP->getParent(), IP); return expandCodeFor(SH, Ty); } + /// setIVIncInsertPos - Set the current IV increment loop and position. + void setIVIncInsertPos(const Loop *L, Instruction *Pos) { + assert(!CanonicalMode && + "IV increment positions are not supported in CanonicalMode"); + IVIncInsertLoop = L; + IVIncInsertPos = Pos; + } + + /// setPostInc - If L is non-null, enable post-inc expansion for addrecs + /// referring to the given loop. If L is null, disable post-inc expansion + /// completely. Post-inc expansion is only supported in non-canonical + /// mode. + void setPostInc(const Loop *L) { + assert(!CanonicalMode && + "Post-inc expansion is not supported in CanonicalMode"); + PostIncLoop = L; + } + + /// disableCanonicalMode - Disable the behavior of expanding expressions in + /// canonical form rather than in a more literal form. Non-canonical mode + /// is useful for late optimization passes. + void disableCanonicalMode() { CanonicalMode = false; } + private: LLVMContext &getContext() const { return SE.getContext(); } @@ -121,6 +171,16 @@ namespace llvm { Value *visitUnknown(const SCEVUnknown *S) { return S->getValue(); } + + void rememberInstruction(Value *I) { + if (!PostIncLoop) InsertedValues.insert(I); + } + + Value *expandAddRecExprLiterally(const SCEVAddRecExpr *); + PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, + const Loop *L, + const Type *ExpandTy, + const Type *IntTy); }; } diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 92f00273a2c..38611ccb621 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -324,12 +324,6 @@ const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &U) const { // the actual replacement value. if (U.isUseOfPostIncrementedValue()) RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride); - // Evaluate the expression out of the loop, if possible. - if (!L->contains(U.getUser())) { - const SCEV *ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop()); - if (ExitVal->isLoopInvariant(L)) - RetVal = ExitVal; - } return RetVal; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 7389007bf2f..2f44913abd4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -1089,6 +1089,15 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, if (!isa(SExt)) return SExt; + // Force the cast to be folded into the operands of an addrec. + if (const SCEVAddRecExpr *AR = dyn_cast(Op)) { + SmallVector Ops; + for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end(); + I != E; ++I) + Ops.push_back(getAnyExtendExpr(*I, Ty)); + return getAddRecExpr(Ops, AR->getLoop()); + } + // If the expression is obviously signed, use the sext cast value. if (isa(Op)) return SExt; @@ -1204,6 +1213,17 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, "SCEVAddExpr operand types don't match!"); #endif + // If HasNSW is true and all the operands are non-negative, infer HasNUW. + if (!HasNUW && HasNSW) { + bool All = true; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (!isKnownNonNegative(Ops[i])) { + All = false; + break; + } + if (All) HasNUW = true; + } + // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, LI); @@ -1521,10 +1541,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = 0; - if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; - SCEVAddExpr *S = SCEVAllocator.Allocate(); - new (S) SCEVAddExpr(ID, Ops); - UniqueSCEVs.InsertNode(S, IP); + SCEVAddExpr *S = + static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + if (!S) { + S = SCEVAllocator.Allocate(); + new (S) SCEVAddExpr(ID, Ops); + UniqueSCEVs.InsertNode(S, IP); + } if (HasNUW) S->setHasNoUnsignedWrap(true); if (HasNSW) S->setHasNoSignedWrap(true); return S; @@ -1535,6 +1558,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, bool HasNUW, bool HasNSW) { assert(!Ops.empty() && "Cannot get empty mul!"); + if (Ops.size() == 1) return Ops[0]; #ifndef NDEBUG for (unsigned i = 1, e = Ops.size(); i != e; ++i) assert(getEffectiveSCEVType(Ops[i]->getType()) == @@ -1542,6 +1566,17 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, "SCEVMulExpr operand types don't match!"); #endif + // If HasNSW is true and all the operands are non-negative, infer HasNUW. + if (!HasNUW && HasNSW) { + bool All = true; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (!isKnownNonNegative(Ops[i])) { + All = false; + break; + } + if (All) HasNUW = true; + } + // Sort by complexity, this groups all similar expression types together. GroupByComplexity(Ops, LI); @@ -1576,6 +1611,22 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, } else if (cast(Ops[0])->getValue()->isZero()) { // If we have a multiply of zero, it will always be zero. return Ops[0]; + } else if (Ops[0]->isAllOnesValue()) { + // If we have a mul by -1 of an add, try distributing the -1 among the + // add operands. + if (Ops.size() == 2) + if (const SCEVAddExpr *Add = dyn_cast(Ops[1])) { + SmallVector NewOps; + bool AnyFolded = false; + for (SCEVAddRecExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + const SCEV *Mul = getMulExpr(Ops[0], *I); + if (!isa(Mul)) AnyFolded = true; + NewOps.push_back(Mul); + } + if (AnyFolded) + return getAddExpr(NewOps); + } } } @@ -1642,7 +1693,9 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, // It's tempting to propagate the NSW flag here, but nsw multiplication // is not associative so this isn't necessarily safe. - const SCEV *NewRec = getAddRecExpr(NewOps, AddRec->getLoop()); + const SCEV *NewRec = getAddRecExpr(NewOps, AddRec->getLoop(), + HasNUW && AddRec->hasNoUnsignedWrap(), + /*HasNSW=*/false); // If all of the other operands were loop invariant, we are done. if (Ops.size() == 1) return NewRec; @@ -1696,10 +1749,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, for (unsigned i = 0, e = Ops.size(); i != e; ++i) ID.AddPointer(Ops[i]); void *IP = 0; - if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; - SCEVMulExpr *S = SCEVAllocator.Allocate(); - new (S) SCEVMulExpr(ID, Ops); - UniqueSCEVs.InsertNode(S, IP); + SCEVMulExpr *S = + static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + if (!S) { + S = SCEVAllocator.Allocate(); + new (S) SCEVMulExpr(ID, Ops); + UniqueSCEVs.InsertNode(S, IP); + } if (HasNUW) S->setHasNoUnsignedWrap(true); if (HasNSW) S->setHasNoSignedWrap(true); return S; @@ -1842,10 +1898,24 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, return getAddRecExpr(Operands, L, HasNUW, HasNSW); // {X,+,0} --> X } + // If HasNSW is true and all the operands are non-negative, infer HasNUW. + if (!HasNUW && HasNSW) { + bool All = true; + for (unsigned i = 0, e = Operands.size(); i != e; ++i) + if (!isKnownNonNegative(Operands[i])) { + All = false; + break; + } + if (All) HasNUW = true; + } + // Canonicalize nested AddRecs in by nesting them in order of loop depth. if (const SCEVAddRecExpr *NestedAR = dyn_cast(Operands[0])) { const Loop *NestedLoop = NestedAR->getLoop(); - if (L->getLoopDepth() < NestedLoop->getLoopDepth()) { + if (L->contains(NestedLoop->getHeader()) ? + (L->getLoopDepth() < NestedLoop->getLoopDepth()) : + (!NestedLoop->contains(L->getHeader()) && + DT->dominates(L->getHeader(), NestedLoop->getHeader()))) { SmallVector NestedOperands(NestedAR->op_begin(), NestedAR->op_end()); Operands[0] = NestedAR->getStart(); @@ -1884,10 +1954,13 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, ID.AddPointer(Operands[i]); ID.AddPointer(L); void *IP = 0; - if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; - SCEVAddRecExpr *S = SCEVAllocator.Allocate(); - new (S) SCEVAddRecExpr(ID, Operands, L); - UniqueSCEVs.InsertNode(S, IP); + SCEVAddRecExpr *S = + static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + if (!S) { + S = SCEVAllocator.Allocate(); + new (S) SCEVAddRecExpr(ID, Operands, L); + UniqueSCEVs.InsertNode(S, IP); + } if (HasNUW) S->setHasNoUnsignedWrap(true); if (HasNSW) S->setHasNoSignedWrap(true); return S; @@ -2525,31 +2598,28 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { if (Accum->isLoopInvariant(L) || (isa(Accum) && cast(Accum)->getLoop() == L)) { + bool HasNUW = false; + bool HasNSW = false; + + // If the increment doesn't overflow, then neither the addrec nor + // the post-increment will overflow. + if (const AddOperator *OBO = dyn_cast(BEValueV)) { + if (OBO->hasNoUnsignedWrap()) + HasNUW = true; + if (OBO->hasNoSignedWrap()) + HasNSW = true; + } + const SCEV *StartVal = getSCEV(PN->getIncomingValue(IncomingEdge)); - const SCEVAddRecExpr *PHISCEV = - cast(getAddRecExpr(StartVal, Accum, L)); - - // If the increment doesn't overflow, then neither the addrec nor the - // post-increment will overflow. - if (const AddOperator *OBO = dyn_cast(BEValueV)) - if (OBO->getOperand(0) == PN && - getSCEV(OBO->getOperand(1)) == - PHISCEV->getStepRecurrence(*this)) { - const SCEVAddRecExpr *PostInc = PHISCEV->getPostIncExpr(*this); - if (OBO->hasNoUnsignedWrap()) { - const_cast(PHISCEV) - ->setHasNoUnsignedWrap(true); - const_cast(PostInc) - ->setHasNoUnsignedWrap(true); - } - if (OBO->hasNoSignedWrap()) { - const_cast(PHISCEV) - ->setHasNoSignedWrap(true); - const_cast(PostInc) - ->setHasNoSignedWrap(true); - } - } + const SCEV *PHISCEV = + getAddRecExpr(StartVal, Accum, L, HasNUW, HasNSW); + + // Since the no-wrap flags are on the increment, they apply to the + // post-incremented value as well. + if (Accum->isLoopInvariant(L)) + (void)getAddRecExpr(getAddExpr(StartVal, Accum), + Accum, L, HasNUW, HasNSW); // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the @@ -2781,26 +2851,29 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) { if (const SCEVAddRecExpr *AddRec = dyn_cast(S)) { const SCEV *T = getBackedgeTakenCount(AddRec->getLoop()); const SCEVConstant *Trip = dyn_cast(T); - if (!Trip) return FullSet; + ConstantRange ConservativeResult = FullSet; + + // If there's no unsigned wrap, the value will never be less than its + // initial value. + if (AddRec->hasNoUnsignedWrap()) + if (const SCEVConstant *C = dyn_cast(AddRec->getStart())) + ConservativeResult = + ConstantRange(C->getValue()->getValue(), + APInt(getTypeSizeInBits(C->getType()), 0)); // TODO: non-affine addrec - if (AddRec->isAffine()) { + if (Trip && AddRec->isAffine()) { const Type *Ty = AddRec->getType(); const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop()); if (getTypeSizeInBits(MaxBECount->getType()) <= getTypeSizeInBits(Ty)) { MaxBECount = getNoopOrZeroExtend(MaxBECount, Ty); const SCEV *Start = AddRec->getStart(); - const SCEV *Step = AddRec->getStepRecurrence(*this); const SCEV *End = AddRec->evaluateAtIteration(MaxBECount, *this); // Check for overflow. - // TODO: This is very conservative. - if (!(Step->isOne() && - isKnownPredicate(ICmpInst::ICMP_ULT, Start, End)) && - !(Step->isAllOnesValue() && - isKnownPredicate(ICmpInst::ICMP_UGT, Start, End))) - return FullSet; + if (!AddRec->hasNoUnsignedWrap()) + return ConservativeResult; ConstantRange StartRange = getUnsignedRange(Start); ConstantRange EndRange = getUnsignedRange(End); @@ -2809,10 +2882,12 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) { APInt Max = APIntOps::umax(StartRange.getUnsignedMax(), EndRange.getUnsignedMax()); if (Min.isMinValue() && Max.isMaxValue()) - return FullSet; + return ConservativeResult; return ConstantRange(Min, Max+1); } } + + return ConservativeResult; } if (const SCEVUnknown *U = dyn_cast(S)) { @@ -2891,26 +2966,39 @@ ScalarEvolution::getSignedRange(const SCEV *S) { if (const SCEVAddRecExpr *AddRec = dyn_cast(S)) { const SCEV *T = getBackedgeTakenCount(AddRec->getLoop()); const SCEVConstant *Trip = dyn_cast(T); - if (!Trip) return FullSet; + ConstantRange ConservativeResult = FullSet; + + // If there's no signed wrap, and all the operands have the same sign or + // zero, the value won't ever change sign. + if (AddRec->hasNoSignedWrap()) { + bool AllNonNeg = true; + bool AllNonPos = true; + for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) { + if (!isKnownNonNegative(AddRec->getOperand(i))) AllNonNeg = false; + if (!isKnownNonPositive(AddRec->getOperand(i))) AllNonPos = false; + } + unsigned BitWidth = getTypeSizeInBits(AddRec->getType()); + if (AllNonNeg) + ConservativeResult = ConstantRange(APInt(BitWidth, 0), + APInt::getSignedMinValue(BitWidth)); + else if (AllNonPos) + ConservativeResult = ConstantRange(APInt::getSignedMinValue(BitWidth), + APInt(BitWidth, 1)); + } // TODO: non-affine addrec - if (AddRec->isAffine()) { + if (Trip && AddRec->isAffine()) { const Type *Ty = AddRec->getType(); const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop()); if (getTypeSizeInBits(MaxBECount->getType()) <= getTypeSizeInBits(Ty)) { MaxBECount = getNoopOrZeroExtend(MaxBECount, Ty); const SCEV *Start = AddRec->getStart(); - const SCEV *Step = AddRec->getStepRecurrence(*this); const SCEV *End = AddRec->evaluateAtIteration(MaxBECount, *this); // Check for overflow. - // TODO: This is very conservative. - if (!(Step->isOne() && - isKnownPredicate(ICmpInst::ICMP_SLT, Start, End)) && - !(Step->isAllOnesValue() && - isKnownPredicate(ICmpInst::ICMP_SGT, Start, End))) - return FullSet; + if (!AddRec->hasNoSignedWrap()) + return ConservativeResult; ConstantRange StartRange = getSignedRange(Start); ConstantRange EndRange = getSignedRange(End); @@ -2919,15 +3007,19 @@ ScalarEvolution::getSignedRange(const SCEV *S) { APInt Max = APIntOps::smax(StartRange.getSignedMax(), EndRange.getSignedMax()); if (Min.isMinSignedValue() && Max.isMaxSignedValue()) - return FullSet; + return ConservativeResult; return ConstantRange(Min, Max+1); } } + + return ConservativeResult; } if (const SCEVUnknown *U = dyn_cast(S)) { // For a SCEVUnknown, ask ValueTracking. unsigned BitWidth = getTypeSizeInBits(U->getType()); + if (!U->getValue()->getType()->isInteger() && !TD) + return FullSet; unsigned NS = ComputeNumSignBits(U->getValue(), TD); if (NS == 1) return FullSet; diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 2c66f1ac2c5..b049f424fa8 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -81,7 +81,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) { Instruction *I = CastInst::Create(Op, V, Ty, V->getName(), A->getParent()->getEntryBlock().begin()); - InsertedValues.insert(I); + rememberInstruction(I); return I; } @@ -114,7 +114,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) { IP = II->getNormalDest()->begin(); while (isa(IP)) ++IP; Instruction *CI = CastInst::Create(Op, V, Ty, V->getName(), IP); - InsertedValues.insert(CI); + rememberInstruction(CI); return CI; } @@ -144,7 +144,7 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, // If we haven't found this binop, insert it. Value *BO = Builder.CreateBinOp(Opcode, LHS, RHS, "tmp"); - InsertedValues.insert(BO); + rememberInstruction(BO); return BO; } @@ -491,22 +491,39 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, // Emit a GEP. Value *GEP = Builder.CreateGEP(V, Idx, "uglygep"); - InsertedValues.insert(GEP); + rememberInstruction(GEP); return GEP; } // Insert a pretty getelementptr. Note that this GEP is not marked inbounds, // because ScalarEvolution may have changed the address arithmetic to // compute a value which is beyond the end of the allocated object. - Value *GEP = Builder.CreateGEP(V, + Value *Casted = V; + if (V->getType() != PTy) + Casted = InsertNoopCastOfTo(Casted, PTy); + Value *GEP = Builder.CreateGEP(Casted, GepIndices.begin(), GepIndices.end(), "scevgep"); Ops.push_back(SE.getUnknown(GEP)); - InsertedValues.insert(GEP); + rememberInstruction(GEP); return expand(SE.getAddExpr(Ops)); } +/// isNonConstantNegative - Return true if the specified scev is negated, but +/// not a constant. +static bool isNonConstantNegative(const SCEV *F) { + const SCEVMulExpr *Mul = dyn_cast(F); + if (!Mul) return false; + + // If there is a constant factor, it will be first. + const SCEVConstant *SC = dyn_cast(Mul->getOperand(0)); + if (!SC) return false; + + // Return true if the value is negative, this matches things like (-42 * V). + return SC->getValue()->getValue().isNegative(); +} + Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { int NumOperands = S->getNumOperands(); const Type *Ty = SE.getEffectiveSCEVType(S->getType()); @@ -539,8 +556,14 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Emit a bunch of add instructions for (int i = NumOperands-1; i >= 0; --i) { if (i == PIdx) continue; - Value *W = expandCodeFor(S->getOperand(i), Ty); - V = InsertBinop(Instruction::Add, V, W); + const SCEV *Op = S->getOperand(i); + if (isNonConstantNegative(Op)) { + Value *W = expandCodeFor(SE.getNegativeSCEV(Op), Ty); + V = InsertBinop(Instruction::Sub, V, W); + } else { + Value *W = expandCodeFor(Op, Ty); + V = InsertBinop(Instruction::Add, V, W); + } } return V; } @@ -603,7 +626,175 @@ static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest, } } +/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand +/// the base addrec, which is the addrec without any non-loop-dominating +/// values, and return the PHI. +PHINode * +SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, + const Loop *L, + const Type *ExpandTy, + const Type *IntTy) { + // Reuse a previously-inserted PHI, if present. + for (BasicBlock::iterator I = L->getHeader()->begin(); + PHINode *PN = dyn_cast(I); ++I) + if (isInsertedInstruction(PN) && SE.getSCEV(PN) == Normalized) + return PN; + + // Save the original insertion point so we can restore it when we're done. + BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); + BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); + + // Expand code for the start value. + Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy, + L->getHeader()->begin()); + + // Expand code for the step value. Insert instructions right before the + // terminator corresponding to the back-edge. Do this before creating the PHI + // so that PHI reuse code doesn't see an incomplete PHI. If the stride is + // negative, insert a sub instead of an add for the increment (unless it's a + // constant, because subtracts of constants are canonicalized to adds). + const SCEV *Step = Normalized->getStepRecurrence(SE); + bool isPointer = isa(ExpandTy); + bool isNegative = !isPointer && isNonConstantNegative(Step); + if (isNegative) + Step = SE.getNegativeSCEV(Step); + Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin()); + + // Create the PHI. + Builder.SetInsertPoint(L->getHeader(), L->getHeader()->begin()); + PHINode *PN = Builder.CreatePHI(ExpandTy, "lsr.iv"); + rememberInstruction(PN); + + // Create the step instructions and populate the PHI. + BasicBlock *Header = L->getHeader(); + for (pred_iterator HPI = pred_begin(Header), HPE = pred_end(Header); + HPI != HPE; ++HPI) { + BasicBlock *Pred = *HPI; + + // Add a start value. + if (!L->contains(Pred)) { + PN->addIncoming(StartV, Pred); + continue; + } + + // Create a step value and add it to the PHI. If IVIncInsertLoop is + // non-null and equal to the addrec's loop, insert the instructions + // at IVIncInsertPos. + Instruction *InsertPos = L == IVIncInsertLoop ? + IVIncInsertPos : Pred->getTerminator(); + Builder.SetInsertPoint(InsertPos->getParent(), InsertPos); + Value *IncV; + // If the PHI is a pointer, use a GEP, otherwise use an add or sub. + if (isPointer) { + const PointerType *GEPPtrTy = cast(ExpandTy); + // If the step isn't constant, don't use an implicitly scaled GEP, because + // that would require a multiply inside the loop. + if (!isa(StepV)) + GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()), + GEPPtrTy->getAddressSpace()); + const SCEV *const StepArray[1] = { SE.getSCEV(StepV) }; + IncV = expandAddToGEP(StepArray, StepArray+1, GEPPtrTy, IntTy, PN); + if (IncV->getType() != PN->getType()) { + IncV = Builder.CreateBitCast(IncV, PN->getType(), "tmp"); + rememberInstruction(IncV); + } + } else { + IncV = isNegative ? + Builder.CreateSub(PN, StepV, "lsr.iv.next") : + Builder.CreateAdd(PN, StepV, "lsr.iv.next"); + rememberInstruction(IncV); + } + PN->addIncoming(IncV, Pred); + } + + // Restore the original insert point. + if (SaveInsertBB) + Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); + + // Remember this PHI, even in post-inc mode. + InsertedValues.insert(PN); + + return PN; +} + +Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { + const Type *STy = S->getType(); + const Type *IntTy = SE.getEffectiveSCEVType(STy); + const Loop *L = S->getLoop(); + + // Determine a normalized form of this expression, which is the expression + // before any post-inc adjustment is made. + const SCEVAddRecExpr *Normalized = S; + if (L == PostIncLoop) { + const SCEV *Step = S->getStepRecurrence(SE); + Normalized = cast(SE.getMinusSCEV(S, Step)); + } + + // Strip off any non-loop-dominating component from the addrec start. + const SCEV *Start = Normalized->getStart(); + const SCEV *PostLoopOffset = 0; + if (!Start->properlyDominates(L->getHeader(), SE.DT)) { + PostLoopOffset = Start; + Start = SE.getIntegerSCEV(0, Normalized->getType()); + Normalized = + cast(SE.getAddRecExpr(Start, + Normalized->getStepRecurrence(SE), + Normalized->getLoop())); + } + + // Strip off any non-loop-dominating component from the addrec step. + const SCEV *Step = Normalized->getStepRecurrence(SE); + const SCEV *PostLoopScale = 0; + if (!Step->hasComputableLoopEvolution(L) && + !Step->dominates(L->getHeader(), SE.DT)) { + PostLoopScale = Step; + Step = SE.getIntegerSCEV(1, Normalized->getType()); + Normalized = + cast(SE.getAddRecExpr(Start, Step, + Normalized->getLoop())); + } + + // Expand the core addrec. If we need post-loop scaling, force it to + // expand to an integer type to avoid the need for additional casting. + const Type *ExpandTy = PostLoopScale ? IntTy : STy; + PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy); + + // Accomodate post-inc mode, if necessary. + Value *Result; + if (L != PostIncLoop) + Result = PN; + else { + // In PostInc mode, use the post-incremented value. + BasicBlock *LatchBlock = L->getLoopLatch(); + assert(LatchBlock && "PostInc mode requires a unique loop latch!"); + Result = PN->getIncomingValueForBlock(LatchBlock); + } + + // Re-apply any non-loop-dominating scale. + if (PostLoopScale) { + Result = Builder.CreateMul(Result, + expandCodeFor(PostLoopScale, IntTy)); + rememberInstruction(Result); + } + + // Re-apply any non-loop-dominating offset. + if (PostLoopOffset) { + if (const PointerType *PTy = dyn_cast(ExpandTy)) { + const SCEV *const OffsetArray[1] = { PostLoopOffset }; + Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result); + } else { + Result = Builder.CreateAdd(Result, + expandCodeFor(PostLoopOffset, IntTy)); + rememberInstruction(Result); + } + } + + return Result; +} + Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { + if (!CanonicalMode) return expandAddRecExprLiterally(S); + const Type *Ty = SE.getEffectiveSCEVType(S->getType()); const Loop *L = S->getLoop(); @@ -681,7 +872,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // specified loop. BasicBlock *Header = L->getHeader(); PHINode *PN = PHINode::Create(Ty, "indvar", Header->begin()); - InsertedValues.insert(PN); + rememberInstruction(PN); Constant *One = ConstantInt::get(Ty, 1); for (pred_iterator HPI = pred_begin(Header), HPE = pred_end(Header); @@ -691,7 +882,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // corresponding to the back-edge. Instruction *Add = BinaryOperator::CreateAdd(PN, One, "indvar.next", (*HPI)->getTerminator()); - InsertedValues.insert(Add); + rememberInstruction(Add); PN->addIncoming(Add, *HPI); } else { PN->addIncoming(Constant::getNullValue(Ty), *HPI); @@ -738,7 +929,7 @@ Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) { Value *V = expandCodeFor(S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType())); Value *I = Builder.CreateTrunc(V, Ty, "tmp"); - InsertedValues.insert(I); + rememberInstruction(I); return I; } @@ -747,7 +938,7 @@ Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) { Value *V = expandCodeFor(S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType())); Value *I = Builder.CreateZExt(V, Ty, "tmp"); - InsertedValues.insert(I); + rememberInstruction(I); return I; } @@ -756,7 +947,7 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { Value *V = expandCodeFor(S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType())); Value *I = Builder.CreateSExt(V, Ty, "tmp"); - InsertedValues.insert(I); + rememberInstruction(I); return I; } @@ -772,9 +963,9 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { } Value *RHS = expandCodeFor(S->getOperand(i), Ty); Value *ICmp = Builder.CreateICmpSGT(LHS, RHS, "tmp"); - InsertedValues.insert(ICmp); + rememberInstruction(ICmp); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax"); - InsertedValues.insert(Sel); + rememberInstruction(Sel); LHS = Sel; } // In the case of mixed integer and pointer types, cast the @@ -796,9 +987,9 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { } Value *RHS = expandCodeFor(S->getOperand(i), Ty); Value *ICmp = Builder.CreateICmpUGT(LHS, RHS, "tmp"); - InsertedValues.insert(ICmp); + rememberInstruction(ICmp); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax"); - InsertedValues.insert(Sel); + rememberInstruction(Sel); LHS = Sel; } // In the case of mixed integer and pointer types, cast the @@ -863,7 +1054,8 @@ Value *SCEVExpander::expand(const SCEV *S) { Value *V = visit(S); // Remember the expanded value for this SCEV at this location. - InsertedExpressions[std::make_pair(S, InsertPt)] = V; + if (!PostIncLoop) + InsertedExpressions[std::make_pair(S, InsertPt)] = V; Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); return V; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 01cad71977a..91e04838c90 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -944,7 +944,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. - if (ShVal.getNode()->getOpcode() == ISD::ADD && ShVal.hasOneUse() && + if (ShVal.getNode()->getOpcode() == ISD::ADD && isa(ShVal.getNode()->getOperand(1))) { AM.IndexReg = ShVal.getNode()->getOperand(0); ConstantSDNode *AddVal = diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index ce1307c8df3..17f7d985097 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -471,6 +471,13 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType, // Compute the final addrec to expand into code. const SCEV *AR = IU->getReplacementExpr(*UI); + // Evaluate the expression out of the loop, if possible. + if (!L->contains(UI->getUser())) { + const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); + if (ExitVal->isLoopInvariant(L)) + AR = ExitVal; + } + // FIXME: It is an extremely bad idea to indvar substitute anything more // complex than affine induction variables. Doing so will put expensive // polynomial evaluations inside of the loop, and the str reduction pass @@ -522,11 +529,10 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType, Rewriter.clear(); // Now that we're done iterating through lists, clean up any instructions // which are now dead. - while (!DeadInsts.empty()) { - Instruction *Inst = dyn_cast_or_null(DeadInsts.pop_back_val()); - if (Inst) + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null(DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); - } } /// If there's a single exit block, sink any loop-invariant values that diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index fa820ed8e40..6456ca1eb0c 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -17,6 +17,45 @@ // available on the target, and it performs a variety of other optimizations // related to loop induction variables. // +// Terminology note: this code has a lot of handling for "post-increment" or +// "post-inc" users. This is not talking about post-increment addressing modes; +// it is instead talking about code like this: +// +// %i = phi [ 0, %entry ], [ %i.next, %latch ] +// ... +// %i.next = add %i, 1 +// %c = icmp eq %i.next, %n +// +// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however +// it's useful to think about these as the same register, with some uses using +// the value of the register before the add and some using // it after. In this +// example, the icmp is a post-increment user, since it uses %i.next, which is +// the value of the induction variable after the increment. The other common +// case of post-increment users is users outside the loop. +// +// TODO: More sophistication in the way Formulae are generated. +// +// TODO: Handle multiple loops at a time. +// +// TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is +// that {0,+,1}<%bb> is getting picked first because all 7 uses can +// use it, and while it's a pretty good solution, it means that LSR +// doesn't look further to find an even better solution. +// +// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr +// instead of a GlobalValue? +// +// TODO: When truncation is free, truncate ICmp users' operands to make it a +// smaller encoding (on x86 at least). +// +// TODO: When a negated register is used by an add (such as in a list of +// multiple base registers, or as the increment expression in an addrec), +// we may not actually need both reg and (-1 * reg) in registers; the +// negation can be implemented by using a sub instead of an add. The +// lack of support for taking this into consideration when making +// register pressure decisions is partly worked around by the "Special" +// use kind. +// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-reduce" @@ -26,2167 +65,1399 @@ #include "llvm/IntrinsicInst.h" #include "llvm/DerivedTypes.h" #include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include using namespace llvm; -STATISTIC(NumReduced , "Number of IV uses strength reduced"); -STATISTIC(NumInserted, "Number of PHIs inserted"); -STATISTIC(NumVariable, "Number of PHIs with variable strides"); -STATISTIC(NumEliminated, "Number of strides eliminated"); -STATISTIC(NumShadow, "Number of Shadow IVs optimized"); -STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses"); -STATISTIC(NumLoopCond, "Number of loop terminating conds optimized"); -STATISTIC(NumCountZero, "Number of count iv optimized to count toward zero"); - -static cl::opt EnableFullLSRMode("enable-full-lsr", - cl::init(false), - cl::Hidden); - namespace { - struct BasedUser; +// Constant strides come first which in turns are sorted by their absolute +// values. If absolute values are the same, then positive strides comes first. +// e.g. +// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X +struct StrideCompare { + const ScalarEvolution &SE; + explicit StrideCompare(const ScalarEvolution &se) : SE(se) {} + + bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const { + const SCEVConstant *LHSC = dyn_cast(LHS); + const SCEVConstant *RHSC = dyn_cast(RHS); + if (LHSC && RHSC) { + unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()), + SE.getTypeSizeInBits(RHS->getType())); + APInt LV = LHSC->getValue()->getValue(); + APInt RV = RHSC->getValue()->getValue(); + LV.sextOrTrunc(BitWidth); + RV.sextOrTrunc(BitWidth); + APInt ALV = LV.abs(); + APInt ARV = RV.abs(); + if (ALV == ARV) { + if (LV != RV) + return LV.sgt(RV); + } else { + return ALV.ult(ARV); + } - /// IVInfo - This structure keeps track of one IV expression inserted during - /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as - /// well as the PHI node and increment value created for rewrite. - struct IVExpr { - const SCEV *Stride; - const SCEV *Base; - PHINode *PHI; + // If it's the same value but different type, sort by bit width so + // that we emit larger induction variables before smaller + // ones, letting the smaller be re-written in terms of larger ones. + return SE.getTypeSizeInBits(RHS->getType()) < + SE.getTypeSizeInBits(LHS->getType()); + } + return LHSC && !RHSC; + } +}; - IVExpr(const SCEV *const stride, const SCEV *const base, PHINode *phi) - : Stride(stride), Base(base), PHI(phi) {} - }; +/// RegSortData - This class holds data which is used to order reuse +/// candidates. +class RegSortData { +public: + /// Bits - This represents the set of LSRUses (by index) which reference a + /// particular register. + SmallBitVector Bits; - /// IVsOfOneStride - This structure keeps track of all IV expression inserted - /// during StrengthReduceStridedIVUsers for a particular stride of the IV. - struct IVsOfOneStride { - std::vector IVs; + /// MaxComplexity - This represents the greatest complexity (see the comments + /// on Formula::getComplexity) seen with a particular register. + uint32_t MaxComplexity; - void addIV(const SCEV *const Stride, const SCEV *const Base, PHINode *PHI) { - IVs.push_back(IVExpr(Stride, Base, PHI)); - } - }; + /// Index - This holds an arbitrary value used as a last-resort tie breaker + /// to ensure deterministic behavior. + unsigned Index; - class LoopStrengthReduce : public LoopPass { - IVUsers *IU; - ScalarEvolution *SE; - bool Changed; - - /// IVsByStride - Keep track of all IVs that have been inserted for a - /// particular stride. - std::map IVsByStride; - - /// DeadInsts - Keep track of instructions we may have made dead, so that - /// we can remove them after we are done working. - SmallVector DeadInsts; - - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. - const TargetLowering *TLI; - - public: - static char ID; // Pass ID, replacement for typeid - explicit LoopStrengthReduce(const TargetLowering *tli = NULL) : - LoopPass(&ID), TLI(tli) {} - - bool runOnLoop(Loop *L, LPPassManager &LPM); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - // We split critical edges, so we change the CFG. However, we do update - // many analyses if they are around. - AU.addPreservedID(LoopSimplifyID); - AU.addPreserved("loops"); - AU.addPreserved("domfrontier"); - AU.addPreserved("domtree"); - - AU.addRequiredID(LoopSimplifyID); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - } + RegSortData() {} - private: - void OptimizeIndvars(Loop *L); - - /// OptimizeLoopTermCond - Change loop terminating condition to use the - /// postinc iv when possible. - void OptimizeLoopTermCond(Loop *L); - - /// OptimizeShadowIV - If IV is used in a int-to-float cast - /// inside the loop then try to eliminate the cast opeation. - void OptimizeShadowIV(Loop *L); - - /// OptimizeMax - Rewrite the loop's terminating condition - /// if it uses a max computation. - ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse); - - /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for - /// deciding when to exit the loop is used only for that purpose, try to - /// rearrange things so it counts down to a test against zero. - bool OptimizeLoopCountIV(Loop *L); - bool OptimizeLoopCountIVOfStride(const SCEV* &Stride, - IVStrideUse* &CondUse, Loop *L); - - /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a - /// single stride of IV. All of the users may have different starting - /// values, and this may not be the only stride. - void StrengthReduceIVUsersOfStride(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L); - void StrengthReduceIVUsers(Loop *L); - - ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse, - const SCEV* &CondStride, - bool PostPass = false); - - bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEV* &CondStride); - bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *, - IVExpr&, const Type*, - const std::vector& UsersToProcess); - bool ValidScale(bool, int64_t, - const std::vector& UsersToProcess); - bool ValidOffset(bool, int64_t, int64_t, - const std::vector& UsersToProcess); - const SCEV *CollectIVUsers(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L, - bool &AllUsesAreAddresses, - bool &AllUsesAreOutsideLoop, - std::vector &UsersToProcess); - bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc); - bool ShouldUseFullStrengthReductionMode( - const std::vector &UsersToProcess, - const Loop *L, - bool AllUsesAreAddresses, - const SCEV *Stride); - void PrepareToStrengthReduceFully( - std::vector &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - const Loop *L, - SCEVExpander &PreheaderRewriter); - void PrepareToStrengthReduceFromSmallerStride( - std::vector &UsersToProcess, - Value *CommonBaseV, - const IVExpr &ReuseIV, - Instruction *PreInsertPt); - void PrepareToStrengthReduceWithNewPhi( - std::vector &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - Value *CommonBaseV, - Instruction *IVIncInsertPt, - const Loop *L, - SCEVExpander &PreheaderRewriter); - - void DeleteTriviallyDeadInstructions(); - }; -} + void print(raw_ostream &OS) const; + void dump() const; +}; -char LoopStrengthReduce::ID = 0; -static RegisterPass -X("loop-reduce", "Loop Strength Reduction"); +} -Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { - return new LoopStrengthReduce(TLI); +void RegSortData::print(raw_ostream &OS) const { + OS << "[NumUses=" << Bits.count() + << ", MaxComplexity="; + OS.write_hex(MaxComplexity); + OS << ", Index=" << Index << ']'; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. -void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { - while (!DeadInsts.empty()) { - Instruction *I = dyn_cast_or_null(DeadInsts.pop_back_val()); +void RegSortData::dump() const { + print(errs()); errs() << '\n'; +} - if (I == 0 || !isInstructionTriviallyDead(I)) - continue; +namespace { - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast(*OI)) { - *OI = 0; - if (U->use_empty()) - DeadInsts.push_back(U); +/// RegCount - This is a helper class to sort a given set of registers +/// according to associated RegSortData values. +class RegCount { +public: + const SCEV *Reg; + RegSortData Sort; + + RegCount(const SCEV *R, const RegSortData &RSD) + : Reg(R), Sort(RSD) {} + + // Sort by count. Returning true means the register is preferred. + bool operator<(const RegCount &Other) const { + // Sort by the number of unique uses of this register. + unsigned A = Sort.Bits.count(); + unsigned B = Other.Sort.Bits.count(); + if (A != B) return A > B; + + if (const SCEVAddRecExpr *AR = dyn_cast(Reg)) { + const SCEVAddRecExpr *BR = dyn_cast(Other.Reg); + // AddRecs have higher priority than other things. + if (!BR) return true; + + // Prefer affine values. + if (AR->isAffine() != BR->isAffine()) + return AR->isAffine(); + + const Loop *AL = AR->getLoop(); + const Loop *BL = BR->getLoop(); + if (AL != BL) { + unsigned ADepth = AL->getLoopDepth(); + unsigned BDepth = BL->getLoopDepth(); + // Prefer a less deeply nested addrec. + if (ADepth != BDepth) + return ADepth < BDepth; + + // Different loops at the same depth; do something arbitrary. + BasicBlock *AH = AL->getHeader(); + BasicBlock *BH = BL->getHeader(); + for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I) + if (&*I == BH) return true; + return false; } - I->eraseFromParent(); - Changed = true; - } -} + // Sort addrecs by stride. + const SCEV *AStep = AR->getOperand(1); + const SCEV *BStep = BR->getOperand(1); + if (AStep != BStep) { + if (const SCEVConstant *AC = dyn_cast(AStep)) { + const SCEVConstant *BC = dyn_cast(BStep); + if (!BC) return true; + // Arbitrarily prefer wider registers. + if (AC->getValue()->getValue().getBitWidth() != + BC->getValue()->getValue().getBitWidth()) + return AC->getValue()->getValue().getBitWidth() > + BC->getValue()->getValue().getBitWidth(); + // Ignore the sign bit, assuming that striding by a negative value + // is just as easy as by a positive value. + // Prefer the addrec with the lesser absolute value stride, as it + // will allow uses to have simpler addressing modes. + return AC->getValue()->getValue().abs() + .ult(BC->getValue()->getValue().abs()); + } + } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. -static bool isAddressUse(Instruction *Inst, Value *OperandVal) { - bool isAddress = isa(Inst); - if (StoreInst *SI = dyn_cast(Inst)) { - if (SI->getOperand(1) == OperandVal) - isAddress = true; - } else if (IntrinsicInst *II = dyn_cast(Inst)) { - // Addressing modes can also be folded into prefetches and a variety - // of intrinsics. - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::prefetch: - case Intrinsic::x86_sse2_loadu_dq: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse_loadu_ps: - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: - if (II->getOperand(1) == OperandVal) - isAddress = true; - break; + // Then sort by the register which will permit the simplest uses. + // This is a heuristic; currently we only track the most complex use as a + // representative. + if (Sort.MaxComplexity != Other.Sort.MaxComplexity) + return Sort.MaxComplexity < Other.Sort.MaxComplexity; + + // Then sort them by their start values. + const SCEV *AStart = AR->getStart(); + const SCEV *BStart = BR->getStart(); + if (AStart != BStart) { + if (const SCEVConstant *AC = dyn_cast(AStart)) { + const SCEVConstant *BC = dyn_cast(BStart); + if (!BC) return true; + // Arbitrarily prefer wider registers. + if (AC->getValue()->getValue().getBitWidth() != + BC->getValue()->getValue().getBitWidth()) + return AC->getValue()->getValue().getBitWidth() > + BC->getValue()->getValue().getBitWidth(); + // Prefer positive over negative if the absolute values are the same. + if (AC->getValue()->getValue().abs() == + BC->getValue()->getValue().abs()) + return AC->getValue()->getValue().isStrictlyPositive(); + // Prefer the addrec with the lesser absolute value start. + return AC->getValue()->getValue().abs() + .ult(BC->getValue()->getValue().abs()); + } + } + } else { + // AddRecs have higher priority than other things. + if (isa(Other.Reg)) return false; + // Sort by the register which will permit the simplest uses. + // This is a heuristic; currently we only track the most complex use as a + // representative. + if (Sort.MaxComplexity != Other.Sort.MaxComplexity) + return Sort.MaxComplexity < Other.Sort.MaxComplexity; } - } - return isAddress; -} -/// getAccessType - Return the type of the memory being accessed. -static const Type *getAccessType(const Instruction *Inst) { - const Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast(Inst)) { - // Addressing modes can also be folded into prefetches and a variety - // of intrinsics. - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getOperand(1)->getType(); - break; - } + + // Tie-breaker: the arbitrary index, to ensure a reliable ordering. + return Sort.Index < Other.Sort.Index; } - return AccessTy; -} -namespace { - /// BasedUser - For a particular base value, keep information about how we've - /// partitioned the expression so far. - struct BasedUser { - /// Base - The Base value for the PHI node that needs to be inserted for - /// this use. As the use is processed, information gets moved from this - /// field to the Imm field (below). BasedUser values are sorted by this - /// field. - const SCEV *Base; - - /// Inst - The instruction using the induction variable. - Instruction *Inst; - - /// OperandValToReplace - The operand value of Inst to replace with the - /// EmittedBase. - Value *OperandValToReplace; - - /// Imm - The immediate value that should be added to the base immediately - /// before Inst, because it will be folded into the imm field of the - /// instruction. This is also sometimes used for loop-variant values that - /// must be added inside the loop. - const SCEV *Imm; - - /// Phi - The induction variable that performs the striding that - /// should be used for this user. - PHINode *Phi; - - // isUseOfPostIncrementedValue - True if this should use the - // post-incremented version of this IV, not the preincremented version. - // This can only be set in special cases, such as the terminating setcc - // instruction for a loop and uses outside the loop that are dominated by - // the loop. - bool isUseOfPostIncrementedValue; - - BasedUser(IVStrideUse &IVSU, ScalarEvolution *se) - : Base(IVSU.getOffset()), Inst(IVSU.getUser()), - OperandValToReplace(IVSU.getOperandValToReplace()), - Imm(se->getIntegerSCEV(0, Base->getType())), - isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {} - - // Once we rewrite the code to insert the new IVs we want, update the - // operands of Inst to use the new expression 'NewBase', with 'Imm' added - // to it. - void RewriteInstructionToUseNewBase(const SCEV *NewBase, - Instruction *InsertPt, - SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl &DeadInsts, - ScalarEvolution *SE); - - Value *InsertCodeForBaseAtPosition(const SCEV *NewBase, - const Type *Ty, - SCEVExpander &Rewriter, - Instruction *IP, - ScalarEvolution *SE); - void dump() const; - }; + void print(raw_ostream &OS) const; + void dump() const; +}; + } -void BasedUser::dump() const { - dbgs() << " Base=" << *Base; - dbgs() << " Imm=" << *Imm; - dbgs() << " Inst: " << *Inst; +void RegCount::print(raw_ostream &OS) const { + OS << *Reg << ':'; + Sort.print(OS); } -Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *NewBase, - const Type *Ty, - SCEVExpander &Rewriter, - Instruction *IP, - ScalarEvolution *SE) { - Value *Base = Rewriter.expandCodeFor(NewBase, 0, IP); +void RegCount::dump() const { + print(errs()); errs() << '\n'; +} - // Wrap the base in a SCEVUnknown so that ScalarEvolution doesn't try to - // re-analyze it. - const SCEV *NewValSCEV = SE->getUnknown(Base); +namespace { - // Always emit the immediate into the same block as the user. - NewValSCEV = SE->getAddExpr(NewValSCEV, Imm); +/// Formula - This class holds information that describes a formula for +/// satisfying a use. It may include broken-out immediates and scaled registers. +struct Formula { + /// AM - This is used to represent complex addressing, as well as other kinds + /// of interesting uses. + TargetLowering::AddrMode AM; - return Rewriter.expandCodeFor(NewValSCEV, Ty, IP); -} + /// BaseRegs - The list of "base" registers for this use. When this is + /// non-empty, AM.HasBaseReg should be set to true. + SmallVector BaseRegs; + /// ScaledReg - The 'scaled' register for this use. This should be non-null + /// when AM.Scale is not zero. + const SCEV *ScaledReg; -// Once we rewrite the code to insert the new IVs we want, update the -// operands of Inst to use the new expression 'NewBase', with 'Imm' added -// to it. NewBasePt is the last instruction which contributes to the -// value of NewBase in the case that it's a diffferent instruction from -// the PHI that NewBase is computed from, or null otherwise. -// -void BasedUser::RewriteInstructionToUseNewBase(const SCEV *NewBase, - Instruction *NewBasePt, - SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl &DeadInsts, - ScalarEvolution *SE) { - if (!isa(Inst)) { - // By default, insert code at the user instruction. - BasicBlock::iterator InsertPt = Inst; - - // However, if the Operand is itself an instruction, the (potentially - // complex) inserted code may be shared by many users. Because of this, we - // want to emit code for the computation of the operand right before its old - // computation. This is usually safe, because we obviously used to use the - // computation when it was computed in its current block. However, in some - // cases (e.g. use of a post-incremented induction variable) the NewBase - // value will be pinned to live somewhere after the original computation. - // In this case, we have to back off. - // - // If this is a use outside the loop (which means after, since it is based - // on a loop indvar) we use the post-incremented value, so that we don't - // artificially make the preinc value live out the bottom of the loop. - if (!isUseOfPostIncrementedValue && L->contains(Inst)) { - if (NewBasePt && isa(OperandValToReplace)) { - InsertPt = NewBasePt; - ++InsertPt; - } else if (Instruction *OpInst - = dyn_cast(OperandValToReplace)) { - InsertPt = OpInst; - while (isa(InsertPt)) ++InsertPt; - } - } - Value *NewVal = InsertCodeForBaseAtPosition(NewBase, - OperandValToReplace->getType(), - Rewriter, InsertPt, SE); - // Replace the use of the operand Value with the new Phi we just created. - Inst->replaceUsesOfWith(OperandValToReplace, NewVal); - - DEBUG(dbgs() << " Replacing with "); - DEBUG(WriteAsOperand(dbgs(), NewVal, /*PrintType=*/false)); - DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " - << *Imm << "\n"); - return; - } + Formula() : ScaledReg(0) {} - // PHI nodes are more complex. We have to insert one copy of the NewBase+Imm - // expression into each operand block that uses it. Note that PHI nodes can - // have multiple entries for the same predecessor. We use a map to make sure - // that a PHI node only has a single Value* for each predecessor (which also - // prevents us from inserting duplicate code in some blocks). - DenseMap InsertedCode; - PHINode *PN = cast(Inst); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - if (PN->getIncomingValue(i) == OperandValToReplace) { - // If the original expression is outside the loop, put the replacement - // code in the same place as the original expression, - // which need not be an immediate predecessor of this PHI. This way we - // need only one copy of it even if it is referenced multiple times in - // the PHI. We don't do this when the original expression is inside the - // loop because multiple copies sometimes do useful sinking of code in - // that case(?). - Instruction *OldLoc = dyn_cast(OperandValToReplace); - BasicBlock *PHIPred = PN->getIncomingBlock(i); - if (L->contains(OldLoc)) { - // If this is a critical edge, split the edge so that we do not insert - // the code on all predecessor/successor paths. We do this unless this - // is the canonical backedge for this loop, as this can make some - // inserted code be in an illegal position. - if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && - !isa(PHIPred->getTerminator()) && - (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { - - // First step, split the critical edge. - BasicBlock *NewBB = SplitCriticalEdge(PHIPred, PN->getParent(), - P, false); - - // Next step: move the basic block. In particular, if the PHI node - // is outside of the loop, and PredTI is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after PredTI. - if (L->contains(PHIPred) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); + unsigned getNumRegs() const; + uint32_t getComplexity() const; + const Type *getType() const; - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); - PHIPred = NewBB; - i = PN->getBasicBlockIndex(PHIPred); - } - } - Value *&Code = InsertedCode[PHIPred]; - if (!Code) { - // Insert the code into the end of the predecessor block. - Instruction *InsertPt = (L->contains(OldLoc)) ? - PHIPred->getTerminator() : - OldLoc->getParent()->getTerminator(); - Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(), - Rewriter, InsertPt, SE); - - DEBUG(dbgs() << " Changing PHI use to "); - DEBUG(WriteAsOperand(dbgs(), Code, /*PrintType=*/false)); - DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " - << *Imm << "\n"); - } + void InitialMatch(const SCEV *S, Loop *L, + ScalarEvolution &SE, DominatorTree &DT); - // Replace the use of the operand Value with the new Phi we just created. - PN->setIncomingValue(i, Code); - Rewriter.clear(); - } + /// referencesReg - Test if this formula references the given register. + bool referencesReg(const SCEV *S) const { + return S == ScaledReg || + std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } - // PHI node might have become a constant value after SplitCriticalEdge. - DeadInsts.push_back(Inst); -} - + bool operator==(const Formula &Other) const { + return BaseRegs == Other.BaseRegs && + ScaledReg == Other.ScaledReg && + AM.Scale == Other.AM.Scale && + AM.BaseOffs == Other.AM.BaseOffs && + AM.BaseGV == Other.AM.BaseGV; + } -/// fitsInAddressMode - Return true if V can be subsumed within an addressing -/// mode, and does not need to be put in a register first. -static bool fitsInAddressMode(const SCEV *V, const Type *AccessTy, - const TargetLowering *TLI, bool HasBaseReg) { - if (const SCEVConstant *SC = dyn_cast(V)) { - int64_t VC = SC->getValue()->getSExtValue(); - if (TLI) { - TargetLowering::AddrMode AM; - AM.BaseOffs = VC; - AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, AccessTy); - } else { - // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field. - return (VC > -(1 << 16) && VC < (1 << 16)-1); - } + // This sorts at least partially based on host pointer values which are + // not deterministic, so it is only usable for uniqification. + bool operator<(const Formula &Other) const { + if (BaseRegs != Other.BaseRegs) + return BaseRegs < Other.BaseRegs; + if (ScaledReg != Other.ScaledReg) + return ScaledReg < Other.ScaledReg; + if (AM.Scale != Other.AM.Scale) + return AM.Scale < Other.AM.Scale; + if (AM.BaseOffs != Other.AM.BaseOffs) + return AM.BaseOffs < Other.AM.BaseOffs; + if (AM.BaseGV != Other.AM.BaseGV) + return AM.BaseGV < Other.AM.BaseGV; + return false; } - if (const SCEVUnknown *SU = dyn_cast(V)) - if (GlobalValue *GV = dyn_cast(SU->getValue())) { - if (TLI) { - TargetLowering::AddrMode AM; - AM.BaseGV = GV; - AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, AccessTy); - } else { - // Default: assume global addresses are not legal. - } - } + void print(raw_ostream &OS) const; + void dump() const; +}; - return false; } -/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are -/// loop varying to the Imm operand. -static void MoveLoopVariantsToImmediateField(const SCEV *&Val, const SCEV *&Imm, - Loop *L, ScalarEvolution *SE) { - if (Val->isLoopInvariant(L)) return; // Nothing to do. - - if (const SCEVAddExpr *SAE = dyn_cast(Val)) { - SmallVector NewOps; - NewOps.reserve(SAE->getNumOperands()); +/// getNumRegs - Return the total number of register operands used by this +/// formula. +unsigned Formula::getNumRegs() const { + return !!ScaledReg + BaseRegs.size(); +} - for (unsigned i = 0; i != SAE->getNumOperands(); ++i) - if (!SAE->getOperand(i)->isLoopInvariant(L)) { - // If this is a loop-variant expression, it must stay in the immediate - // field of the expression. - Imm = SE->getAddExpr(Imm, SAE->getOperand(i)); - } else { - NewOps.push_back(SAE->getOperand(i)); - } +/// getComplexity - Return an oversimplified value indicating the complexity +/// of this formula. This is used as a tie-breaker in choosing register +/// preferences. +uint32_t Formula::getComplexity() const { + // Encode the information in a uint32_t so that comparing with operator< + // will be interesting. + return + // Most significant, the number of registers. This saturates because we + // need the bits, and because beyond a few hundred it doesn't really matter. + (std::min(getNumRegs(), (1u<<15)-1) << 17) | + // Having multiple base regs is worse than having a base reg and a scale. + ((BaseRegs.size() > 1) << 16) | + // Scale absolute value. + ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) | + // Scale sign, which is less significant than the absolute value. + ((AM.Scale < 0) << 8) | + // Offset absolute value. + ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) | + // If a GV is present, treat it like a maximal offset. + ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) | + // Offset sign, which is less significant than the absolute offset. + ((AM.BaseOffs < 0) << 0); +} - if (NewOps.empty()) - Val = SE->getIntegerSCEV(0, Val->getType()); - else - Val = SE->getAddExpr(NewOps); - } else if (const SCEVAddRecExpr *SARE = dyn_cast(Val)) { - // Try to pull immediates out of the start value of nested addrec's. - const SCEV *Start = SARE->getStart(); - MoveLoopVariantsToImmediateField(Start, Imm, L, SE); - - SmallVector Ops(SARE->op_begin(), SARE->op_end()); - Ops[0] = Start; - Val = SE->getAddRecExpr(Ops, SARE->getLoop()); - } else { - // Otherwise, all of Val is variant, move the whole thing over. - Imm = SE->getAddExpr(Imm, Val); - Val = SE->getIntegerSCEV(0, Val->getType()); - } +/// getType - Return the type of this formula, if it has one, or null +/// otherwise. This type is meaningless except for the bit size. +const Type *Formula::getType() const { + return !BaseRegs.empty() ? BaseRegs.front()->getType() : + ScaledReg ? ScaledReg->getType() : + AM.BaseGV ? AM.BaseGV->getType() : + 0; } +namespace { -/// MoveImmediateValues - Look at Val, and pull out any additions of constants -/// that can fit into the immediate field of instructions in the target. -/// Accumulate these immediate values into the Imm value. -static void MoveImmediateValues(const TargetLowering *TLI, - const Type *AccessTy, - const SCEV *&Val, const SCEV *&Imm, - bool isAddress, Loop *L, - ScalarEvolution *SE) { - if (const SCEVAddExpr *SAE = dyn_cast(Val)) { - SmallVector NewOps; - NewOps.reserve(SAE->getNumOperands()); +/// ComplexitySorter - A predicate which orders Formulae by the number of +/// registers they contain. +struct ComplexitySorter { + bool operator()(const Formula &LHS, const Formula &RHS) const { + unsigned L = LHS.getNumRegs(); + unsigned R = RHS.getNumRegs(); + if (L != R) return L < R; - for (unsigned i = 0; i != SAE->getNumOperands(); ++i) { - const SCEV *NewOp = SAE->getOperand(i); - MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE); + return LHS.getComplexity() < RHS.getComplexity(); + } +}; - if (!NewOp->isLoopInvariant(L)) { - // If this is a loop-variant expression, it must stay in the immediate - // field of the expression. - Imm = SE->getAddExpr(Imm, NewOp); - } else { - NewOps.push_back(NewOp); - } - } +} - if (NewOps.empty()) - Val = SE->getIntegerSCEV(0, Val->getType()); - else - Val = SE->getAddExpr(NewOps); - return; - } else if (const SCEVAddRecExpr *SARE = dyn_cast(Val)) { - // Try to pull immediates out of the start value of nested addrec's. - const SCEV *Start = SARE->getStart(); - MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE); - - if (Start != SARE->getStart()) { - SmallVector Ops(SARE->op_begin(), SARE->op_end()); - Ops[0] = Start; - Val = SE->getAddRecExpr(Ops, SARE->getLoop()); - } +/// DoInitialMatch - Recurrsion helper for InitialMatch. +static void DoInitialMatch(const SCEV *S, Loop *L, + SmallVectorImpl &Good, + SmallVectorImpl &Bad, + ScalarEvolution &SE, DominatorTree &DT) { + // Collect expressions which properly dominate the loop header. + if (S->properlyDominates(L->getHeader(), &DT)) { + Good.push_back(S); return; - } else if (const SCEVMulExpr *SME = dyn_cast(Val)) { - // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field. - if (isAddress && - fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) && - SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) { - - const SCEV *SubImm = SE->getIntegerSCEV(0, Val->getType()); - const SCEV *NewOp = SME->getOperand(1); - MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE); - - // If we extracted something out of the subexpressions, see if we can - // simplify this! - if (NewOp != SME->getOperand(1)) { - // Scale SubImm up by "8". If the result is a target constant, we are - // good. - SubImm = SE->getMulExpr(SubImm, SME->getOperand(0)); - if (fitsInAddressMode(SubImm, AccessTy, TLI, false)) { - // Accumulate the immediate. - Imm = SE->getAddExpr(Imm, SubImm); - - // Update what is left of 'Val'. - Val = SE->getMulExpr(SME->getOperand(0), NewOp); - return; - } - } - } } - // Loop-variant expressions must stay in the immediate field of the - // expression. - if ((isAddress && fitsInAddressMode(Val, AccessTy, TLI, false)) || - !Val->isLoopInvariant(L)) { - Imm = SE->getAddExpr(Imm, Val); - Val = SE->getIntegerSCEV(0, Val->getType()); + // Look at add operands. + if (const SCEVAddExpr *Add = dyn_cast(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) + DoInitialMatch(*I, L, Good, Bad, SE, DT); return; } - // Otherwise, no immediates to move. -} - -static void MoveImmediateValues(const TargetLowering *TLI, - Instruction *User, - const SCEV *&Val, const SCEV *&Imm, - bool isAddress, Loop *L, - ScalarEvolution *SE) { - const Type *AccessTy = getAccessType(User); - MoveImmediateValues(TLI, AccessTy, Val, Imm, isAddress, L, SE); -} - -/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are -/// added together. This is used to reassociate common addition subexprs -/// together for maximal sharing when rewriting bases. -static void SeparateSubExprs(SmallVector &SubExprs, - const SCEV *Expr, - ScalarEvolution *SE) { - if (const SCEVAddExpr *AE = dyn_cast(Expr)) { - for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j) - SeparateSubExprs(SubExprs, AE->getOperand(j), SE); - } else if (const SCEVAddRecExpr *SARE = dyn_cast(Expr)) { - const SCEV *Zero = SE->getIntegerSCEV(0, Expr->getType()); - if (SARE->getOperand(0) == Zero) { - SubExprs.push_back(Expr); - } else { - // Compute the addrec with zero as its base. - SmallVector Ops(SARE->op_begin(), SARE->op_end()); - Ops[0] = Zero; // Start with zero base. - SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop())); - - - SeparateSubExprs(SubExprs, SARE->getOperand(0), SE); + // Look at addrec operands. + if (const SCEVAddRecExpr *AR = dyn_cast(S)) { + if (!AR->getStart()->isZero()) { + DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); + DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + AR->getStepRecurrence(SE), + AR->getLoop()), + L, Good, Bad, SE, DT); + return; } - } else if (!Expr->isZero()) { - // Do not add zero. - SubExprs.push_back(Expr); } + + // Handle a multiplication by -1 (negation) if it didn't fold. + if (const SCEVMulExpr *Mul = dyn_cast(S)) + if (Mul->getOperand(0)->isAllOnesValue()) { + SmallVector Ops(Mul->op_begin()+1, Mul->op_end()); + const SCEV *NewMul = SE.getMulExpr(Ops); + + SmallVector MyGood; + SmallVector MyBad; + DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT); + const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( + SE.getEffectiveSCEVType(NewMul->getType()))); + for (SmallVectorImpl::const_iterator I = MyGood.begin(), + E = MyGood.end(); I != E; ++I) + Good.push_back(SE.getMulExpr(NegOne, *I)); + for (SmallVectorImpl::const_iterator I = MyBad.begin(), + E = MyBad.end(); I != E; ++I) + Bad.push_back(SE.getMulExpr(NegOne, *I)); + return; + } + + // Ok, we can't do anything interesting. Just stuff the whole thing into a + // register and hope for the best. + Bad.push_back(S); } -// This is logically local to the following function, but C++ says we have -// to make it file scope. -struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; }; - -/// RemoveCommonExpressionsFromUseBases - Look through all of the Bases of all -/// the Uses, removing any common subexpressions, except that if all such -/// subexpressions can be folded into an addressing mode for all uses inside -/// the loop (this case is referred to as "free" in comments herein) we do -/// not remove anything. This looks for things like (a+b+c) and -/// (a+c+d) and computes the common (a+c) subexpression. The common expression -/// is *removed* from the Bases and returned. -static const SCEV * -RemoveCommonExpressionsFromUseBases(std::vector &Uses, - ScalarEvolution *SE, Loop *L, - const TargetLowering *TLI) { - unsigned NumUses = Uses.size(); - - // Only one use? This is a very common case, so we handle it specially and - // cheaply. - const SCEV *Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType()); - const SCEV *Result = Zero; - const SCEV *FreeResult = Zero; - if (NumUses == 1) { - // If the use is inside the loop, use its base, regardless of what it is: - // it is clearly shared across all the IV's. If the use is outside the loop - // (which means after it) we don't want to factor anything *into* the loop, - // so just use 0 as the base. - if (L->contains(Uses[0].Inst)) - std::swap(Result, Uses[0].Base); - return Result; +/// InitialMatch - Incorporate loop-variant parts of S into this Formula, +/// attempting to keep all loop-invariant and loop-computable values in a +/// single base register. +void Formula::InitialMatch(const SCEV *S, Loop *L, + ScalarEvolution &SE, DominatorTree &DT) { + SmallVector Good; + SmallVector Bad; + DoInitialMatch(S, L, Good, Bad, SE, DT); + if (!Good.empty()) { + BaseRegs.push_back(SE.getAddExpr(Good)); + AM.HasBaseReg = true; } + if (!Bad.empty()) { + BaseRegs.push_back(SE.getAddExpr(Bad)); + AM.HasBaseReg = true; + } +} - // To find common subexpressions, count how many of Uses use each expression. - // If any subexpressions are used Uses.size() times, they are common. - // Also track whether all uses of each expression can be moved into an - // an addressing mode "for free"; such expressions are left within the loop. - // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; }; - std::map SubExpressionUseData; - - // UniqueSubExprs - Keep track of all of the subexpressions we see in the - // order we see them. - SmallVector UniqueSubExprs; - - SmallVector SubExprs; - unsigned NumUsesInsideLoop = 0; - for (unsigned i = 0; i != NumUses; ++i) { - // If the user is outside the loop, just ignore it for base computation. - // Since the user is outside the loop, it must be *after* the loop (if it - // were before, it could not be based on the loop IV). We don't want users - // after the loop to affect base computation of values *inside* the loop, - // because we can always add their offsets to the result IV after the loop - // is done, ensuring we get good code inside the loop. - if (!L->contains(Uses[i].Inst)) - continue; - NumUsesInsideLoop++; - - // If the base is zero (which is common), return zero now, there are no - // CSEs we can find. - if (Uses[i].Base == Zero) return Zero; - - // If this use is as an address we may be able to put CSEs in the addressing - // mode rather than hoisting them. - bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace); - // We may need the AccessTy below, but only when isAddrUse, so compute it - // only in that case. - const Type *AccessTy = 0; - if (isAddrUse) - AccessTy = getAccessType(Uses[i].Inst); - - // Split the expression into subexprs. - SeparateSubExprs(SubExprs, Uses[i].Base, SE); - // Add one to SubExpressionUseData.Count for each subexpr present, and - // if the subexpr is not a valid immediate within an addressing mode use, - // set SubExpressionUseData.notAllUsesAreFree. We definitely want to - // hoist these out of the loop (if they are common to all uses). - for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) { - if (++SubExpressionUseData[SubExprs[j]].Count == 1) - UniqueSubExprs.push_back(SubExprs[j]); - if (!isAddrUse || !fitsInAddressMode(SubExprs[j], AccessTy, TLI, false)) - SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true; - } - SubExprs.clear(); +void Formula::print(raw_ostream &OS) const { + bool First = true; + if (AM.BaseGV) { + if (!First) OS << " + "; else First = false; + WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); } + if (AM.BaseOffs != 0) { + if (!First) OS << " + "; else First = false; + OS << AM.BaseOffs; + } + for (SmallVectorImpl::const_iterator I = BaseRegs.begin(), + E = BaseRegs.end(); I != E; ++I) { + if (!First) OS << " + "; else First = false; + OS << "reg("; + OS << **I; + OS << ")"; + } + if (AM.Scale != 0) { + if (!First) OS << " + "; else First = false; + OS << AM.Scale << "*reg("; + if (ScaledReg) + OS << *ScaledReg; + else + OS << ""; + OS << ")"; + } +} - // Now that we know how many times each is used, build Result. Iterate over - // UniqueSubexprs so that we have a stable ordering. - for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) { - std::map::iterator I = - SubExpressionUseData.find(UniqueSubExprs[i]); - assert(I != SubExpressionUseData.end() && "Entry not found?"); - if (I->second.Count == NumUsesInsideLoop) { // Found CSE! - if (I->second.notAllUsesAreFree) - Result = SE->getAddExpr(Result, I->first); - else - FreeResult = SE->getAddExpr(FreeResult, I->first); - } else - // Remove non-cse's from SubExpressionUseData. - SubExpressionUseData.erase(I); +void Formula::dump() const { + print(errs()); errs() << '\n'; +} + +/// getSDiv - Return an expression for LHS /s RHS, if it can be determined, +/// or null otherwise. If IgnoreSignificantBits is true, expressions like +/// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may +/// overflow, which is useful when the result will be used in a context where +/// the most significant bits are ignored. +static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS, + ScalarEvolution &SE, + bool IgnoreSignificantBits = false) { + // Handle the trivial case, which works for any SCEV type. + if (LHS == RHS) + return SE.getIntegerSCEV(1, LHS->getType()); + + // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some + // folding. + if (RHS->isAllOnesValue()) + return SE.getMulExpr(LHS, RHS); + + // Check for a division of a constant by a constant. + if (const SCEVConstant *C = dyn_cast(LHS)) { + const SCEVConstant *RC = dyn_cast(RHS); + if (!RC) + return 0; + if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0) + return 0; + return SE.getConstant(C->getValue()->getValue() + .sdiv(RC->getValue()->getValue())); } - if (FreeResult != Zero) { - // We have some subexpressions that can be subsumed into addressing - // modes in every use inside the loop. However, it's possible that - // there are so many of them that the combined FreeResult cannot - // be subsumed, or that the target cannot handle both a FreeResult - // and a Result in the same instruction (for example because it would - // require too many registers). Check this. - for (unsigned i=0; icontains(Uses[i].Inst)) - continue; - // We know this is an addressing mode use; if there are any uses that - // are not, FreeResult would be Zero. - const Type *AccessTy = getAccessType(Uses[i].Inst); - if (!fitsInAddressMode(FreeResult, AccessTy, TLI, Result!=Zero)) { - // FIXME: could split up FreeResult into pieces here, some hoisted - // and some not. There is no obvious advantage to this. - Result = SE->getAddExpr(Result, FreeResult); - FreeResult = Zero; - break; - } - } + // Distribute the sdiv over addrec operands. + if (const SCEVAddRecExpr *AR = dyn_cast(LHS)) { + const SCEV *Start = getSDiv(AR->getStart(), RHS, SE, + IgnoreSignificantBits); + if (!Start) return 0; + const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE, + IgnoreSignificantBits); + if (!Step) return 0; + return SE.getAddRecExpr(Start, Step, AR->getLoop()); } - // If we found no CSE's, return now. - if (Result == Zero) return Result; - - // If we still have a FreeResult, remove its subexpressions from - // SubExpressionUseData. This means they will remain in the use Bases. - if (FreeResult != Zero) { - SeparateSubExprs(SubExprs, FreeResult, SE); - for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) { - std::map::iterator I = - SubExpressionUseData.find(SubExprs[j]); - SubExpressionUseData.erase(I); + // Distribute the sdiv over add operands. + if (const SCEVAddExpr *Add = dyn_cast(LHS)) { + SmallVector Ops; + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + const SCEV *Op = getSDiv(*I, RHS, SE, + IgnoreSignificantBits); + if (!Op) return 0; + Ops.push_back(Op); } - SubExprs.clear(); + return SE.getAddExpr(Ops); } - // Otherwise, remove all of the CSE's we found from each of the base values. - for (unsigned i = 0; i != NumUses; ++i) { - // Uses outside the loop don't necessarily include the common base, but - // the final IV value coming into those uses does. Instead of trying to - // remove the pieces of the common base, which might not be there, - // subtract off the base to compensate for this. - if (!L->contains(Uses[i].Inst)) { - Uses[i].Base = SE->getMinusSCEV(Uses[i].Base, Result); - continue; + // Check for a multiply operand that we can pull RHS out of. + if (const SCEVMulExpr *Mul = dyn_cast(LHS)) + if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) { + SmallVector Ops; + bool Found = false; + for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end(); + I != E; ++I) { + if (!Found) + if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) { + Ops.push_back(Q); + Found = true; + continue; + } + Ops.push_back(*I); + } + return Found ? SE.getMulExpr(Ops) : 0; } - // Split the expression into subexprs. - SeparateSubExprs(SubExprs, Uses[i].Base, SE); + // Otherwise we don't know. + return 0; +} - // Remove any common subexpressions. - for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) - if (SubExpressionUseData.count(SubExprs[j])) { - SubExprs.erase(SubExprs.begin()+j); - --j; --e; - } +namespace { - // Finally, add the non-shared expressions together. - if (SubExprs.empty()) - Uses[i].Base = Zero; - else - Uses[i].Base = SE->getAddExpr(SubExprs); - SubExprs.clear(); - } +/// LSRUse - This class holds the state that LSR keeps for each use in +/// IVUsers, as well as uses invented by LSR itself. It includes information +/// about what kinds of things can be folded into the user, information +/// about the user itself, and information about how the use may be satisfied. +/// TODO: Represent multiple users of the same expression in common? +class LSRUse { + SmallSet FormulaeUniquifier; + +public: + /// KindType - An enum for a kind of use, indicating what types of + /// scaled and immediate operands it might support. + enum KindType { + Basic, ///< A normal use, with no folding. + Special, ///< A special case of basic, allowing -1 scales. + Address, ///< An address use; folding according to TargetLowering + ICmpZero ///< An equality icmp with both operands folded into one. + // TODO: Add a generic icmp too? + }; - return Result; -} + KindType Kind; + const Type *AccessTy; + Instruction *UserInst; + Value *OperandValToReplace; -/// ValidScale - Check whether the given Scale is valid for all loads and -/// stores in UsersToProcess. -/// -bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale, - const std::vector& UsersToProcess) { - if (!TLI) - return true; + /// PostIncLoop - If this user is to use the post-incremented value of an + /// induction variable, this variable is non-null and holds the loop + /// associated with the induction variable. + const Loop *PostIncLoop; - for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) { - // If this is a load or other access, pass the type of the access in. - const Type *AccessTy = - Type::getVoidTy(UsersToProcess[i].Inst->getContext()); - if (isAddressUse(UsersToProcess[i].Inst, - UsersToProcess[i].OperandValToReplace)) - AccessTy = getAccessType(UsersToProcess[i].Inst); - else if (isa(UsersToProcess[i].Inst)) - continue; + /// Formulae - A list of ways to build a value that can satisfy this user. + /// After the list is populated, one of these is selected heuristically and + /// used to formulate a replacement for OperandValToReplace in UserInst. + SmallVector Formulae; - TargetLowering::AddrMode AM; - if (const SCEVConstant *SC = dyn_cast(UsersToProcess[i].Imm)) - AM.BaseOffs = SC->getValue()->getSExtValue(); - AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero(); - AM.Scale = Scale; + LSRUse() : Kind(Basic), AccessTy(0), + UserInst(0), OperandValToReplace(0), PostIncLoop(0) {} - // If load[imm+r*scale] is illegal, bail out. - if (!TLI->isLegalAddressingMode(AM, AccessTy)) - return false; - } - return true; -} + void InsertInitialFormula(const SCEV *S, Loop *L, + ScalarEvolution &SE, DominatorTree &DT); + void InsertSupplementalFormula(const SCEV *S); -/// ValidOffset - Check whether the given Offset is valid for all loads and -/// stores in UsersToProcess. -/// -bool LoopStrengthReduce::ValidOffset(bool HasBaseReg, - int64_t Offset, - int64_t Scale, - const std::vector& UsersToProcess) { - if (!TLI) - return true; + bool InsertFormula(const Formula &F); - for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) { - // If this is a load or other access, pass the type of the access in. - const Type *AccessTy = - Type::getVoidTy(UsersToProcess[i].Inst->getContext()); - if (isAddressUse(UsersToProcess[i].Inst, - UsersToProcess[i].OperandValToReplace)) - AccessTy = getAccessType(UsersToProcess[i].Inst); - else if (isa(UsersToProcess[i].Inst)) - continue; + void Rewrite(Loop *L, SCEVExpander &Rewriter, + SmallVectorImpl &DeadInsts, + ScalarEvolution &SE, DominatorTree &DT, + Pass *P) const; - TargetLowering::AddrMode AM; - if (const SCEVConstant *SC = dyn_cast(UsersToProcess[i].Imm)) - AM.BaseOffs = SC->getValue()->getSExtValue(); - AM.BaseOffs = (uint64_t)AM.BaseOffs + (uint64_t)Offset; - AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero(); - AM.Scale = Scale; + void print(raw_ostream &OS) const; + void dump() const; - // If load[imm+r*scale] is illegal, bail out. - if (!TLI->isLegalAddressingMode(AM, AccessTy)) - return false; - } - return true; -} +private: + Value *Expand(BasicBlock::iterator IP, + Loop *L, SCEVExpander &Rewriter, + SmallVectorImpl &DeadInsts, + ScalarEvolution &SE, DominatorTree &DT) const; +}; -/// RequiresTypeConversion - Returns true if converting Ty1 to Ty2 is not -/// a nop. -bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, - const Type *Ty2) { - if (Ty1 == Ty2) - return false; - Ty1 = SE->getEffectiveSCEVType(Ty1); - Ty2 = SE->getEffectiveSCEVType(Ty2); - if (Ty1 == Ty2) - return false; - if (Ty1->canLosslesslyBitCastTo(Ty2)) - return false; - if (TLI && TLI->isTruncateFree(Ty1, Ty2)) - return false; - return true; } -/// CheckForIVReuse - Returns the multiple if the stride is the multiple -/// of a previous stride and it is a legal value for the target addressing -/// mode scale component and optional base reg. This allows the users of -/// this stride to be rewritten as prev iv * factor. It returns 0 if no -/// reuse is possible. Factors can be negative on same targets, e.g. ARM. -/// -/// If all uses are outside the loop, we don't require that all multiplies -/// be folded into the addressing mode, nor even that the factor be constant; -/// a multiply (executed once) outside the loop is better than another IV -/// within. Well, usually. -const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, - bool AllUsesAreAddresses, - bool AllUsesAreOutsideLoop, - const SCEV *Stride, - IVExpr &IV, const Type *Ty, - const std::vector& UsersToProcess) { - if (const SCEVConstant *SC = dyn_cast(Stride)) { - int64_t SInt = SC->getValue()->getSExtValue(); - for (unsigned NewStride = 0, e = IU->StrideOrder.size(); - NewStride != e; ++NewStride) { - std::map::iterator SI = - IVsByStride.find(IU->StrideOrder[NewStride]); - if (SI == IVsByStride.end() || !isa(SI->first)) - continue; - // The other stride has no uses, don't reuse it. - std::map::iterator UI = - IU->IVUsesByStride.find(IU->StrideOrder[NewStride]); - if (UI->second->Users.empty()) - continue; - int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); - if (SI->first != Stride && - (unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0)) - continue; - int64_t Scale = SInt / SSInt; - // Check that this stride is valid for all the types used for loads and - // stores; if it can be used for some and not others, we might as well use - // the original stride everywhere, since we have to create the IV for it - // anyway. If the scale is 1, then we don't need to worry about folding - // multiplications. - if (Scale == 1 || - (AllUsesAreAddresses && - ValidScale(HasBaseReg, Scale, UsersToProcess))) { - // Prefer to reuse an IV with a base of zero. - for (std::vector::iterator II = SI->second.IVs.begin(), - IE = SI->second.IVs.end(); II != IE; ++II) - // Only reuse previous IV if it would not require a type conversion - // and if the base difference can be folded. - if (II->Base->isZero() && - !RequiresTypeConversion(II->Base->getType(), Ty)) { - IV = *II; - return SE->getIntegerSCEV(Scale, Stride->getType()); - } - // Otherwise, settle for an IV with a foldable base. - if (AllUsesAreAddresses) - for (std::vector::iterator II = SI->second.IVs.begin(), - IE = SI->second.IVs.end(); II != IE; ++II) - // Only reuse previous IV if it would not require a type conversion - // and if the base difference can be folded. - if (SE->getEffectiveSCEVType(II->Base->getType()) == - SE->getEffectiveSCEVType(Ty) && - isa(II->Base)) { - int64_t Base = - cast(II->Base)->getValue()->getSExtValue(); - if (Base > INT32_MIN && Base <= INT32_MAX && - ValidOffset(HasBaseReg, -Base * Scale, - Scale, UsersToProcess)) { - IV = *II; - return SE->getIntegerSCEV(Scale, Stride->getType()); - } - } - } - } - } else if (AllUsesAreOutsideLoop) { - // Accept nonconstant strides here; it is really really right to substitute - // an existing IV if we can. - for (unsigned NewStride = 0, e = IU->StrideOrder.size(); - NewStride != e; ++NewStride) { - std::map::iterator SI = - IVsByStride.find(IU->StrideOrder[NewStride]); - if (SI == IVsByStride.end() || !isa(SI->first)) - continue; - int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); - if (SI->first != Stride && SSInt != 1) - continue; - for (std::vector::iterator II = SI->second.IVs.begin(), - IE = SI->second.IVs.end(); II != IE; ++II) - // Accept nonzero base here. - // Only reuse previous IV if it would not require a type conversion. - if (!RequiresTypeConversion(II->Base->getType(), Ty)) { - IV = *II; - return Stride; - } - } - // Special case, old IV is -1*x and this one is x. Can treat this one as - // -1*old. - for (unsigned NewStride = 0, e = IU->StrideOrder.size(); - NewStride != e; ++NewStride) { - std::map::iterator SI = - IVsByStride.find(IU->StrideOrder[NewStride]); - if (SI == IVsByStride.end()) - continue; - if (const SCEVMulExpr *ME = dyn_cast(SI->first)) - if (const SCEVConstant *SC = dyn_cast(ME->getOperand(0))) - if (Stride == ME->getOperand(1) && - SC->getValue()->getSExtValue() == -1LL) - for (std::vector::iterator II = SI->second.IVs.begin(), - IE = SI->second.IVs.end(); II != IE; ++II) - // Accept nonzero base here. - // Only reuse previous IV if it would not require type conversion. - if (!RequiresTypeConversion(II->Base->getType(), Ty)) { - IV = *II; - return SE->getIntegerSCEV(-1LL, Stride->getType()); - } +/// ExtractImmediate - If S involves the addition of a constant integer value, +/// return that integer value, and mutate S to point to a new SCEV with that +/// value excluded. +static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { + if (const SCEVConstant *C = dyn_cast(S)) { + if (C->getValue()->getValue().getMinSignedBits() <= 64) { + S = SE.getIntegerSCEV(0, C->getType()); + return C->getValue()->getSExtValue(); } + } else if (const SCEVAddExpr *Add = dyn_cast(S)) { + SmallVector NewOps(Add->op_begin(), Add->op_end()); + int64_t Result = ExtractImmediate(NewOps.front(), SE); + S = SE.getAddExpr(NewOps); + return Result; + } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { + SmallVector NewOps(AR->op_begin(), AR->op_end()); + int64_t Result = ExtractImmediate(NewOps.front(), SE); + S = SE.getAddRecExpr(NewOps, AR->getLoop()); + return Result; } - return SE->getIntegerSCEV(0, Stride->getType()); -} - -/// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that -/// returns true if Val's isUseOfPostIncrementedValue is true. -static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) { - return Val.isUseOfPostIncrementedValue; -} - -/// isNonConstantNegative - Return true if the specified scev is negated, but -/// not a constant. -static bool isNonConstantNegative(const SCEV *Expr) { - const SCEVMulExpr *Mul = dyn_cast(Expr); - if (!Mul) return false; - - // If there is a constant factor, it will be first. - const SCEVConstant *SC = dyn_cast(Mul->getOperand(0)); - if (!SC) return false; - - // Return true if the value is negative, this matches things like (-42 * V). - return SC->getValue()->getValue().isNegative(); -} - -/// CollectIVUsers - Transform our list of users and offsets to a bit more -/// complex table. In this new vector, each 'BasedUser' contains 'Base', the -/// base of the strided accesses, as well as the old information from Uses. We -/// progressively move information from the Base field to the Imm field, until -/// we eventually have the full access expression to rewrite the use. -const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L, - bool &AllUsesAreAddresses, - bool &AllUsesAreOutsideLoop, - std::vector &UsersToProcess) { - // FIXME: Generalize to non-affine IV's. - if (!Stride->isLoopInvariant(L)) - return SE->getIntegerSCEV(0, Stride->getType()); - - UsersToProcess.reserve(Uses.Users.size()); - for (ilist::iterator I = Uses.Users.begin(), - E = Uses.Users.end(); I != E; ++I) { - UsersToProcess.push_back(BasedUser(*I, SE)); - - // Move any loop variant operands from the offset field to the immediate - // field of the use, so that we don't try to use something before it is - // computed. - MoveLoopVariantsToImmediateField(UsersToProcess.back().Base, - UsersToProcess.back().Imm, L, SE); - assert(UsersToProcess.back().Base->isLoopInvariant(L) && - "Base value is not loop invariant!"); + return 0; +} + +/// ExtractSymbol - If S involves the addition of a GlobalValue address, +/// return that symbol, and mutate S to point to a new SCEV with that +/// value excluded. +static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { + if (const SCEVUnknown *U = dyn_cast(S)) { + if (GlobalValue *GV = dyn_cast(U->getValue())) { + S = SE.getIntegerSCEV(0, GV->getType()); + return GV; + } + } else if (const SCEVAddExpr *Add = dyn_cast(S)) { + SmallVector NewOps(Add->op_begin(), Add->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); + S = SE.getAddExpr(NewOps); + return Result; + } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { + SmallVector NewOps(AR->op_begin(), AR->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); + S = SE.getAddRecExpr(NewOps, AR->getLoop()); + return Result; } + return 0; +} - // We now have a whole bunch of uses of like-strided induction variables, but - // they might all have different bases. We want to emit one PHI node for this - // stride which we fold as many common expressions (between the IVs) into as - // possible. Start by identifying the common expressions in the base values - // for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find - // "A+B"), emit it to the preheader, then remove the expression from the - // UsersToProcess base values. - const SCEV *CommonExprs = - RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI); - - // Next, figure out what we can represent in the immediate fields of - // instructions. If we can represent anything there, move it to the imm - // fields of the BasedUsers. We do this so that it increases the commonality - // of the remaining uses. - unsigned NumPHI = 0; - bool HasAddress = false; - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) { - // If the user is not in the current loop, this means it is using the exit - // value of the IV. Do not put anything in the base, make sure it's all in - // the immediate field to allow as much factoring as possible. - if (!L->contains(UsersToProcess[i].Inst)) { - UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, - UsersToProcess[i].Base); - UsersToProcess[i].Base = - SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType()); - } else { - // Not all uses are outside the loop. - AllUsesAreOutsideLoop = false; - - // Addressing modes can be folded into loads and stores. Be careful that - // the store is through the expression, not of the expression though. - bool isPHI = false; - bool isAddress = isAddressUse(UsersToProcess[i].Inst, - UsersToProcess[i].OperandValToReplace); - if (isa(UsersToProcess[i].Inst)) { - isPHI = true; - ++NumPHI; - } +/// isLegalUse - Test whether the use described by AM is "legal", meaning +/// it can be completely folded into the user instruction at isel time. +/// This includes address-mode folding and special icmp tricks. +static bool isLegalUse(const TargetLowering::AddrMode &AM, + LSRUse::KindType Kind, const Type *AccessTy, + const TargetLowering *TLI) { + switch (Kind) { + case LSRUse::Address: + // If we have low-level target information, ask the target if it can + // completely fold this address. + if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); + + // Otherwise, just guess that reg+reg addressing is legal. + return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; + + case LSRUse::ICmpZero: + // There's not even a target hook for querying whether it would be legal + // to fold a GV into an ICmp. + if (AM.BaseGV) + return false; - if (isAddress) - HasAddress = true; + // ICmp only has two operands; don't allow more than two non-trivial parts. + if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0) + return false; - // If this use isn't an address, then not all uses are addresses. - if (!isAddress && !isPHI) - AllUsesAreAddresses = false; + // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale + // by putting the scaled register in the other operand of the icmp. + if (AM.Scale != 0 && AM.Scale != -1) + return false; - MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base, - UsersToProcess[i].Imm, isAddress, L, SE); + // If we have low-level target information, ask the target if it can + // fold an integer immediate on an icmp. + if (AM.BaseOffs != 0) { + if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + return false; } - } - // If one of the use is a PHI node and all other uses are addresses, still - // allow iv reuse. Essentially we are trading one constant multiplication - // for one fewer iv. - if (NumPHI > 1) - AllUsesAreAddresses = false; + return true; + + case LSRUse::Basic: + // Only handle single-register values. + return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; - // There are no in-loop address uses. - if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop)) - AllUsesAreAddresses = false; + case LSRUse::Special: + // Only handle -1 scales, or no scale. + return AM.Scale == 0 || AM.Scale == -1; + } - return CommonExprs; + return false; } -/// ShouldUseFullStrengthReductionMode - Test whether full strength-reduction -/// is valid and profitable for the given set of users of a stride. In -/// full strength-reduction mode, all addresses at the current stride are -/// strength-reduced all the way down to pointer arithmetic. -/// -bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode( - const std::vector &UsersToProcess, - const Loop *L, - bool AllUsesAreAddresses, - const SCEV *Stride) { - if (!EnableFullLSRMode) - return false; +static bool isAlwaysFoldable(const SCEV *S, + bool HasBaseReg, + LSRUse::KindType Kind, const Type *AccessTy, + const TargetLowering *TLI, + ScalarEvolution &SE) { + // Fast-path: zero is always foldable. + if (S->isZero()) return true; + + // Conservatively, create an address with an immediate and a + // base and a scale. + TargetLowering::AddrMode AM; + AM.BaseOffs = ExtractImmediate(S, SE); + AM.BaseGV = ExtractSymbol(S, SE); + AM.HasBaseReg = HasBaseReg; + AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + + // If there's anything else involved, it's not foldable. + if (!S->isZero()) return false; + + return isLegalUse(AM, Kind, AccessTy, TLI); +} - // The heuristics below aim to avoid increasing register pressure, but - // fully strength-reducing all the addresses increases the number of - // add instructions, so don't do this when optimizing for size. - // TODO: If the loop is large, the savings due to simpler addresses - // may oughtweight the costs of the extra increment instructions. - if (L->getHeader()->getParent()->hasFnAttr(Attribute::OptimizeForSize)) - return false; +/// InsertFormula - If the given formula has not yet been inserted, add it +/// to the list, and return true. Return false otherwise. +bool LSRUse::InsertFormula(const Formula &F) { + Formula Copy = F; - // TODO: For now, don't do full strength reduction if there could - // potentially be greater-stride multiples of the current stride - // which could reuse the current stride IV. - if (IU->StrideOrder.back() != Stride) - return false; + // Sort the base regs, to avoid adding the same solution twice with + // the base regs in different orders. This uses host pointer values, but + // it doesn't matter since it's only used for uniquifying. + std::sort(Copy.BaseRegs.begin(), Copy.BaseRegs.end()); - // Iterate through the uses to find conditions that automatically rule out - // full-lsr mode. - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) { - const SCEV *Base = UsersToProcess[i].Base; - const SCEV *Imm = UsersToProcess[i].Imm; - // If any users have a loop-variant component, they can't be fully - // strength-reduced. - if (Imm && !Imm->isLoopInvariant(L)) - return false; - // If there are to users with the same base and the difference between - // the two Imm values can't be folded into the address, full - // strength reduction would increase register pressure. - do { - const SCEV *CurImm = UsersToProcess[i].Imm; - if ((CurImm || Imm) && CurImm != Imm) { - if (!CurImm) CurImm = SE->getIntegerSCEV(0, Stride->getType()); - if (!Imm) Imm = SE->getIntegerSCEV(0, Stride->getType()); - const Instruction *Inst = UsersToProcess[i].Inst; - const Type *AccessTy = getAccessType(Inst); - const SCEV *Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm); - if (!Diff->isZero() && - (!AllUsesAreAddresses || - !fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true))) - return false; - } - } while (++i != e && Base == UsersToProcess[i].Base); - } + DEBUG(for (SmallVectorImpl::const_iterator I = + F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) + assert(!(*I)->isZero() && "Zero allocated in a base register!"); + assert((!F.ScaledReg || !F.ScaledReg->isZero()) && + "Zero allocated in a scaled register!")); - // If there's exactly one user in this stride, fully strength-reducing it - // won't increase register pressure. If it's starting from a non-zero base, - // it'll be simpler this way. - if (UsersToProcess.size() == 1 && !UsersToProcess[0].Base->isZero()) + if (FormulaeUniquifier.insert(Copy)) { + Formulae.push_back(F); return true; + } - // Otherwise, if there are any users in this stride that don't require - // a register for their base, full strength-reduction will increase - // register pressure. - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) - if (UsersToProcess[i].Base->isZero()) - return false; - - // Otherwise, go for it. - return true; + return false; } -/// InsertAffinePhi Create and insert a PHI node for an induction variable -/// with the specified start and step values in the specified loop. -/// -/// If NegateStride is true, the stride should be negated by using a -/// subtract instead of an add. -/// -/// Return the created phi node. -/// -static PHINode *InsertAffinePhi(const SCEV *Start, const SCEV *Step, - Instruction *IVIncInsertPt, - const Loop *L, - SCEVExpander &Rewriter) { - assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!"); - assert(Step->isLoopInvariant(L) && "New PHI stride is not loop invariant!"); - - BasicBlock *Header = L->getHeader(); - BasicBlock *Preheader = L->getLoopPreheader(); - BasicBlock *LatchBlock = L->getLoopLatch(); - const Type *Ty = Start->getType(); - Ty = Rewriter.SE.getEffectiveSCEVType(Ty); - - PHINode *PN = PHINode::Create(Ty, "lsr.iv", Header->begin()); - PN->addIncoming(Rewriter.expandCodeFor(Start, Ty, Preheader->getTerminator()), - Preheader); - - // If the stride is negative, insert a sub instead of an add for the - // increment. - bool isNegative = isNonConstantNegative(Step); - const SCEV *IncAmount = Step; - if (isNegative) - IncAmount = Rewriter.SE.getNegativeSCEV(Step); - - // Insert an add instruction right before the terminator corresponding - // to the back-edge or just before the only use. The location is determined - // by the caller and passed in as IVIncInsertPt. - Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty, - Preheader->getTerminator()); - Instruction *IncV; - if (isNegative) { - IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next", - IVIncInsertPt); - } else { - IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next", - IVIncInsertPt); - } - if (!isa(StepV)) ++NumVariable; - - PN->addIncoming(IncV, LatchBlock); - - ++NumInserted; - return PN; -} - -static void SortUsersToProcess(std::vector &UsersToProcess) { - // We want to emit code for users inside the loop first. To do this, we - // rearrange BasedUser so that the entries at the end have - // isUseOfPostIncrementedValue = false, because we pop off the end of the - // vector (so we handle them first). - std::partition(UsersToProcess.begin(), UsersToProcess.end(), - PartitionByIsUseOfPostIncrementedValue); - - // Sort this by base, so that things with the same base are handled - // together. By partitioning first and stable-sorting later, we are - // guaranteed that within each base we will pop off users from within the - // loop before users outside of the loop with a particular base. - // - // We would like to use stable_sort here, but we can't. The problem is that - // const SCEV *'s don't have a deterministic ordering w.r.t to each other, so - // we don't have anything to do a '<' comparison on. Because we think the - // number of uses is small, do a horrible bubble sort which just relies on - // ==. - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) { - // Get a base value. - const SCEV *Base = UsersToProcess[i].Base; - - // Compact everything with this base to be consecutive with this one. - for (unsigned j = i+1; j != e; ++j) { - if (UsersToProcess[j].Base == Base) { - std::swap(UsersToProcess[i+1], UsersToProcess[j]); - ++i; - } - } - } +void +LSRUse::InsertInitialFormula(const SCEV *S, Loop *L, + ScalarEvolution &SE, DominatorTree &DT) { + Formula F; + F.InitialMatch(S, L, SE, DT); + bool Inserted = InsertFormula(F); + assert(Inserted && "Initial formula already exists!"); (void)Inserted; } -/// PrepareToStrengthReduceFully - Prepare to fully strength-reduce -/// UsersToProcess, meaning lowering addresses all the way down to direct -/// pointer arithmetic. -/// void -LoopStrengthReduce::PrepareToStrengthReduceFully( - std::vector &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - const Loop *L, - SCEVExpander &PreheaderRewriter) { - DEBUG(dbgs() << " Fully reducing all users\n"); - - // Rewrite the UsersToProcess records, creating a separate PHI for each - // unique Base value. - Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator(); - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) { - // TODO: The uses are grouped by base, but not sorted. We arbitrarily - // pick the first Imm value here to start with, and adjust it for the - // other uses. - const SCEV *Imm = UsersToProcess[i].Imm; - const SCEV *Base = UsersToProcess[i].Base; - const SCEV *Start = SE->getAddExpr(CommonExprs, Base, Imm); - PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L, - PreheaderRewriter); - // Loop over all the users with the same base. - do { - UsersToProcess[i].Base = SE->getIntegerSCEV(0, Stride->getType()); - UsersToProcess[i].Imm = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm); - UsersToProcess[i].Phi = Phi; - assert(UsersToProcess[i].Imm->isLoopInvariant(L) && - "ShouldUseFullStrengthReductionMode should reject this!"); - } while (++i != e && Base == UsersToProcess[i].Base); - } -} - -/// FindIVIncInsertPt - Return the location to insert the increment instruction. -/// If the only use if a use of postinc value, (must be the loop termination -/// condition), then insert it just before the use. -static Instruction *FindIVIncInsertPt(std::vector &UsersToProcess, - const Loop *L) { - if (UsersToProcess.size() == 1 && - UsersToProcess[0].isUseOfPostIncrementedValue && - L->contains(UsersToProcess[0].Inst)) - return UsersToProcess[0].Inst; - return L->getLoopLatch()->getTerminator(); +LSRUse::InsertSupplementalFormula(const SCEV *S) { + Formula F; + F.BaseRegs.push_back(S); + F.AM.HasBaseReg = true; + bool Inserted = InsertFormula(F); + assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } -/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the -/// given users to share. +/// getImmediateDominator - A handy utility for the specific DominatorTree +/// query that we need here. /// -void -LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi( - std::vector &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - Value *CommonBaseV, - Instruction *IVIncInsertPt, - const Loop *L, - SCEVExpander &PreheaderRewriter) { - DEBUG(dbgs() << " Inserting new PHI:\n"); - - PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV), - Stride, IVIncInsertPt, L, - PreheaderRewriter); - - // Remember this in case a later stride is multiple of this. - IVsByStride[Stride].addIV(Stride, CommonExprs, Phi); - - // All the users will share this new IV. - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) - UsersToProcess[i].Phi = Phi; - - DEBUG(dbgs() << " IV="); - DEBUG(WriteAsOperand(dbgs(), Phi, /*PrintType=*/false)); - DEBUG(dbgs() << "\n"); -} - -/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to -/// reuse an induction variable with a stride that is a factor of the current -/// induction variable. -/// -void -LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride( - std::vector &UsersToProcess, - Value *CommonBaseV, - const IVExpr &ReuseIV, - Instruction *PreInsertPt) { - DEBUG(dbgs() << " Rewriting in terms of existing IV of STRIDE " - << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << "\n"); - - // All the users will share the reused IV. - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) - UsersToProcess[i].Phi = ReuseIV.PHI; - - Constant *C = dyn_cast(CommonBaseV); - if (C && - (!C->isNullValue() && - !fitsInAddressMode(SE->getUnknown(CommonBaseV), CommonBaseV->getType(), - TLI, false))) - // We want the common base emitted into the preheader! This is just - // using cast as a copy so BitCast (no-op cast) is appropriate - CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(), - "commonbase", PreInsertPt); -} - -static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset, - const Type *AccessTy, - std::vector &UsersToProcess, - const TargetLowering *TLI) { - SmallVector AddrModeInsts; - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) { - if (UsersToProcess[i].isUseOfPostIncrementedValue) - continue; - ExtAddrMode AddrMode = - AddressingModeMatcher::Match(UsersToProcess[i].OperandValToReplace, - AccessTy, UsersToProcess[i].Inst, - AddrModeInsts, *TLI); - if (GV && GV != AddrMode.BaseGV) - return false; - if (Offset && !AddrMode.BaseOffs) - // FIXME: How to accurate check it's immediate offset is folded. - return false; - AddrModeInsts.clear(); - } - return true; +static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) { + DomTreeNode *Node = DT.getNode(BB); + if (!Node) return 0; + Node = Node->getIDom(); + if (!Node) return 0; + return Node->getBlock(); } -/// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a single -/// stride of IV. All of the users may have different starting values, and this -/// may not be the only stride. -void -LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L) { - // If all the users are moved to another stride, then there is nothing to do. - if (Uses.Users.empty()) - return; - - // Keep track if every use in UsersToProcess is an address. If they all are, - // we may be able to rewrite the entire collection of them in terms of a - // smaller-stride IV. - bool AllUsesAreAddresses = true; - - // Keep track if every use of a single stride is outside the loop. If so, - // we want to be more aggressive about reusing a smaller-stride IV; a - // multiply outside the loop is better than another IV inside. Well, usually. - bool AllUsesAreOutsideLoop = true; - - // Transform our list of users and offsets to a bit more complex table. In - // this new vector, each 'BasedUser' contains 'Base' the base of the - // strided accessas well as the old information from Uses. We progressively - // move information from the Base field to the Imm field, until we eventually - // have the full access expression to rewrite the use. - std::vector UsersToProcess; - const SCEV *CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses, - AllUsesAreOutsideLoop, - UsersToProcess); - - // Sort the UsersToProcess array so that users with common bases are - // next to each other. - SortUsersToProcess(UsersToProcess); - - // If we managed to find some expressions in common, we'll need to carry - // their value in a register and add it in for each use. This will take up - // a register operand, which potentially restricts what stride values are - // valid. - bool HaveCommonExprs = !CommonExprs->isZero(); - const Type *ReplacedTy = CommonExprs->getType(); - - // If all uses are addresses, consider sinking the immediate part of the - // common expression back into uses if they can fit in the immediate fields. - if (TLI && HaveCommonExprs && AllUsesAreAddresses) { - const SCEV *NewCommon = CommonExprs; - const SCEV *Imm = SE->getIntegerSCEV(0, ReplacedTy); - MoveImmediateValues(TLI, Type::getVoidTy( - L->getLoopPreheader()->getContext()), - NewCommon, Imm, true, L, SE); - if (!Imm->isZero()) { - bool DoSink = true; - - // If the immediate part of the common expression is a GV, check if it's - // possible to fold it into the target addressing mode. - GlobalValue *GV = 0; - if (const SCEVUnknown *SU = dyn_cast(Imm)) - GV = dyn_cast(SU->getValue()); - int64_t Offset = 0; - if (const SCEVConstant *SC = dyn_cast(Imm)) - Offset = SC->getValue()->getSExtValue(); - if (GV || Offset) - // Pass VoidTy as the AccessTy to be conservative, because - // there could be multiple access types among all the uses. - DoSink = IsImmFoldedIntoAddrMode(GV, Offset, - Type::getVoidTy(L->getLoopPreheader()->getContext()), - UsersToProcess, TLI); - - if (DoSink) { - DEBUG(dbgs() << " Sinking " << *Imm << " back down into uses\n"); - for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) - UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm); - CommonExprs = NewCommon; - HaveCommonExprs = !CommonExprs->isZero(); - ++NumImmSunk; +Value *LSRUse::Expand(BasicBlock::iterator IP, + Loop *L, SCEVExpander &Rewriter, + SmallVectorImpl &DeadInsts, + ScalarEvolution &SE, DominatorTree &DT) const { + // Then, collect some instructions which we will remain dominated by when + // expanding the replacement. These must be dominated by any operands that + // will be required in the expansion. + SmallVector Inputs; + if (Instruction *I = dyn_cast(OperandValToReplace)) + Inputs.push_back(I); + if (Kind == ICmpZero) + if (Instruction *I = + dyn_cast(cast(UserInst)->getOperand(1))) + Inputs.push_back(I); + if (PostIncLoop && !L->contains(UserInst)) + Inputs.push_back(L->getLoopLatch()->getTerminator()); + + // Then, climb up the immediate dominator tree as far as we can go while + // still being dominated by the input positions. + for (;;) { + bool AllDominate = true; + Instruction *BetterPos = 0; + BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT); + if (!IDom) break; + Instruction *Tentative = IDom->getTerminator(); + for (SmallVectorImpl::const_iterator I = Inputs.begin(), + E = Inputs.end(); I != E; ++I) { + Instruction *Inst = *I; + if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { + AllDominate = false; + break; } + if (IDom == Inst->getParent() && + (!BetterPos || DT.dominates(BetterPos, Inst))) + BetterPos = next(BasicBlock::iterator(Inst)); } + if (!AllDominate) + break; + if (BetterPos) + IP = BetterPos; + else + IP = Tentative; + } + while (isa(IP)) ++IP; + + // The first formula in the list is the winner. + const Formula &F = Formulae.front(); + + // Inform the Rewriter if we have a post-increment use, so that it can + // perform an advantageous expansion. + Rewriter.setPostInc(PostIncLoop); + + // This is the type that the user actually needs. + const Type *OpTy = OperandValToReplace->getType(); + // This will be the type that we'll initially expand to. + const Type *Ty = F.getType(); + if (!Ty) + // No type known; just expand directly to the ultimate type. + Ty = OpTy; + else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) + // Expand directly to the ultimate type if it's the right size. + Ty = OpTy; + // This is the type to do integer arithmetic in. + const Type *IntTy = SE.getEffectiveSCEVType(Ty); + + // Build up a list of operands to add together to form the full base. + SmallVector Ops; + + // Expand the BaseRegs portion. + for (SmallVectorImpl::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *Reg = *I; + assert(!Reg->isZero() && "Zero allocated in a base register!"); + + // If we're expanding for a post-inc user for the add-rec's loop, make the + // post-inc adjustment. + if (const SCEVAddRecExpr *AR = dyn_cast(Reg)) + if (AR->getLoop() == PostIncLoop) + Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE)); + + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } - // Now that we know what we need to do, insert the PHI node itself. - // - DEBUG(dbgs() << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE " - << *Stride << ":\n" - << " Common base: " << *CommonExprs << "\n"); - - SCEVExpander Rewriter(*SE); - SCEVExpander PreheaderRewriter(*SE); - - BasicBlock *Preheader = L->getLoopPreheader(); - Instruction *PreInsertPt = Preheader->getTerminator(); - BasicBlock *LatchBlock = L->getLoopLatch(); - Instruction *IVIncInsertPt = LatchBlock->getTerminator(); - - Value *CommonBaseV = Constant::getNullValue(ReplacedTy); - - const SCEV *RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy); - IVExpr ReuseIV(SE->getIntegerSCEV(0, - Type::getInt32Ty(Preheader->getContext())), - SE->getIntegerSCEV(0, - Type::getInt32Ty(Preheader->getContext())), - 0); - - // Choose a strength-reduction strategy and prepare for it by creating - // the necessary PHIs and adjusting the bookkeeping. - if (ShouldUseFullStrengthReductionMode(UsersToProcess, L, - AllUsesAreAddresses, Stride)) { - PrepareToStrengthReduceFully(UsersToProcess, Stride, CommonExprs, L, - PreheaderRewriter); - } else { - // Emit the initial base value into the loop preheader. - CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, ReplacedTy, - PreInsertPt); - - // If all uses are addresses, check if it is possible to reuse an IV. The - // new IV must have a stride that is a multiple of the old stride; the - // multiple must be a number that can be encoded in the scale field of the - // target addressing mode; and we must have a valid instruction after this - // substitution, including the immediate field, if any. - RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses, - AllUsesAreOutsideLoop, - Stride, ReuseIV, ReplacedTy, - UsersToProcess); - if (!RewriteFactor->isZero()) - PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV, - ReuseIV, PreInsertPt); - else { - IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L); - PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs, - CommonBaseV, IVIncInsertPt, - L, PreheaderRewriter); + // Expand the ScaledReg portion. + Value *ICmpScaledV = 0; + if (F.AM.Scale != 0) { + const SCEV *ScaledS = F.ScaledReg; + + // If we're expanding for a post-inc user for the add-rec's loop, make the + // post-inc adjustment. + if (const SCEVAddRecExpr *AR = dyn_cast(ScaledS)) + if (AR->getLoop() == PostIncLoop) + ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE)); + + if (Kind == ICmpZero) { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.AM.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + } else { + // Otherwise just expand the scaled register and an explicit scale, + // which is expected to be matched as part of the address. + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); + const Type *ScaledTy = SE.getEffectiveSCEVType(ScaledS->getType()); + ScaledS = SE.getMulExpr(ScaledS, + SE.getSCEV(ConstantInt::get(ScaledTy, + F.AM.Scale))); + Ops.push_back(ScaledS); } } - // Process all the users now, replacing their strided uses with - // strength-reduced forms. This outer loop handles all bases, the inner - // loop handles all users of a particular base. - while (!UsersToProcess.empty()) { - const SCEV *Base = UsersToProcess.back().Base; - Instruction *Inst = UsersToProcess.back().Inst; - - // Emit the code for Base into the preheader. - Value *BaseV = 0; - if (!Base->isZero()) { - BaseV = PreheaderRewriter.expandCodeFor(Base, 0, PreInsertPt); - - DEBUG(dbgs() << " INSERTING code for BASE = " << *Base << ":"); - if (BaseV->hasName()) - DEBUG(dbgs() << " Result value name = %" << BaseV->getName()); - DEBUG(dbgs() << "\n"); - - // If BaseV is a non-zero constant, make sure that it gets inserted into - // the preheader, instead of being forward substituted into the uses. We - // do this by forcing a BitCast (noop cast) to be inserted into the - // preheader in this case. - if (!fitsInAddressMode(Base, getAccessType(Inst), TLI, false) && - isa(BaseV)) { - // We want this constant emitted into the preheader! This is just - // using cast as a copy so BitCast (no-op cast) is appropriate - BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert", - PreInsertPt); + // Expand the immediate portions. + if (F.AM.BaseGV) + Ops.push_back(SE.getSCEV(F.AM.BaseGV)); + if (F.AM.BaseOffs != 0) { + if (Kind == ICmpZero) { + // The other interesting way of "folding" with an ICmpZero is to use a + // negated immediate. + if (!ICmpScaledV) + ICmpScaledV = ConstantInt::get(IntTy, -F.AM.BaseOffs); + else { + Ops.push_back(SE.getUnknown(ICmpScaledV)); + ICmpScaledV = ConstantInt::get(IntTy, F.AM.BaseOffs); } + } else { + // Just add the immediate values. These again are expected to be matched + // as part of the address. + Ops.push_back(SE.getSCEV(ConstantInt::get(IntTy, F.AM.BaseOffs))); } + } - // Emit the code to add the immediate offset to the Phi value, just before - // the instructions that we identified as using this stride and base. - do { - // FIXME: Use emitted users to emit other users. - BasedUser &User = UsersToProcess.back(); - - DEBUG(dbgs() << " Examining "); - if (User.isUseOfPostIncrementedValue) - DEBUG(dbgs() << "postinc"); - else - DEBUG(dbgs() << "preinc"); - DEBUG(dbgs() << " use "); - DEBUG(WriteAsOperand(dbgs(), UsersToProcess.back().OperandValToReplace, - /*PrintType=*/false)); - DEBUG(dbgs() << " in Inst: " << *User.Inst); - - // If this instruction wants to use the post-incremented value, move it - // after the post-inc and use its value instead of the PHI. - Value *RewriteOp = User.Phi; - if (User.isUseOfPostIncrementedValue) { - RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock); - // If this user is in the loop, make sure it is the last thing in the - // loop to ensure it is dominated by the increment. In case it's the - // only use of the iv, the increment instruction is already before the - // use. - if (L->contains(User.Inst) && User.Inst != IVIncInsertPt) - User.Inst->moveBefore(IVIncInsertPt); + // Emit instructions summing all the operands. + const SCEV *FullS = Ops.empty() ? + SE.getIntegerSCEV(0, IntTy) : + SE.getAddExpr(Ops); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + + // We're done expanding now, so reset the rewriter. + Rewriter.setPostInc(0); + + // An ICmpZero Formula represents an ICmp which we're handling as a + // comparison against zero. Now that we've expanded an expression for that + // form, update the ICmp's other operand. + if (Kind == ICmpZero) { + ICmpInst *CI = cast(UserInst); + DeadInsts.push_back(CI->getOperand(1)); + assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " + "a scale at the same time!"); + if (F.AM.Scale == -1) { + if (ICmpScaledV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, + OpTy, false), + ICmpScaledV, OpTy, "tmp", CI); + ICmpScaledV = Cast; } + CI->setOperand(1, ICmpScaledV); + } else { + assert(F.AM.Scale == 0 && + "ICmp does not support folding a global value and " + "a scale at the same time!"); + Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), + -(uint64_t)F.AM.BaseOffs); + if (C->getType() != OpTy) + C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, + OpTy, false), + C, OpTy); + + CI->setOperand(1, C); + } + } - const SCEV *RewriteExpr = SE->getUnknown(RewriteOp); + return FullV; +} - if (SE->getEffectiveSCEVType(RewriteOp->getType()) != - SE->getEffectiveSCEVType(ReplacedTy)) { - assert(SE->getTypeSizeInBits(RewriteOp->getType()) > - SE->getTypeSizeInBits(ReplacedTy) && - "Unexpected widening cast!"); - RewriteExpr = SE->getTruncateExpr(RewriteExpr, ReplacedTy); - } +/// Rewrite - Emit instructions for the leading candidate expression for this +/// LSRUse (this is called "expanding"), and update the UserInst to reference +/// the newly expanded value. +void LSRUse::Rewrite(Loop *L, SCEVExpander &Rewriter, + SmallVectorImpl &DeadInsts, + ScalarEvolution &SE, DominatorTree &DT, + Pass *P) const { + const Type *OpTy = OperandValToReplace->getType(); + + // First, find an insertion point that dominates UserInst. For PHI nodes, + // find the nearest block which dominates all the relevant uses. + if (PHINode *PN = dyn_cast(UserInst)) { + DenseMap Inserted; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == OperandValToReplace) { + BasicBlock *BB = PN->getIncomingBlock(i); - // If we had to insert new instructions for RewriteOp, we have to - // consider that they may not have been able to end up immediately - // next to RewriteOp, because non-PHI instructions may never precede - // PHI instructions in a block. In this case, remember where the last - // instruction was inserted so that if we're replacing a different - // PHI node, we can use the later point to expand the final - // RewriteExpr. - Instruction *NewBasePt = dyn_cast(RewriteOp); - if (RewriteOp == User.Phi) NewBasePt = 0; - - // Clear the SCEVExpander's expression map so that we are guaranteed - // to have the code emitted where we expect it. - Rewriter.clear(); - - // If we are reusing the iv, then it must be multiplied by a constant - // factor to take advantage of the addressing mode scale component. - if (!RewriteFactor->isZero()) { - // If we're reusing an IV with a nonzero base (currently this happens - // only when all reuses are outside the loop) subtract that base here. - // The base has been used to initialize the PHI node but we don't want - // it here. - if (!ReuseIV.Base->isZero()) { - const SCEV *typedBase = ReuseIV.Base; - if (SE->getEffectiveSCEVType(RewriteExpr->getType()) != - SE->getEffectiveSCEVType(ReuseIV.Base->getType())) { - // It's possible the original IV is a larger type than the new IV, - // in which case we have to truncate the Base. We checked in - // RequiresTypeConversion that this is valid. - assert(SE->getTypeSizeInBits(RewriteExpr->getType()) < - SE->getTypeSizeInBits(ReuseIV.Base->getType()) && - "Unexpected lengthening conversion!"); - typedBase = SE->getTruncateExpr(ReuseIV.Base, - RewriteExpr->getType()); - } - RewriteExpr = SE->getMinusSCEV(RewriteExpr, typedBase); + // If this is a critical edge, split the edge so that we do not insert + // the code on all predecessor/successor paths. We do this unless this + // is the canonical backedge for this loop, which complicates post-inc + // users. + if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && + !isa(BB->getTerminator()) && + (PN->getParent() != L->getHeader() || !L->contains(BB))) { + // Split the critical edge. + BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); } - // Multiply old variable, with base removed, by new scale factor. - RewriteExpr = SE->getMulExpr(RewriteFactor, - RewriteExpr); - - // The common base is emitted in the loop preheader. But since we - // are reusing an IV, it has not been used to initialize the PHI node. - // Add it to the expression used to rewrite the uses. - // When this use is outside the loop, we earlier subtracted the - // common base, and are adding it back here. Use the same expression - // as before, rather than CommonBaseV, so DAGCombiner will zap it. - if (!CommonExprs->isZero()) { - if (L->contains(User.Inst)) - RewriteExpr = SE->getAddExpr(RewriteExpr, - SE->getUnknown(CommonBaseV)); - else - RewriteExpr = SE->getAddExpr(RewriteExpr, CommonExprs); + std::pair::iterator, bool> Pair = + Inserted.insert(std::make_pair(BB, static_cast(0))); + if (!Pair.second) + PN->setIncomingValue(i, Pair.first->second); + else { + Value *FullV = Expand(BB->getTerminator(), + L, Rewriter, DeadInsts, SE, DT); + + // If this is reuse-by-noop-cast, insert the noop cast. + if (FullV->getType() != OpTy) + FullV = + CastInst::Create(CastInst::getCastOpcode(FullV, false, + OpTy, false), + FullV, OperandValToReplace->getType(), + "tmp", BB->getTerminator()); + + PN->setIncomingValue(i, FullV); + Pair.first->second = FullV; } } + } else { + Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT); + + // If this is reuse-by-noop-cast, insert the noop cast. + if (FullV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), + FullV, OpTy, "tmp", UserInst); + FullV = Cast; + } - // Now that we know what we need to do, insert code before User for the - // immediate and any loop-variant expressions. - if (BaseV) - // Add BaseV to the PHI value if needed. - RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV)); - - User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt, - Rewriter, L, this, - DeadInsts, SE); - - // Mark old value we replaced as possibly dead, so that it is eliminated - // if we just replaced the last use of that value. - DeadInsts.push_back(User.OperandValToReplace); - - UsersToProcess.pop_back(); - ++NumReduced; - - // If there are any more users to process with the same base, process them - // now. We sorted by base above, so we just have to check the last elt. - } while (!UsersToProcess.empty() && UsersToProcess.back().Base == Base); - // TODO: Next, find out which base index is the most common, pull it out. + // Update the user. + UserInst->replaceUsesOfWith(OperandValToReplace, FullV); } - // IMPORTANT TODO: Figure out how to partition the IV's with this stride, but - // different starting values, into different PHIs. + DeadInsts.push_back(OperandValToReplace); } -void LoopStrengthReduce::StrengthReduceIVUsers(Loop *L) { - // Note: this processes each stride/type pair individually. All users - // passed into StrengthReduceIVUsersOfStride have the same type AND stride. - // Also, note that we iterate over IVUsesByStride indirectly by using - // StrideOrder. This extra layer of indirection makes the ordering of - // strides deterministic - not dependent on map order. - for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e; ++Stride) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[Stride]); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - // FIXME: Generalize to non-affine IV's. - if (!SI->first->isLoopInvariant(L)) - continue; - StrengthReduceIVUsersOfStride(SI->first, *SI->second, L); +void LSRUse::print(raw_ostream &OS) const { + OS << "LSR Use: Kind="; + switch (Kind) { + case Basic: OS << "Basic"; break; + case Special: OS << "Special"; break; + case ICmpZero: OS << "ICmpZero"; break; + case Address: + OS << "Address of "; + if (isa(AccessTy)) + OS << "pointer"; // the full pointer type could be really verbose + else + OS << *AccessTy; } -} -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. -bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, - IVStrideUse *&CondUse, - const SCEV* &CondStride) { - for (unsigned Stride = 0, e = IU->StrideOrder.size(); - Stride != e && !CondUse; ++Stride) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[Stride]); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - - for (ilist::iterator UI = SI->second->Users.begin(), - E = SI->second->Users.end(); UI != E; ++UI) - if (UI->getUser() == Cond) { - // NOTE: we could handle setcc instructions with multiple uses here, but - // InstCombine does it as well for simple uses, it's not clear that it - // occurs enough in real life to handle. - CondUse = UI; - CondStride = SI->first; - return true; - } + OS << ", UserInst="; + // Store is common and interesting enough to be worth special-casing. + if (StoreInst *Store = dyn_cast(UserInst)) { + OS << "store "; + WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); + } else if (UserInst->getType()->isVoidTy()) + OS << UserInst->getOpcodeName(); + else + WriteAsOperand(OS, UserInst, /*PrintType=*/false); + + OS << ", OperandValToReplace="; + WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); + + if (PostIncLoop) { + OS << ", PostIncLoop="; + WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false); } - return false; +} + +void LSRUse::dump() const { + print(errs()); errs() << '\n'; } namespace { - // Constant strides come first which in turns are sorted by their absolute - // values. If absolute values are the same, then positive strides comes first. - // e.g. - // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X - struct StrideCompare { - const ScalarEvolution *SE; - explicit StrideCompare(const ScalarEvolution *se) : SE(se) {} - - bool operator()(const SCEV *LHS, const SCEV *RHS) { - const SCEVConstant *LHSC = dyn_cast(LHS); - const SCEVConstant *RHSC = dyn_cast(RHS); - if (LHSC && RHSC) { - int64_t LV = LHSC->getValue()->getSExtValue(); - int64_t RV = RHSC->getValue()->getSExtValue(); - uint64_t ALV = (LV < 0) ? -LV : LV; - uint64_t ARV = (RV < 0) ? -RV : RV; - if (ALV == ARV) { - if (LV != RV) - return LV > RV; - } else { - return ALV < ARV; - } - // If it's the same value but different type, sort by bit width so - // that we emit larger induction variables before smaller - // ones, letting the smaller be re-written in terms of larger ones. - return SE->getTypeSizeInBits(RHS->getType()) < - SE->getTypeSizeInBits(LHS->getType()); - } - return LHSC && !RHSC; - } - }; +/// Score - This class is used to measure and compare candidate formulae. +class Score { + unsigned NumRegs; + unsigned NumPhis; + unsigned NumIVMuls; + unsigned NumBaseAdds; + unsigned NumImms; + +public: + Score() + : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {} + + void RateInitial(SmallVector const &Uses, const Loop *L, + ScalarEvolution &SE); + + void Rate(const SCEV *Reg, const SmallBitVector &Bits, + const SmallVector &Uses, const Loop *L, + ScalarEvolution &SE); + + bool operator<(const Score &Other) const; + + void print_details(raw_ostream &OS, const SCEV *Reg, + const SmallPtrSet &Regs) const; + + void print(raw_ostream &OS) const; + void dump() const; + +private: + void RateRegister(const SCEV *Reg, SmallPtrSet &Regs, + const Loop *L); + void RateFormula(const Formula &F, SmallPtrSet &Regs, + const Loop *L); + + void Loose(); +}; + } -/// ChangeCompareStride - If a loop termination compare instruction is the -/// only use of its stride, and the compaison is against a constant value, -/// try eliminate the stride by moving the compare instruction to another -/// stride and change its constant operand accordingly. e.g. -/// -/// loop: -/// ... -/// v1 = v1 + 3 -/// v2 = v2 + 1 -/// if (v2 < 10) goto loop -/// => -/// loop: -/// ... -/// v1 = v1 + 3 -/// if (v1 < 30) goto loop -ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse, - const SCEV* &CondStride, - bool PostPass) { - // If there's only one stride in the loop, there's nothing to do here. - if (IU->StrideOrder.size() < 2) - return Cond; - // If there are other users of the condition's stride, don't bother - // trying to change the condition because the stride will still - // remain. - std::map::iterator I = - IU->IVUsesByStride.find(CondStride); - if (I == IU->IVUsesByStride.end()) - return Cond; - if (I->second->Users.size() > 1) { - for (ilist::iterator II = I->second->Users.begin(), - EE = I->second->Users.end(); II != EE; ++II) { - if (II->getUser() == Cond) - continue; - if (!isInstructionTriviallyDead(II->getUser())) - return Cond; - } - } - // Only handle constant strides for now. - const SCEVConstant *SC = dyn_cast(CondStride); - if (!SC) return Cond; - - ICmpInst::Predicate Predicate = Cond->getPredicate(); - int64_t CmpSSInt = SC->getValue()->getSExtValue(); - unsigned BitWidth = SE->getTypeSizeInBits(CondStride->getType()); - uint64_t SignBit = 1ULL << (BitWidth-1); - const Type *CmpTy = Cond->getOperand(0)->getType(); - const Type *NewCmpTy = NULL; - unsigned TyBits = SE->getTypeSizeInBits(CmpTy); - unsigned NewTyBits = 0; - const SCEV *NewStride = NULL; - Value *NewCmpLHS = NULL; - Value *NewCmpRHS = NULL; - int64_t Scale = 1; - const SCEV *NewOffset = SE->getIntegerSCEV(0, CmpTy); - - if (ConstantInt *C = dyn_cast(Cond->getOperand(1))) { - int64_t CmpVal = C->getValue().getSExtValue(); - - // Check the relevant induction variable for conformance to - // the pattern. - const SCEV *IV = SE->getSCEV(Cond->getOperand(0)); - const SCEVAddRecExpr *AR = dyn_cast(IV); - if (!AR || !AR->isAffine()) - return Cond; - - const SCEVConstant *StartC = dyn_cast(AR->getStart()); - // Check stride constant and the comparision constant signs to detect - // overflow. - if (StartC) { - if ((StartC->getValue()->getSExtValue() < CmpVal && CmpSSInt < 0) || - (StartC->getValue()->getSExtValue() > CmpVal && CmpSSInt > 0)) - return Cond; - } else { - // More restrictive check for the other cases. - if ((CmpVal & SignBit) != (CmpSSInt & SignBit)) - return Cond; +/// RateRegister - Tally up interesting quantities from the given register. +void Score::RateRegister(const SCEV *Reg, + SmallPtrSet &Regs, + const Loop *L) { + if (Regs.insert(Reg)) + if (const SCEVAddRecExpr *AR = dyn_cast(Reg)) { + NumPhis += AR->getLoop() == L; + + // Add the step value register, if it needs one. + if (!AR->isAffine() || !isa(AR->getOperand(1))) + RateRegister(AR->getOperand(1), Regs, L); } +} - // Look for a suitable stride / iv as replacement. - for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { - std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[i]); - if (!isa(SI->first) || SI->second->Users.empty()) - continue; - int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); - if (SSInt == CmpSSInt || - abs64(SSInt) < abs64(CmpSSInt) || - (SSInt % CmpSSInt) != 0) - continue; +void Score::RateFormula(const Formula &F, + SmallPtrSet &Regs, + const Loop *L) { + // Tally up the registers. + if (F.ScaledReg) + RateRegister(F.ScaledReg, Regs, L); + for (SmallVectorImpl::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + RateRegister(BaseReg, Regs, L); + + NumIVMuls += isa(BaseReg) && + BaseReg->hasComputableLoopEvolution(L); + } - Scale = SSInt / CmpSSInt; - int64_t NewCmpVal = CmpVal * Scale; + if (F.BaseRegs.size() > 1) + NumBaseAdds += F.BaseRegs.size() - 1; - // If old icmp value fits in icmp immediate field, but the new one doesn't - // try something else. - if (TLI && - TLI->isLegalICmpImmediate(CmpVal) && - !TLI->isLegalICmpImmediate(NewCmpVal)) - continue; + // Tally up the non-zero immediates. + if (F.AM.BaseGV || F.AM.BaseOffs != 0) + ++NumImms; +} - APInt Mul = APInt(BitWidth*2, CmpVal, true); - Mul = Mul * APInt(BitWidth*2, Scale, true); - // Check for overflow. - if (!Mul.isSignedIntN(BitWidth)) - continue; - // Check for overflow in the stride's type too. - if (!Mul.isSignedIntN(SE->getTypeSizeInBits(SI->first->getType()))) - continue; +/// Loose - Set this score to a loosing value. +void Score::Loose() { + NumRegs = ~0u; + NumPhis = ~0u; + NumIVMuls = ~0u; + NumBaseAdds = ~0u; + NumImms = ~0u; +} - // Watch out for overflow. - if (ICmpInst::isSigned(Predicate) && - (CmpVal & SignBit) != (NewCmpVal & SignBit)) - continue; +/// RateInitial - Compute a score for the initial "fully reduced" solution. +void Score::RateInitial(SmallVector const &Uses, const Loop *L, + ScalarEvolution &SE) { + SmallPtrSet Regs; + for (SmallVectorImpl::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) + RateFormula(I->Formulae.front(), Regs, L); + NumRegs += Regs.size(); - // Pick the best iv to use trying to avoid a cast. - NewCmpLHS = NULL; - for (ilist::iterator UI = SI->second->Users.begin(), - E = SI->second->Users.end(); UI != E; ++UI) { - Value *Op = UI->getOperandValToReplace(); - - // If the IVStrideUse implies a cast, check for an actual cast which - // can be used to find the original IV expression. - if (SE->getEffectiveSCEVType(Op->getType()) != - SE->getEffectiveSCEVType(SI->first->getType())) { - CastInst *CI = dyn_cast(Op); - // If it's not a simple cast, it's complicated. - if (!CI) - continue; - // If it's a cast from a type other than the stride type, - // it's complicated. - if (CI->getOperand(0)->getType() != SI->first->getType()) - continue; - // Ok, we found the IV expression in the stride's type. - Op = CI->getOperand(0); - } + DEBUG(print_details(dbgs(), 0, Regs)); +} - NewCmpLHS = Op; - if (NewCmpLHS->getType() == CmpTy) - break; +/// Rate - Compute a score for the solution where the reuse associated with +/// putting Reg in a register is selected. +void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits, + const SmallVector &Uses, const Loop *L, + ScalarEvolution &SE) { + SmallPtrSet Regs; + for (size_t i = 0, e = Uses.size(); i != e; ++i) { + const LSRUse &LU = Uses[i]; + + const Formula *BestFormula = 0; + if (i >= Bits.size() || !Bits.test(i)) + // This use doesn't use the current register. Just go with the current + // leading candidate formula. + BestFormula = &LU.Formulae.front(); + else + // Find the best formula for this use that uses the current register. + for (SmallVectorImpl::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.referencesReg(Reg) && + (!BestFormula || ComplexitySorter()(F, *BestFormula))) + BestFormula = &F; } - if (!NewCmpLHS) - continue; - NewCmpTy = NewCmpLHS->getType(); - NewTyBits = SE->getTypeSizeInBits(NewCmpTy); - const Type *NewCmpIntTy = IntegerType::get(Cond->getContext(), NewTyBits); - if (RequiresTypeConversion(NewCmpTy, CmpTy)) { - // Check if it is possible to rewrite it using - // an iv / stride of a smaller integer type. - unsigned Bits = NewTyBits; - if (ICmpInst::isSigned(Predicate)) - --Bits; - uint64_t Mask = (1ULL << Bits) - 1; - if (((uint64_t)NewCmpVal & Mask) != (uint64_t)NewCmpVal) - continue; - } + // If we didn't find *any* forumlae, because earlier we eliminated some + // in greedy fashion, skip the current register's reuse opportunity. + if (!BestFormula) { + DEBUG(dbgs() << "Reuse with reg " << *Reg + << " wouldn't help any users.\n"); + Loose(); + return; + } - // Don't rewrite if use offset is non-constant and the new type is - // of a different type. - // FIXME: too conservative? - if (NewTyBits != TyBits && !isa(CondUse->getOffset())) - continue; + // For an in-loop post-inc user, don't allow multiple base registers, + // because that would require an awkward in-loop add after the increment. + if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) && + BestFormula->BaseRegs.size() > 1) { + DEBUG(dbgs() << "Reuse with reg " << *Reg + << " would require an in-loop post-inc add: "; + BestFormula->dump()); + Loose(); + return; + } - if (!PostPass) { - bool AllUsesAreAddresses = true; - bool AllUsesAreOutsideLoop = true; - std::vector UsersToProcess; - const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, - AllUsesAreAddresses, - AllUsesAreOutsideLoop, - UsersToProcess); - // Avoid rewriting the compare instruction with an iv of new stride - // if it's likely the new stride uses will be rewritten using the - // stride of the compare instruction. - if (AllUsesAreAddresses && - ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) - continue; - } + RateFormula(*BestFormula, Regs, L); + } + NumRegs += Regs.size(); - // Avoid rewriting the compare instruction with an iv which has - // implicit extension or truncation built into it. - // TODO: This is over-conservative. - if (SE->getTypeSizeInBits(CondUse->getOffset()->getType()) != TyBits) - continue; + DEBUG(print_details(dbgs(), Reg, Regs)); +} - // If scale is negative, use swapped predicate unless it's testing - // for equality. - if (Scale < 0 && !Cond->isEquality()) - Predicate = ICmpInst::getSwappedPredicate(Predicate); +/// operator< - Choose the better score. +bool Score::operator<(const Score &Other) const { + if (NumRegs != Other.NumRegs) + return NumRegs < Other.NumRegs; + if (NumPhis != Other.NumPhis) + return NumPhis < Other.NumPhis; + if (NumIVMuls != Other.NumIVMuls) + return NumIVMuls < Other.NumIVMuls; + if (NumBaseAdds != Other.NumBaseAdds) + return NumBaseAdds < Other.NumBaseAdds; + return NumImms < Other.NumImms; +} - NewStride = IU->StrideOrder[i]; - if (!isa(NewCmpTy)) - NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal); - else { - Constant *CI = ConstantInt::get(NewCmpIntTy, NewCmpVal); - NewCmpRHS = ConstantExpr::getIntToPtr(CI, NewCmpTy); - } - NewOffset = TyBits == NewTyBits - ? SE->getMulExpr(CondUse->getOffset(), - SE->getConstant(CmpTy, Scale)) - : SE->getConstant(NewCmpIntTy, - cast(CondUse->getOffset())->getValue() - ->getSExtValue()*Scale); - break; +void Score::print_details(raw_ostream &OS, + const SCEV *Reg, + const SmallPtrSet &Regs) const { + if (Reg) OS << "Reuse with reg " << *Reg << " would require "; + else OS << "The initial solution would require "; + print(OS); + OS << ". Regs:"; + for (SmallPtrSet::const_iterator I = Regs.begin(), + E = Regs.end(); I != E; ++I) + OS << ' ' << **I; + OS << '\n'; +} + +void Score::print(raw_ostream &OS) const { + OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s"); + if (NumPhis != 0) + OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s"); + if (NumIVMuls != 0) + OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s"); + if (NumBaseAdds != 0) + OS << ", plus " << NumBaseAdds << " base add" + << (NumBaseAdds == 1 ? "" : "s"); + if (NumImms != 0) + OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s"); +} + +void Score::dump() const { + print(errs()); errs() << '\n'; +} + +/// isAddressUse - Returns true if the specified instruction is using the +/// specified value as an address. +static bool isAddressUse(Instruction *Inst, Value *OperandVal) { + bool isAddress = isa(Inst); + if (StoreInst *SI = dyn_cast(Inst)) { + if (SI->getOperand(1) == OperandVal) + isAddress = true; + } else if (IntrinsicInst *II = dyn_cast(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::prefetch: + case Intrinsic::x86_sse2_loadu_dq: + case Intrinsic::x86_sse2_loadu_pd: + case Intrinsic::x86_sse_loadu_ps: + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + if (II->getOperand(1) == OperandVal) + isAddress = true; + break; } } + return isAddress; +} - // Forgo this transformation if it the increment happens to be - // unfortunately positioned after the condition, and the condition - // has multiple uses which prevent it from being moved immediately - // before the branch. See - // test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-*.ll - // for an example of this situation. - if (!Cond->hasOneUse()) { - for (BasicBlock::iterator I = Cond, E = Cond->getParent()->end(); - I != E; ++I) - if (I == NewCmpLHS) - return Cond; +/// getAccessType - Return the type of the memory being accessed. +static const Type *getAccessType(const Instruction *Inst) { + const Type *AccessTy = Inst->getType(); + if (const StoreInst *SI = dyn_cast(Inst)) + AccessTy = SI->getOperand(0)->getType(); + else if (const IntrinsicInst *II = dyn_cast(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + AccessTy = II->getOperand(1)->getType(); + break; + } } + return AccessTy; +} + +/// DeleteTriviallyDeadInstructions - If any of the instructions is the +/// specified set are trivially dead, delete them and see if this makes any of +/// their operands subsequently dead. +static bool +DeleteTriviallyDeadInstructions(SmallVectorImpl &DeadInsts) { + bool Changed = false; + + while (!DeadInsts.empty()) { + Instruction *I = dyn_cast_or_null(DeadInsts.pop_back_val()); + + if (I == 0 || !isInstructionTriviallyDead(I)) + continue; - if (NewCmpRHS) { - // Create a new compare instruction using new stride / iv. - ICmpInst *OldCond = Cond; - // Insert new compare instruction. - Cond = new ICmpInst(OldCond, Predicate, NewCmpLHS, NewCmpRHS, - L->getHeader()->getName() + ".termcond"); - - DEBUG(dbgs() << " Change compare stride in Inst " << *OldCond); - DEBUG(dbgs() << " to " << *Cond << '\n'); - - // Remove the old compare instruction. The old indvar is probably dead too. - DeadInsts.push_back(CondUse->getOperandValToReplace()); - OldCond->replaceAllUsesWith(Cond); - OldCond->eraseFromParent(); - - IU->IVUsesByStride[NewStride]->addUser(NewOffset, Cond, NewCmpLHS); - CondUse = &IU->IVUsesByStride[NewStride]->Users.back(); - CondStride = NewStride; - ++NumEliminated; + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast(*OI)) { + *OI = 0; + if (U->use_empty()) + DeadInsts.push_back(U); + } + + I->eraseFromParent(); Changed = true; } - return Cond; + return Changed; } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. -/// -/// This is a narrow solution to a specific, but acute, problem. For loops -/// like this: -/// -/// i = 0; -/// do { -/// p[i] = 0.0; -/// } while (++i < n); -/// -/// the trip count isn't just 'n', because 'n' might not be positive. And -/// unfortunately this can come up even for loops where the user didn't use -/// a C do-while loop. For example, seemingly well-behaved top-test loops -/// will commonly be lowered like this: -// -/// if (n > 0) { -/// i = 0; -/// do { -/// p[i] = 0.0; -/// } while (++i < n); -/// } -/// -/// and then it's possible for subsequent optimization to obscure the if -/// test in such a way that indvars can't find it. -/// -/// When indvars can't find the if test in loops like this, it creates a -/// max expression, which allows it to give the loop a canonical -/// induction variable: -/// -/// i = 0; -/// max = n < 1 ? 1 : n; -/// do { -/// p[i] = 0.0; -/// } while (++i != max); -/// -/// Canonical induction variables are necessary because the loop passes -/// are designed around them. The most obvious example of this is the -/// LoopInfo analysis, which doesn't remember trip count values. It -/// expects to be able to rediscover the trip count each time it is -/// needed, and it does this using a simple analyis that only succeeds if -/// the loop has a canonical induction variable. -/// -/// However, when it comes time to generate code, the maximum operation -/// can be quite costly, especially if it's inside of an outer loop. -/// -/// This function solves this problem by detecting this type of loop and -/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting -/// the instructions for the maximum computation. -/// -ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse) { - // Check that the loop matches the pattern we're looking for. - if (Cond->getPredicate() != CmpInst::ICMP_EQ && - Cond->getPredicate() != CmpInst::ICMP_NE) - return Cond; - - SelectInst *Sel = dyn_cast(Cond->getOperand(1)); - if (!Sel || !Sel->hasOneUse()) return Cond; +namespace { - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - if (isa(BackedgeTakenCount)) - return Cond; - const SCEV *One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType()); +/// LSRInstance - This class holds state for the main loop strength +/// reduction logic. +class LSRInstance { + IVUsers &IU; + ScalarEvolution &SE; + DominatorTree &DT; + const TargetLowering *const TLI; + Loop *const L; + bool Changed; + + /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an + /// arbitrary index value to each register as a sort tie breaker. + unsigned CurrentArbitraryRegIndex; + + /// Factors - Interesting factors between use strides. + SmallSetVector Factors; + + /// Types - Interesting use types, to facilitate truncation reuse. + SmallSetVector Types; + + /// Uses - The list of interesting uses. + SmallVector Uses; + + // TODO: Reorganize these data structures. + typedef DenseMap RegUsesTy; + RegUsesTy RegUses; + SmallVector RegSequence; + + void OptimizeShadowIV(); + bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, + const SCEV* &CondStride); + ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); + bool StrideMightBeShared(const SCEV* Stride); + bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos); + + LSRUse &getNewUse() { + Uses.push_back(LSRUse()); + return Uses.back(); + } - // Add one to the backedge-taken count to get the trip count. - const SCEV *IterationCount = SE->getAddExpr(BackedgeTakenCount, One); + void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx); + void CountRegisters(const Formula &F, size_t LUIdx); - // Check for a max calculation that matches the pattern. - if (!isa(IterationCount) && !isa(IterationCount)) - return Cond; - const SCEVNAryExpr *Max = cast(IterationCount); - if (Max != SE->getSCEV(Sel)) return Cond; + void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); + void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); + void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU, + unsigned LUIdx, + const Formula &Base, unsigned i, + const SmallVectorImpl + &AddOps); + void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); + void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); + void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); + void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx, + Formula Base); - // To handle a max with more than two operands, this optimization would - // require additional checking and setup. - if (Max->getNumOperands() != 2) - return Cond; + void GenerateConstantOffsetReuse(); - const SCEV *MaxLHS = Max->getOperand(0); - const SCEV *MaxRHS = Max->getOperand(1); - if (!MaxLHS || MaxLHS != One) return Cond; + void GenerateAllReuseFormulae(); - // Check the relevant induction variable for conformance to - // the pattern. - const SCEV *IV = SE->getSCEV(Cond->getOperand(0)); - const SCEVAddRecExpr *AR = dyn_cast(IV); - if (!AR || !AR->isAffine() || - AR->getStart() != One || - AR->getStepRecurrence(*SE) != One) - return Cond; - - assert(AR->getLoop() == L && - "Loop condition operand is an addrec in a different loop!"); + void GenerateLoopInvariantRegisterUses(); - // Check the right operand of the select, and remember it, as it will - // be used in the new comparison instruction. - Value *NewRHS = 0; - if (SE->getSCEV(Sel->getOperand(1)) == MaxRHS) - NewRHS = Sel->getOperand(1); - else if (SE->getSCEV(Sel->getOperand(2)) == MaxRHS) - NewRHS = Sel->getOperand(2); - if (!NewRHS) return Cond; +public: + LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); - // Determine the new comparison opcode. It may be signed or unsigned, - // and the original comparison may be either equality or inequality. - CmpInst::Predicate Pred = - isa(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; - if (Cond->getPredicate() == CmpInst::ICMP_EQ) - Pred = CmpInst::getInversePredicate(Pred); + bool getChanged() const { return Changed; } - // Ok, everything looks ok to change the condition into an SLT or SGE and - // delete the max calculation. - ICmpInst *NewCond = - new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); + void print(raw_ostream &OS) const; + void dump() const; +}; - // Delete the max calculation instructions. - Cond->replaceAllUsesWith(NewCond); - CondUse->setUser(NewCond); - Instruction *Cmp = cast(Sel->getOperand(0)); - Cond->eraseFromParent(); - Sel->eraseFromParent(); - if (Cmp->use_empty()) - Cmp->eraseFromParent(); - return NewCond; } /// OptimizeShadowIV - If IV is used in a int-to-float cast /// inside the loop then try to eliminate the cast opeation. -void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { - - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); +void LSRInstance::OptimizeShadowIV() { + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa(BackedgeTakenCount)) return; - for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e; - ++Stride) { + for (size_t StrideIdx = 0, e = IU.StrideOrder.size(); + StrideIdx != e; ++StrideIdx) { std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[Stride]); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]); + assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); if (!isa(SI->first)) continue; @@ -2229,7 +1500,7 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { const Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; - if ((int)SE->getTypeSizeInBits(SrcTy) > Mantissa) + if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) continue; unsigned Entry, Latch; @@ -2267,43 +1538,183 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { // correctly. TODO: Remove this restriction. if (!C->getValue().isStrictlyPositive()) continue; - /* Add new PHINode. */ - PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH); + /* Add new PHINode. */ + PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH); + + /* create new increment. '++d' in above example. */ + Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); + BinaryOperator *NewIncr = + BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? + Instruction::FAdd : Instruction::FSub, + NewPH, CFP, "IV.S.next.", Incr); + + NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); + NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); + + /* Remove cast operation */ + ShadowUse->replaceAllUsesWith(NewPH); + ShadowUse->eraseFromParent(); + break; + } + } +} + +/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, +/// set the IV user and stride information and return true, otherwise return +/// false. +bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, + IVStrideUse *&CondUse, + const SCEV* &CondStride) { + for (unsigned StrideIdx = 0, e = IU.StrideOrder.size(); + StrideIdx != e && !CondUse; ++StrideIdx) { + std::map::iterator SI = + IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]); + assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); + + for (ilist::iterator UI = SI->second->Users.begin(), + E = SI->second->Users.end(); UI != E; ++UI) + if (UI->getUser() == Cond) { + // NOTE: we could handle setcc instructions with multiple uses here, but + // InstCombine does it as well for simple uses, it's not clear that it + // occurs enough in real life to handle. + CondUse = UI; + CondStride = SI->first; + return true; + } + } + return false; +} + +/// OptimizeMax - Rewrite the loop's terminating condition if it uses +/// a max computation. +/// +/// This is a narrow solution to a specific, but acute, problem. For loops +/// like this: +/// +/// i = 0; +/// do { +/// p[i] = 0.0; +/// } while (++i < n); +/// +/// the trip count isn't just 'n', because 'n' might not be positive. And +/// unfortunately this can come up even for loops where the user didn't use +/// a C do-while loop. For example, seemingly well-behaved top-test loops +/// will commonly be lowered like this: +// +/// if (n > 0) { +/// i = 0; +/// do { +/// p[i] = 0.0; +/// } while (++i < n); +/// } +/// +/// and then it's possible for subsequent optimization to obscure the if +/// test in such a way that indvars can't find it. +/// +/// When indvars can't find the if test in loops like this, it creates a +/// max expression, which allows it to give the loop a canonical +/// induction variable: +/// +/// i = 0; +/// max = n < 1 ? 1 : n; +/// do { +/// p[i] = 0.0; +/// } while (++i != max); +/// +/// Canonical induction variables are necessary because the loop passes +/// are designed around them. The most obvious example of this is the +/// LoopInfo analysis, which doesn't remember trip count values. It +/// expects to be able to rediscover the trip count each time it is +/// needed, and it does this using a simple analysis that only succeeds if +/// the loop has a canonical induction variable. +/// +/// However, when it comes time to generate code, the maximum operation +/// can be quite costly, especially if it's inside of an outer loop. +/// +/// This function solves this problem by detecting this type of loop and +/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting +/// the instructions for the maximum computation. +/// +ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { + // Check that the loop matches the pattern we're looking for. + if (Cond->getPredicate() != CmpInst::ICMP_EQ && + Cond->getPredicate() != CmpInst::ICMP_NE) + return Cond; + + SelectInst *Sel = dyn_cast(Cond->getOperand(1)); + if (!Sel || !Sel->hasOneUse()) return Cond; + + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); + if (isa(BackedgeTakenCount)) + return Cond; + const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType()); + + // Add one to the backedge-taken count to get the trip count. + const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); + + // Check for a max calculation that matches the pattern. + if (!isa(IterationCount) && !isa(IterationCount)) + return Cond; + const SCEVNAryExpr *Max = cast(IterationCount); + if (Max != SE.getSCEV(Sel)) return Cond; + + // To handle a max with more than two operands, this optimization would + // require additional checking and setup. + if (Max->getNumOperands() != 2) + return Cond; - /* create new increment. '++d' in above example. */ - Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); - BinaryOperator *NewIncr = - BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? - Instruction::FAdd : Instruction::FSub, - NewPH, CFP, "IV.S.next.", Incr); + const SCEV *MaxLHS = Max->getOperand(0); + const SCEV *MaxRHS = Max->getOperand(1); + if (!MaxLHS || MaxLHS != One) return Cond; + // Check the relevant induction variable for conformance to + // the pattern. + const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); + const SCEVAddRecExpr *AR = dyn_cast(IV); + if (!AR || !AR->isAffine() || + AR->getStart() != One || + AR->getStepRecurrence(SE) != One) + return Cond; - NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); - NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); + assert(AR->getLoop() == L && + "Loop condition operand is an addrec in a different loop!"); - /* Remove cast operation */ - ShadowUse->replaceAllUsesWith(NewPH); - ShadowUse->eraseFromParent(); - NumShadow++; - break; - } - } -} + // Check the right operand of the select, and remember it, as it will + // be used in the new comparison instruction. + Value *NewRHS = 0; + if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) + NewRHS = Sel->getOperand(1); + else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) + NewRHS = Sel->getOperand(2); + if (!NewRHS) return Cond; + + // Determine the new comparison opcode. It may be signed or unsigned, + // and the original comparison may be either equality or inequality. + CmpInst::Predicate Pred = + isa(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; + if (Cond->getPredicate() == CmpInst::ICMP_EQ) + Pred = CmpInst::getInversePredicate(Pred); -/// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar -/// uses in the loop, look to see if we can eliminate some, in favor of using -/// common indvars for the different uses. -void LoopStrengthReduce::OptimizeIndvars(Loop *L) { - // TODO: implement optzns here. + // Ok, everything looks ok to change the condition into an SLT or SGE and + // delete the max calculation. + ICmpInst *NewCond = + new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); - OptimizeShadowIV(L); + // Delete the max calculation instructions. + Cond->replaceAllUsesWith(NewCond); + CondUse->setUser(NewCond); + Instruction *Cmp = cast(Sel->getOperand(0)); + Cond->eraseFromParent(); + Sel->eraseFromParent(); + if (Cmp->use_empty()) + Cmp->eraseFromParent(); + return NewCond; } -bool LoopStrengthReduce::StrideMightBeShared(const SCEV* Stride, Loop *L, - bool CheckPreInc) { +bool LSRInstance::StrideMightBeShared(const SCEV* Stride) { int64_t SInt = cast(Stride)->getValue()->getSExtValue(); - for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { + for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) { std::map::iterator SI = - IU->IVUsesByStride.find(IU->StrideOrder[i]); + IU.IVUsesByStride.find(IU.StrideOrder[i]); const SCEV *Share = SI->first; if (!isa(SI->first) || Share == Stride) continue; @@ -2313,110 +1724,44 @@ bool LoopStrengthReduce::StrideMightBeShared(const SCEV* Stride, Loop *L, if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0) continue; int64_t Scale = SSInt / SInt; - bool AllUsesAreAddresses = true; - bool AllUsesAreOutsideLoop = true; - std::vector UsersToProcess; - const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, - AllUsesAreAddresses, - AllUsesAreOutsideLoop, - UsersToProcess); - if (AllUsesAreAddresses && - ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) { - if (!CheckPreInc) + + // This AM will be used for conservative queries. At this point in the + // process we don't know which users will have a base reg, immediate, + // etc., so we conservatively assume that it may not, making more + // strides valid, thus erring on the side of assuming that there + // might be sharing. + TargetLowering::AddrMode AM; + AM.Scale = Scale; + + // Any pre-inc iv use? + IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share]; + for (ilist::iterator I = StrideUses.Users.begin(), + E = StrideUses.Users.end(); I != E; ++I) { + bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace()); + if (!I->isUseOfPostIncrementedValue() && + isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic, + isAddress ? getAccessType(I->getUser()) : 0, + TLI)) return true; - // Any pre-inc iv use? - IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[Share]; - for (ilist::iterator I = StrideUses.Users.begin(), - E = StrideUses.Users.end(); I != E; ++I) { - if (!I->isUseOfPostIncrementedValue()) - return true; - } } } return false; } -/// isUsedByExitBranch - Return true if icmp is used by a loop terminating -/// conditional branch or it's and / or with other conditions before being used -/// as the condition. -static bool isUsedByExitBranch(ICmpInst *Cond, Loop *L) { - BasicBlock *CondBB = Cond->getParent(); - if (!L->isLoopExiting(CondBB)) - return false; - BranchInst *TermBr = dyn_cast(CondBB->getTerminator()); - if (!TermBr || !TermBr->isConditional()) - return false; - - Value *User = *Cond->use_begin(); - Instruction *UserInst = dyn_cast(User); - while (UserInst && - (UserInst->getOpcode() == Instruction::And || - UserInst->getOpcode() == Instruction::Or)) { - if (!UserInst->hasOneUse() || UserInst->getParent() != CondBB) - return false; - User = *User->use_begin(); - UserInst = dyn_cast(User); - } - return User == TermBr; -} - -static bool ShouldCountToZero(ICmpInst *Cond, IVStrideUse* &CondUse, - ScalarEvolution *SE, Loop *L, - const TargetLowering *TLI = 0) { - if (!L->contains(Cond)) - return false; - - if (!isa(CondUse->getOffset())) - return false; - - // Handle only tests for equality for the moment. - if (!Cond->isEquality() || !Cond->hasOneUse()) - return false; - if (!isUsedByExitBranch(Cond, L)) - return false; - - Value *CondOp0 = Cond->getOperand(0); - const SCEV *IV = SE->getSCEV(CondOp0); - const SCEVAddRecExpr *AR = dyn_cast(IV); - if (!AR || !AR->isAffine()) - return false; - - const SCEVConstant *SC = dyn_cast(AR->getStepRecurrence(*SE)); - if (!SC || SC->getValue()->getSExtValue() < 0) - // If it's already counting down, don't do anything. - return false; - - // If the RHS of the comparison is not an loop invariant, the rewrite - // cannot be done. Also bail out if it's already comparing against a zero. - // If we are checking this before cmp stride optimization, check if it's - // comparing against a already legal immediate. - Value *RHS = Cond->getOperand(1); - ConstantInt *RHSC = dyn_cast(RHS); - if (!L->isLoopInvariant(RHS) || - (RHSC && RHSC->isZero()) || - (RHSC && TLI && TLI->isLegalICmpImmediate(RHSC->getSExtValue()))) - return false; - - // Make sure the IV is only used for counting. Value may be preinc or - // postinc; 2 uses in either case. - if (!CondOp0->hasNUses(2)) - return false; - - return true; -} - /// OptimizeLoopTermCond - Change loop terminating condition to use the /// postinc iv when possible. -void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { +bool +LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) { + SmallPtrSet PostIncs; + BasicBlock *LatchBlock = L->getLoopLatch(); - bool LatchExit = L->isLoopExiting(LatchBlock); SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitingBlock = ExitingBlocks[i]; - // Finally, get the terminating condition for the loop if possible. If we + // Get the terminating condition for the loop if possible. If we // can, we want to change it to use a post-incremented version of its // induction variable, to allow coalescing the live ranges for the IV into // one register value. @@ -2435,291 +1780,982 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { if (!FindIVUserForCond(Cond, CondUse, CondStride)) continue; - // If the latch block is exiting and it's not a single block loop, it's - // not safe to use postinc iv in other exiting blocks. FIXME: overly - // conservative? How about icmp stride optimization? - bool UsePostInc = !(e > 1 && LatchExit && ExitingBlock != LatchBlock); - if (UsePostInc && ExitingBlock != LatchBlock) { - if (!Cond->hasOneUse()) - // See below, we don't want the condition to be cloned. - UsePostInc = false; - else { - // If exiting block is the latch block, we know it's safe and profitable - // to transform the icmp to use post-inc iv. Otherwise do so only if it - // would not reuse another iv and its iv would be reused by other uses. - // We are optimizing for the case where the icmp is the only use of the - // iv. - IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[CondStride]; - for (ilist::iterator I = StrideUses.Users.begin(), - E = StrideUses.Users.end(); I != E; ++I) { - if (I->getUser() == Cond) - continue; - if (!I->isUseOfPostIncrementedValue()) { - UsePostInc = false; - break; - } - } - } - - // If iv for the stride might be shared and any of the users use pre-inc - // iv might be used, then it's not safe to use post-inc iv. - if (UsePostInc && - isa(CondStride) && - StrideMightBeShared(CondStride, L, true)) - UsePostInc = false; - } - // If the trip count is computed in terms of a max (due to ScalarEvolution // being unable to find a sufficient guard, for example), change the loop // comparison to use SLT or ULT instead of NE. - Cond = OptimizeMax(L, Cond, CondUse); - - // If possible, change stride and operands of the compare instruction to - // eliminate one stride. However, avoid rewriting the compare instruction - // with an iv of new stride if it's likely the new stride uses will be - // rewritten using the stride of the compare instruction. - if (ExitingBlock == LatchBlock && isa(CondStride)) { - // If the condition stride is a constant and it's the only use, we might - // want to optimize it first by turning it to count toward zero. - if (!StrideMightBeShared(CondStride, L, false) && - !ShouldCountToZero(Cond, CondUse, SE, L, TLI)) - Cond = ChangeCompareStride(L, Cond, CondUse, CondStride); + // One consequence of doing this now is that it disrupts the count-down + // optimization. That's not always a bad thing though, because in such + // cases it may still be worthwhile to avoid a max. + Cond = OptimizeMax(Cond, CondUse); + + // If this exiting block is the latch block, and the condition has only + // one use inside the loop (the branch), use the post-incremented value + // of the induction variable + if (ExitingBlock != LatchBlock) { + // If this exiting block dominates the latch block, it may also use + // the post-inc value if it won't be shared with other uses. + // Check for dominance. + if (!DT.dominates(ExitingBlock, LatchBlock)) + continue; + // Check for sharing within the same stride. + bool SameStrideSharing = false; + IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride]; + for (ilist::iterator I = StrideUses.Users.begin(), + E = StrideUses.Users.end(); I != E; ++I) { + if (I->getUser() == Cond) + continue; + if (!I->isUseOfPostIncrementedValue()) { + SameStrideSharing = true; + break; + } + } + if (SameStrideSharing) + continue; + // Check for sharing from a different stride. + if (isa(CondStride) && StrideMightBeShared(CondStride)) + continue; + } + if (!Cond->hasOneUse()) { + bool HasOneUseInLoop = true; + for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); + UI != UE; ++UI) { + Instruction *U = cast(*UI); + if (U == TermBr) + continue; + if (L->contains(U)) { + HasOneUseInLoop = false; + break; + } + } + if (!HasOneUseInLoop) + continue; } - - if (!UsePostInc) - continue; DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " - << *Cond << '\n'); + << *Cond << '\n'); // It's possible for the setcc instruction to be anywhere in the loop, and // possible for it to have multiple users. If it is not immediately before // the exiting block branch, move it. - if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) { + if (&*++BasicBlock::iterator(Cond) != TermBr) { if (Cond->hasOneUse()) { // Condition has a single use, just move it. Cond->moveBefore(TermBr); } else { // Otherwise, clone the terminating condition and insert into the // loopend. + ICmpInst *OldCond = Cond; Cond = cast(Cond->clone()); Cond->setName(L->getHeader()->getName() + ".termcond"); ExitingBlock->getInstList().insert(TermBr, Cond); // Clone the IVUse, as the old use still exists! - IU->IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond, + IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond, CondUse->getOperandValToReplace()); - CondUse = &IU->IVUsesByStride[CondStride]->Users.back(); + CondUse = &IU.IVUsesByStride[CondStride]->Users.back(); + TermBr->replaceUsesOfWith(OldCond, Cond); } } // If we get to here, we know that we can transform the setcc instruction to // use the post-incremented version of the IV, allowing us to coalesce the // live ranges for the IV correctly. - CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), CondStride)); + CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride)); CondUse->setIsUseOfPostIncrementedValue(true); Changed = true; - ++NumLoopCond; + PostIncs.insert(Cond); } -} - -bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride, - IVStrideUse* &CondUse, - Loop *L) { - // If the only use is an icmp of a loop exiting conditional branch, then - // attempt the optimization. - BasedUser User = BasedUser(*CondUse, SE); - assert(isa(User.Inst) && "Expecting an ICMPInst!"); - ICmpInst *Cond = cast(User.Inst); - // Less strict check now that compare stride optimization is done. - if (!ShouldCountToZero(Cond, CondUse, SE, L)) - return false; + // Determine an insertion point for the loop induction variable increment. It + // must dominate all the post-inc comparisons we just set up, and it must + // dominate the loop latch edge. + IVIncInsertPos = L->getLoopLatch()->getTerminator(); + for (SmallPtrSet::iterator I = PostIncs.begin(), + E = PostIncs.end(); I != E; ++I) { + BasicBlock *BB = + DT.findNearestCommonDominator(IVIncInsertPos->getParent(), + (*I)->getParent()); + if (BB == (*I)->getParent()) + IVIncInsertPos = *I; + else if (BB != IVIncInsertPos->getParent()) + IVIncInsertPos = BB->getTerminator(); + } - Value *CondOp0 = Cond->getOperand(0); - PHINode *PHIExpr = dyn_cast(CondOp0); - Instruction *Incr; - if (!PHIExpr) { - // Value tested is postinc. Find the phi node. - Incr = dyn_cast(CondOp0); - // FIXME: Just use User.OperandValToReplace here? - if (!Incr || Incr->getOpcode() != Instruction::Add) - return false; + return Changed; +} - PHIExpr = dyn_cast(Incr->getOperand(0)); - if (!PHIExpr) - return false; - // 1 use for preinc value, the increment. - if (!PHIExpr->hasOneUse()) - return false; +/// CountRegisters - Note the given register. +void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity, + size_t LUIdx) { + std::pair Pair = + RegUses.insert(std::make_pair(Reg, RegSortData())); + RegSortData &BV = Pair.first->second; + if (Pair.second) { + BV.Index = CurrentArbitraryRegIndex++; + BV.MaxComplexity = Complexity; + RegSequence.push_back(Reg); } else { - assert(isa(CondOp0) && - "Unexpected loop exiting counting instruction sequence!"); - PHIExpr = cast(CondOp0); - // Value tested is preinc. Find the increment. - // A CmpInst is not a BinaryOperator; we depend on this. - Instruction::use_iterator UI = PHIExpr->use_begin(); - Incr = dyn_cast(UI); - if (!Incr) - Incr = dyn_cast(++UI); - // One use for postinc value, the phi. Unnecessarily conservative? - if (!Incr || !Incr->hasOneUse() || Incr->getOpcode() != Instruction::Add) - return false; + BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity); } + BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1)); + BV.Bits.set(LUIdx); +} - // Replace the increment with a decrement. - DEBUG(dbgs() << "LSR: Examining use "); - DEBUG(WriteAsOperand(dbgs(), CondOp0, /*PrintType=*/false)); - DEBUG(dbgs() << " in Inst: " << *Cond << '\n'); - BinaryOperator *Decr = BinaryOperator::Create(Instruction::Sub, - Incr->getOperand(0), Incr->getOperand(1), "tmp", Incr); - Incr->replaceAllUsesWith(Decr); - Incr->eraseFromParent(); - - // Substitute endval-startval for the original startval, and 0 for the - // original endval. Since we're only testing for equality this is OK even - // if the computation wraps around. - BasicBlock *Preheader = L->getLoopPreheader(); - Instruction *PreInsertPt = Preheader->getTerminator(); - unsigned InBlock = L->contains(PHIExpr->getIncomingBlock(0)) ? 1 : 0; - Value *StartVal = PHIExpr->getIncomingValue(InBlock); - Value *EndVal = Cond->getOperand(1); - DEBUG(dbgs() << " Optimize loop counting iv to count down [" - << *EndVal << " .. " << *StartVal << "]\n"); - - // FIXME: check for case where both are constant. - Constant* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0); - BinaryOperator *NewStartVal = BinaryOperator::Create(Instruction::Sub, - EndVal, StartVal, "tmp", PreInsertPt); - PHIExpr->setIncomingValue(InBlock, NewStartVal); - Cond->setOperand(1, Zero); - DEBUG(dbgs() << " New icmp: " << *Cond << "\n"); +/// CountRegisters - Note which registers are used by the given formula, +/// updating RegUses. +void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { + uint32_t Complexity = F.getComplexity(); + if (F.ScaledReg) + CountRegister(F.ScaledReg, Complexity, LUIdx); + for (SmallVectorImpl::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) + CountRegister(*I, Complexity, LUIdx); +} - int64_t SInt = cast(Stride)->getValue()->getSExtValue(); - const SCEV *NewStride = 0; - bool Found = false; - for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { - const SCEV *OldStride = IU->StrideOrder[i]; - if (const SCEVConstant *SC = dyn_cast(OldStride)) - if (SC->getValue()->getSExtValue() == -SInt) { - Found = true; - NewStride = OldStride; - break; - } +/// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic +/// offsets. +void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // We can't add a symbolic offset if the address already contains one. + if (Base.AM.BaseGV) return; + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *G = Base.BaseRegs[i]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero()) + continue; + Formula F = Base; + F.AM.BaseGV = GV; + if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) + continue; + F.BaseRegs[i] = G; + if (LU.InsertFormula(F)) + CountRegisters(LU.Formulae.back(), LUIdx); } +} + +/// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up +/// the comparison. For example, x == y -> x*c == y*c. +void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + if (LU.Kind != LSRUse::ICmpZero) return; + + // Determine the integer type for the base formula. + const Type *IntTy = Base.getType(); + if (!IntTy) return; + if (SE.getTypeSizeInBits(IntTy) > 64) return; + IntTy = SE.getEffectiveSCEVType(IntTy); + + assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); + + // Check each interesting stride. + for (SmallSetVector::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + Formula F = Base; + + // Check that the multiplication doesn't overflow. + F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor; + if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs) + continue; - if (!Found) - NewStride = SE->getIntegerSCEV(-SInt, Stride->getType()); - IU->AddUser(NewStride, CondUse->getOffset(), Cond, Cond->getOperand(0)); - IU->IVUsesByStride[Stride]->removeUser(CondUse); + // Check that this scale is legal. + if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) + continue; + + const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor)); - CondUse = &IU->IVUsesByStride[NewStride]->Users.back(); - Stride = NewStride; + // Check that multiplying with each base register doesn't overflow. + for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { + F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); + if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) + goto next; + } - ++NumCountZero; + // Check that multiplying with the scaled register doesn't overflow. + if (F.ScaledReg) { + F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); + if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) + continue; + } - return true; + // If we make it here and it's legal, add it. + if (LU.InsertFormula(F)) + CountRegisters(LU.Formulae.back(), LUIdx); + next:; + } } -/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding -/// when to exit the loop is used only for that purpose, try to rearrange things -/// so it counts down to a test against zero. -bool LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) { - bool ThisChanged = false; - for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { - const SCEV *Stride = IU->StrideOrder[i]; - std::map::iterator SI = - IU->IVUsesByStride.find(Stride); - assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - // FIXME: Generalize to non-affine IV's. - if (!SI->first->isLoopInvariant(L)) +/// GenerateFormulaeFromReplacedBaseReg - If removing base register with +/// index i from the BaseRegs list and adding the registers in AddOps +/// to the address forms an interesting formula, pursue it. +void +LSRInstance::GenerateFormulaeFromReplacedBaseReg( + LSRUse &LU, + unsigned LUIdx, + const Formula &Base, unsigned i, + const SmallVectorImpl + &AddOps) { + if (AddOps.empty()) return; + + Formula F = Base; + std::swap(F.BaseRegs[i], F.BaseRegs.back()); + F.BaseRegs.pop_back(); + for (SmallVectorImpl::const_iterator I = AddOps.begin(), + E = AddOps.end(); I != E; ++I) + F.BaseRegs.push_back(*I); + F.AM.HasBaseReg = !F.BaseRegs.empty(); + if (LU.InsertFormula(F)) { + CountRegisters(LU.Formulae.back(), LUIdx); + // Recurse. + GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back()); + } +} + +/// GenerateReassociationReuse - Split out subexpressions from adds and +/// the bases of addrecs. +void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + SmallVector AddOps; + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *BaseReg = Base.BaseRegs[i]; + if (const SCEVAddExpr *Add = dyn_cast(BaseReg)) { + for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end(); + J != JE; ++J) { + // Don't pull a constant into a register if the constant could be + // folded into an immediate field. + if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue; + SmallVector InnerAddOps; + for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K) + if (K != J) + InnerAddOps.push_back(*K); + // Splitting a 2-operand add both ways is redundant. Pruning this + // now saves compile time. + if (InnerAddOps.size() < 2 && next(J) == JE) + continue; + AddOps.push_back(*J); + const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps); + AddOps.push_back(InnerAdd); + GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); + AddOps.clear(); + } + } else if (const SCEVAddRecExpr *AR = dyn_cast(BaseReg)) { + if (const SCEVAddExpr *Add = dyn_cast(AR->getStart())) { + for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end(); + J != JE; ++J) { + // Don't pull a constant into a register if the constant could be + // folded into an immediate field. + if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) + continue; + SmallVector InnerAddOps; + for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K) + if (K != J) + InnerAddOps.push_back(*K); + AddOps.push_back(*J); + const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps); + AddOps.push_back(SE.getAddRecExpr(InnerAdd, + AR->getStepRecurrence(SE), + AR->getLoop())); + GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); + AddOps.clear(); + } + } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1, + LU.Kind, LU.AccessTy, + TLI, SE)) { + AddOps.push_back(AR->getStart()); + AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0, + BaseReg->getType()), + AR->getStepRecurrence(SE), + AR->getLoop())); + GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); + AddOps.clear(); + } + } + } +} + +/// GenerateCombinationReuse - Generate a formula consisting of all of the +/// loop-dominating registers added into a single register. +void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + if (Base.BaseRegs.size() <= 1) return; + + Formula F = Base; + F.BaseRegs.clear(); + SmallVector Ops; + for (SmallVectorImpl::const_iterator + I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + if (BaseReg->properlyDominates(L->getHeader(), &DT) && + !BaseReg->hasComputableLoopEvolution(L)) + Ops.push_back(BaseReg); + else + F.BaseRegs.push_back(BaseReg); + } + if (Ops.size() > 1) { + F.BaseRegs.push_back(SE.getAddExpr(Ops)); + if (LU.InsertFormula(F)) + CountRegisters(LU.Formulae.back(), LUIdx); + } +} + +/// GenerateScaledReuse - Generate stride factor reuse formulae by making +/// use of scaled-offset address modes, for example. +void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // Determine the integer type for the base formula. + const Type *IntTy = Base.getType(); + if (!IntTy) return; + IntTy = SE.getEffectiveSCEVType(IntTy); + + // Check each interesting stride. + for (SmallSetVector::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + + // If this Formula already has a scaled register, we can't add another one. + if (Base.AM.Scale != 0) continue; - // If stride is a constant and it has an icmpinst use, check if we can - // optimize the loop to count down. - if (isa(Stride) && SI->second->Users.size() == 1) { - Instruction *User = SI->second->Users.begin()->getUser(); - if (!isa(User)) - continue; - const SCEV *CondStride = Stride; - IVStrideUse *Use = &*SI->second->Users.begin(); - if (!OptimizeLoopCountIVOfStride(CondStride, Use, L)) + Formula F = Base; + F.AM.Scale = Factor; + // Check whether this scale is going to be legal. + if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) { + // As a special-case, handle special out-of-loop Basic users specially. + // TODO: Reconsider this special case. + if (LU.Kind == LSRUse::Basic && + isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) && + !L->contains(LU.UserInst)) + LU.Kind = LSRUse::Special; + else continue; - ThisChanged = true; + } + // For each addrec base reg, apply the scale, if possible. + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + if (const SCEVAddRecExpr *AR = + dyn_cast(Base.BaseRegs[i])) { + const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor)); + // Divide out the factor, ignoring high bits, since we'll be + // scaling the value back up in the end. + if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) { + // TODO: This could be optimized to avoid all the copying. + Formula NewF = F; + NewF.ScaledReg = Quotient; + std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back()); + NewF.BaseRegs.pop_back(); + NewF.AM.HasBaseReg = !NewF.BaseRegs.empty(); + if (LU.InsertFormula(NewF)) + CountRegisters(LU.Formulae.back(), LUIdx); + } + } + } +} + +/// GenerateTruncateReuse - Generate reuse formulae from different IV types. +void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // This requires TargetLowering to tell us which truncates are free. + if (!TLI) return; + + // Don't attempt to truncate symbolic values. + if (Base.AM.BaseGV) return; + + // Determine the integer type for the base formula. + const Type *DstTy = Base.getType(); + if (!DstTy) return; + DstTy = SE.getEffectiveSCEVType(DstTy); + + for (SmallSetVector::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + const Type *SrcTy = *I; + if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { + Formula F = Base; + if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); + for (SmallVectorImpl::iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) + *J = SE.getAnyExtendExpr(*J, SrcTy); + if (LU.InsertFormula(F)) + CountRegisters(LU.Formulae.back(), LUIdx); + } + } +} + +namespace { + +/// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to +/// defer modifications so that the search phase doesn't have to worry about +/// the data structures moving underneath it. +struct WorkItem { + LSRUse *LU; + size_t LUIdx; + int64_t Imm; + const SCEV *OrigReg; + + WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R) + : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {} + + void print(raw_ostream &OS) const; + void dump() const; +}; + +void WorkItem::print(raw_ostream &OS) const { + OS << "in use "; + LU->print(OS); + OS << " (at index " << LUIdx << "), add offset " << Imm + << " and compensate by adjusting refences to " << *OrigReg << "\n"; +} - // Now check if it's possible to reuse this iv for other stride uses. - for (unsigned j = 0, ee = IU->StrideOrder.size(); j != ee; ++j) { - const SCEV *SStride = IU->StrideOrder[j]; - if (SStride == CondStride) +void WorkItem::dump() const { + print(errs()); errs() << '\n'; +} + +} + +/// GenerateConstantOffsetReuse - Look for registers which are a constant +/// distance apart and try to form reuse opportunities between them. +void LSRInstance::GenerateConstantOffsetReuse() { + // Group the registers by their value without any added constant offset. + typedef std::map ImmMapTy; + typedef DenseMap RegMapTy; + RegMapTy Map; + SmallVector Sequence; + for (SmallVectorImpl::iterator I = RegSequence.begin(), + E = RegSequence.end(); I != E; ++I) { + const SCEV *Reg = *I; + int64_t Imm = ExtractImmediate(Reg, SE); + std::pair Pair = + Map.insert(std::make_pair(Reg, ImmMapTy())); + if (Pair.second) + Sequence.push_back(Reg); + Pair.first->second.insert(std::make_pair(Imm, *I)); + } + + // Insert an artificial expression at offset 0 (if there isn't one already), + // as this may lead to more reuse opportunities. + for (SmallVectorImpl::const_iterator I = Sequence.begin(), + E = Sequence.end(); I != E; ++I) + Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0)); + + // Now examine each set of registers with the same base value. Build up + // a list of work to do and do the work in a separate step so that we're + // not adding formulae and register counts while we're searching. + SmallVector WorkItems; + for (SmallVectorImpl::const_iterator I = Sequence.begin(), + E = Sequence.end(); I != E; ++I) { + const SCEV *Reg = *I; + const ImmMapTy &Imms = Map.find(Reg)->second; + // Examine each offset. + for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); + J != JE; ++J) { + const SCEV *OrigReg = J->second; + // Skip the artifical register at offset 0. + if (!OrigReg) continue; + + int64_t JImm = J->first; + const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits; + + // Examine each other offset associated with the same register. This is + // quadradic in the number of registers with the same base, but it's + // uncommon for this to be a large number. + for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) { + if (M == J) continue; + + // Compute the difference between the two. + int64_t Imm = (uint64_t)JImm - M->first; + for (int LUIdx = Bits.find_first(); LUIdx != -1; + LUIdx = Bits.find_next(LUIdx)) + // Make a memo of this use, offset, and register tuple. + WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg)); + } + } + } + + // Now iterate through the worklist and add new formulae. + for (SmallVectorImpl::const_iterator I = WorkItems.begin(), + E = WorkItems.end(); I != E; ++I) { + const WorkItem &WI = *I; + LSRUse &LU = *WI.LU; + size_t LUIdx = WI.LUIdx; + int64_t Imm = WI.Imm; + const SCEV *OrigReg = WI.OrigReg; + + const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, + -(uint64_t)Imm)); + + for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { + Formula F = LU.Formulae[L]; + // Use the immediate in the scaled register. + if (F.ScaledReg == OrigReg) { + int64_t Offs = (uint64_t)F.AM.BaseOffs + + Imm * (uint64_t)F.AM.Scale; + // Don't create 50 + reg(-50). + if (F.referencesReg(SE.getSCEV( + ConstantInt::get(IntTy, -(uint64_t)Offs)))) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = Offs; + if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI)) + continue; + const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg); + if (Diff->isZero()) continue; + NewF.ScaledReg = Diff; + if (LU.InsertFormula(NewF)) + CountRegisters(LU.Formulae.back(), LUIdx); + } + // Use the immediate in a base register. + for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { + const SCEV *BaseReg = F.BaseRegs[N]; + if (BaseReg != OrigReg) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; + if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI)) continue; - std::map::iterator SII = - IU->IVUsesByStride.find(SStride); - assert(SII != IU->IVUsesByStride.end() && "Stride doesn't exist!"); - // FIXME: Generalize to non-affine IV's. - if (!SII->first->isLoopInvariant(L)) + const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg); + if (Diff->isZero()) continue; + // Don't create 50 + reg(-50). + if (Diff == + SE.getSCEV(ConstantInt::get(IntTy, + -(uint64_t)NewF.AM.BaseOffs))) continue; - // FIXME: Rewrite other stride using CondStride. + NewF.BaseRegs[N] = Diff; + if (LU.InsertFormula(NewF)) + CountRegisters(LU.Formulae.back(), LUIdx); } } } +} + +/// GenerateAllReuseFormulae - Generate formulae for each use. +void +LSRInstance::GenerateAllReuseFormulae() { + SmallVector Save; + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]); + } + + GenerateConstantOffsetReuse(); +} + +/// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant +/// values which we're tracking. These other uses will pin these values in +/// registers, making them less profitable for elimination. +/// TODO: This currently misses non-constant addrec step registers. +/// TODO: Should this give more weight to users inside the loop? +void +LSRInstance::GenerateLoopInvariantRegisterUses() { + for (size_t i = 0, e = RegSequence.size(); i != e; ++i) + if (const SCEVUnknown *U = dyn_cast(RegSequence[i])) { + const Value *V = U->getValue(); + if (const Instruction *Inst = dyn_cast(V)) + if (L->contains(Inst)) continue; + for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + const Instruction *UserInst = dyn_cast(*UI); + // Ignore non-instructions. + if (!UserInst) + continue; + // Ignore instructions in other functions (as can happen with + // Constants). + if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) + continue; + // Ignore instructions not dominated by the loop. + const BasicBlock *UseBB = !isa(UserInst) ? + UserInst->getParent() : + cast(UserInst)->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); + if (!DT.dominates(L->getHeader(), UseBB)) + continue; + // Ignore uses which are part of other SCEV expressions, to avoid + // analyzing them multiple times. + if (SE.isSCEVable(UserInst->getType()) && + !isa(SE.getSCEV(const_cast(UserInst)))) + continue; + // Ignore icmp instructions which are already being analyzed. + if (const ICmpInst *ICI = dyn_cast(UserInst)) { + unsigned OtherIdx = !UI.getOperandNo(); + Value *OtherOp = const_cast(ICI->getOperand(OtherIdx)); + if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L)) + continue; + } + + LSRUse &LU = getNewUse(); + LU.UserInst = const_cast(UserInst); + LU.OperandValToReplace = UI.getUse(); + LU.InsertSupplementalFormula(U); + CountRegisters(LU.Formulae.back(), Uses.size() - 1); + } + } +} - Changed |= ThisChanged; - return ThisChanged; +#ifndef NDEBUG + +static void debug_winner(SmallVector const &Uses) { + dbgs() << "LSR has selected formulae for each use:\n"; + for (SmallVectorImpl::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + dbgs() << " "; + LU.print(dbgs()); + dbgs() << '\n'; + dbgs() << " "; + LU.Formulae.front().print(dbgs()); + dbgs() << "\n"; + } } -bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { - IU = &getAnalysis(); - SE = &getAnalysis(); - Changed = false; +#endif + +LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) + : IU(P->getAnalysis()), + SE(P->getAnalysis()), + DT(P->getAnalysis()), + TLI(tli), L(l), Changed(false), CurrentArbitraryRegIndex(0) { // If LoopSimplify form is not available, stay out of trouble. - if (!L->getLoopPreheader() || !L->getLoopLatch()) - return false; + if (!L->isLoopSimplifyForm()) return; + + // If there's no interesting work to be done, bail early. + if (IU.IVUsesByStride.empty()) return; + + DEBUG(dbgs() << "\nLSR on loop "; + WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); + dbgs() << ":\n"); + + // Sort the StrideOrder so we process larger strides first. + std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(), + StrideCompare(SE)); + + /// OptimizeShadowIV - If IV is used in a int-to-float cast + /// inside the loop then try to eliminate the cast opeation. + OptimizeShadowIV(); + + // Change loop terminating condition to use the postinc iv when possible. + Instruction *IVIncInsertPos; + Changed |= OptimizeLoopTermCond(IVIncInsertPos); + + for (SmallVectorImpl::const_iterator SIter = + IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end(); + SIter != SEnd; ++SIter) { + const SCEV *Stride = *SIter; + + // Collect interesting types. + Types.insert(SE.getEffectiveSCEVType(Stride->getType())); + + // Collect interesting factors. + for (SmallVectorImpl::const_iterator NewStrideIter = + SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) { + const SCEV *OldStride = Stride; + const SCEV *NewStride = *NewStrideIter; + + if (SE.getTypeSizeInBits(OldStride->getType()) != + SE.getTypeSizeInBits(NewStride->getType())) { + if (SE.getTypeSizeInBits(OldStride->getType()) > + SE.getTypeSizeInBits(NewStride->getType())) + NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); + else + OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); + } + if (const SCEVConstant *Factor = + dyn_cast_or_null(getSDiv(NewStride, OldStride, + SE, true))) + if (Factor->getValue()->getValue().getMinSignedBits() <= 64) + Factors.insert(Factor->getValue()->getValue().getSExtValue()); + } + + std::map::const_iterator SI = + IU.IVUsesByStride.find(Stride); + assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); + for (ilist::const_iterator UI = SI->second->Users.begin(), + E = SI->second->Users.end(); UI != E; ++UI) { + // Record the uses. + LSRUse &LU = getNewUse(); + LU.UserInst = UI->getUser(); + LU.OperandValToReplace = UI->getOperandValToReplace(); + if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) { + LU.Kind = LSRUse::Address; + LU.AccessTy = getAccessType(LU.UserInst); + } + if (UI->isUseOfPostIncrementedValue()) + LU.PostIncLoop = L; + + const SCEV *S = IU.getCanonicalExpr(*UI); + + // Equality (== and !=) ICmps are special. We can rewrite (i == N) as + // (N - i == 0), and this allows (N - i) to be the expression that we + // work with rather than just N or i, so we can consider the register + // requirements for both N and i at the same time. Limiting this code + // to equality icmps is not a problem because all interesting loops + // use equality icmps, thanks to IndVarSimplify. + if (ICmpInst *CI = dyn_cast(LU.UserInst)) + if (CI->isEquality()) { + // Swap the operands if needed to put the OperandValToReplace on + // the left, for consistency. + Value *NV = CI->getOperand(1); + if (NV == LU.OperandValToReplace) { + CI->setOperand(1, CI->getOperand(0)); + CI->setOperand(0, NV); + } - if (!IU->IVUsesByStride.empty()) { - DEBUG(dbgs() << "\nLSR on \"" << L->getHeader()->getParent()->getName() - << "\" "; - L->print(dbgs())); + // x == y --> x - y == 0 + const SCEV *N = SE.getSCEV(NV); + if (N->isLoopInvariant(L)) { + LU.Kind = LSRUse::ICmpZero; + S = SE.getMinusSCEV(N, S); + } + + // -1 and the negations of all interesting strides (except the + // negation of -1) are now also interesting. + for (size_t i = 0, e = Factors.size(); i != e; ++i) + if (Factors[i] != -1) + Factors.insert(-(uint64_t)Factors[i]); + Factors.insert(-1); + } - // Sort the StrideOrder so we process larger strides first. - std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(), - StrideCompare(SE)); + // Ok, now enumerate all the different formulae we can find to compute + // the value for this expression. + LU.InsertInitialFormula(S, L, SE, DT); + CountRegisters(LU.Formulae.back(), Uses.size() - 1); + } + } - // Optimize induction variables. Some indvar uses can be transformed to use - // strides that will be needed for other purposes. A common example of this - // is the exit test for the loop, which can often be rewritten to use the - // computation of some other indvar to decide when to terminate the loop. - OptimizeIndvars(L); + // If all uses use the same type, don't bother looking for truncation-based + // reuse. + if (Types.size() == 1) + Types.clear(); + + // Now use the reuse data to generate a bunch of interesting ways + // to formulate the values needed for the uses. + GenerateAllReuseFormulae(); + + // If there are any uses of registers that we're tracking that have escaped + // IVUsers' attention, add trivial uses for them, so that the register + // voting process takes the into consideration. + GenerateLoopInvariantRegisterUses(); + + // Sort the formulae. TODO: This is redundantly sorted below. + for (SmallVectorImpl::iterator I = Uses.begin(), E = Uses.end(); + I != E; ++I) { + LSRUse &LU = *I; + std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(), + ComplexitySorter()); + } - // Change loop terminating condition to use the postinc iv when possible - // and optimize loop terminating compare. FIXME: Move this after - // StrengthReduceIVUsersOfStride? - OptimizeLoopTermCond(L); + // Ok, we've now collected all the uses and noted their register uses. The + // next step is to start looking at register reuse possibilities. + DEBUG(print(dbgs()); dbgs() << '\n'); + + // Start by assuming we'll assign each use its own register. This is + // sometimes called "full" strength reduction, or "superhero" mode. + // Sometimes this is the best solution, but if there are opportunities for + // reuse we may find a better solution. + Score CurScore; + CurScore.RateInitial(Uses, L, SE); + + // Create a sorted list of registers with those with the most uses appearing + // earlier in the list. We'll visit them first, as they're the most likely + // to represent profitable reuse opportunities. + SmallVector RegOrder; + for (SmallVectorImpl::const_iterator I = + RegSequence.begin(), E = RegSequence.end(); I != E; ++I) + RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second)); + std::stable_sort(RegOrder.begin(), RegOrder.end()); + + // Visit each register. Determine which ones represent profitable reuse + // opportunities and remember them. + // TODO: Extract this code into a function. + for (SmallVectorImpl::const_iterator I = RegOrder.begin(), + E = RegOrder.end(); I != E; ++I) { + const SCEV *Reg = I->Reg; + const SmallBitVector &Bits = I->Sort.Bits; + + // Registers with only one use don't represent reuse opportunities, so + // when we get there, we're done. + if (Bits.count() <= 1) break; + + DEBUG(dbgs() << "Reg " << *Reg << ": "; + I->Sort.print(dbgs()); + dbgs() << '\n'); + + // Determine the total number of registers will be needed if we make use + // of the reuse opportunity represented by the current register. + Score NewScore; + NewScore.Rate(Reg, Bits, Uses, L, SE); + + // Now decide whether this register's reuse opportunity is an overall win. + // Currently the decision is heavily skewed for register pressure. + if (!(NewScore < CurScore)) { + continue; + } - // FIXME: We can shrink overlarge IV's here. e.g. if the code has - // computation in i64 values and the target doesn't support i64, demote - // the computation to 32-bit if safe. + // Ok, use this opportunity. + DEBUG(dbgs() << "This candidate has been accepted.\n"); + CurScore = NewScore; + + // Now that we've selected a new reuse opportunity, delete formulae that + // do not participate in that opportunity. + for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) { + LSRUse &LU = Uses[j]; + for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) { + Formula &F = LU.Formulae[k]; + if (!F.referencesReg(Reg)) { + std::swap(LU.Formulae[k], LU.Formulae.back()); + LU.Formulae.pop_back(); + --k; --h; + } + } + // Also re-sort the list to put the formulae with the fewest registers + // at the front. + // TODO: Do this earlier, we don't need it each time. + std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(), + ComplexitySorter()); + } + } - // FIXME: Attempt to reuse values across multiple IV's. In particular, we - // could have something like "for(i) { foo(i*8); bar(i*16) }", which should - // be codegened as "for (j = 0;; j+=8) { foo(j); bar(j+j); }" on X86/PPC. - // Need to be careful that IV's are all the same type. Only works for - // intptr_t indvars. + // Ok, we've now made all our decisions. The first formula for each use + // will be used. + DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs()); + dbgs() << ".\n"; + debug_winner(Uses)); + + // Free memory no longer needed. + RegOrder.clear(); + Factors.clear(); + Types.clear(); + RegUses.clear(); + RegSequence.clear(); + + // Keep track of instructions we may have made dead, so that + // we can remove them after we are done working. + SmallVector DeadInsts; + + SCEVExpander Rewriter(SE); + Rewriter.disableCanonicalMode(); + Rewriter.setIVIncInsertPos(L, IVIncInsertPos); + + // Expand the new value definitions and update the users. + for (SmallVectorImpl::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + // Formulae should be legal. + DEBUG(for (SmallVectorImpl::const_iterator J = I->Formulae.begin(), + JE = I->Formulae.end(); J != JE; ++J) + assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) && + "Illegal formula generated!")); + + // Expand the new code and update the user. + I->Rewrite(L, Rewriter, DeadInsts, SE, DT, P); + Changed = true; + } - // IVsByStride keeps IVs for one particular loop. - assert(IVsByStride.empty() && "Stale entries in IVsByStride?"); + // Clean up after ourselves. This must be done before deleting any + // instructions. + Rewriter.clear(); - StrengthReduceIVUsers(L); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); +} - // After all sharing is done, see if we can adjust the loop to test against - // zero instead of counting up to a maximum. This is usually faster. - OptimizeLoopCountIV(L); +void LSRInstance::print(raw_ostream &OS) const { + OS << "LSR has identified the following interesting factors and types: "; + bool First = true; - // We're done analyzing this loop; release all the state we built up for it. - IVsByStride.clear(); + for (SmallSetVector::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '*' << *I; + } - // Clean up after ourselves - DeleteTriviallyDeadInstructions(); + for (SmallSetVector::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '(' << **I << ')'; } + OS << '\n'; + + OS << "LSR is examining the following uses, and candidate formulae:\n"; + for (SmallVectorImpl::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + dbgs() << " "; + LU.print(OS); + OS << '\n'; + for (SmallVectorImpl::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); J != JE; ++J) { + OS << " "; + J->print(OS); + OS << "\n"; + } + } +} + +void LSRInstance::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +class LoopStrengthReduce : public LoopPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *const TLI; + +public: + static char ID; // Pass ID, replacement for typeid + explicit LoopStrengthReduce(const TargetLowering *tli = NULL); + +private: + bool runOnLoop(Loop *L, LPPassManager &LPM); + void getAnalysisUsage(AnalysisUsage &AU) const; +}; + +} + +char LoopStrengthReduce::ID = 0; +static RegisterPass +X("loop-reduce", "Loop Strength Reduction"); + +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); +} + +LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) + : LoopPass(&ID), TLI(tli) {} + +void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { + // We split critical edges, so we change the CFG. However, we do update + // many analyses if they are around. + AU.addPreservedID(LoopSimplifyID); + AU.addPreserved(); + AU.addPreserved("domfrontier"); + + AU.addRequiredID(LoopSimplifyID); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); +} + +bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { + bool Changed = false; + + // Run the main LSR transformation. + Changed |= LSRInstance(TLI, L, this).getChanged(); // At this point, it is worth checking to see if any recurrence PHIs are also // dead, so that we can remove them as well. diff --git a/test/CodeGen/ARM/arm-negative-stride.ll b/test/CodeGen/ARM/arm-negative-stride.ll index 72ec8efcc44..52ab8717c15 100644 --- a/test/CodeGen/ARM/arm-negative-stride.ll +++ b/test/CodeGen/ARM/arm-negative-stride.ll @@ -1,7 +1,32 @@ ; RUN: llc < %s -march=arm | FileCheck %s +; This loop is rewritten with an indvar which counts down, which +; frees up a register from holding the trip count. + define void @test(i32* %P, i32 %A, i32 %i) nounwind { entry: +; CHECK: str r1, [{{r.*}}, +{{r.*}}, lsl #2] + icmp eq i32 %i, 0 ; :0 [#uses=1] + br i1 %0, label %return, label %bb + +bb: ; preds = %bb, %entry + %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=2] + %i_addr.09.0 = sub i32 %i, %indvar ; [#uses=1] + %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0 ; [#uses=1] + store i32 %A, i32* %tmp2 + %indvar.next = add i32 %indvar, 1 ; [#uses=2] + icmp eq i32 %indvar.next, %i ; :1 [#uses=1] + br i1 %1, label %return, label %bb + +return: ; preds = %bb, %entry + ret void +} + +; This loop has a non-address use of the count-up indvar, so +; it'll remain. Now the original store uses a negative-stride address. + +define void @test_with_forced_iv(i32* %P, i32 %A, i32 %i) nounwind { +entry: ; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2] icmp eq i32 %i, 0 ; :0 [#uses=1] br i1 %0, label %return, label %bb @@ -11,6 +36,7 @@ bb: ; preds = %bb, %entry %i_addr.09.0 = sub i32 %i, %indvar ; [#uses=1] %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0 ; [#uses=1] store i32 %A, i32* %tmp2 + store i32 %indvar, i32* null %indvar.next = add i32 %indvar, 1 ; [#uses=2] icmp eq i32 %indvar.next, %i ; :1 [#uses=1] br i1 %1, label %return, label %bb diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll index 507ec2c7bd3..1bbb96deeef 100644 --- a/test/CodeGen/ARM/lsr-code-insertion.ll +++ b/test/CodeGen/ARM/lsr-code-insertion.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed} -; RUN: llc < %s -stats |& grep {.*Number of re-materialization} +; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed} +; RUN: llc < %s -stats |& not grep {.*Number of re-materialization} ; This test really wants to check that the resultant "cond_true" block only ; has a single store in it, and that cond_true55 only has code to materialize ; the constant and do a store. We do *not* want something like this: diff --git a/test/CodeGen/ARM/remat.ll b/test/CodeGen/ARM/remat.ll index 9565c8bca6b..9072bcb762d 100644 --- a/test/CodeGen/ARM/remat.ll +++ b/test/CodeGen/ARM/remat.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -mtriple=arm-apple-darwin -; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | grep "Number of re-materialization" | grep 3 +; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | not grep "Number of re-materialization" %struct.CONTENTBOX = type { i32, i32, i32, i32, i32 } %struct.LOCBOX = type { i32, i32, i32, i32 } diff --git a/test/CodeGen/Thumb2/lsr-deficiency.ll b/test/CodeGen/Thumb2/lsr-deficiency.ll index 7b1b57a786e..ac2cd34e4b3 100644 --- a/test/CodeGen/Thumb2/lsr-deficiency.ll +++ b/test/CodeGen/Thumb2/lsr-deficiency.ll @@ -1,25 +1,29 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s ; rdar://7387640 -; FIXME: We still need to rewrite array reference iv of stride -4 with loop -; count iv of stride -1. +; This now reduces to a single induction variable. + +; TODO: It still gets a GPR shuffle at the end of the loop +; This is because something in instruction selection has decided +; that comparing the pre-incremented value with zero is better +; than comparing the post-incremented value with -4. @G = external global i32 ; [#uses=2] @array = external global i32* ; [#uses=1] define arm_apcscc void @t() nounwind optsize { ; CHECK: t: -; CHECK: mov.w r2, #4000 -; CHECK: movw r3, #1001 +; CHECK: mov.w r2, #1000 entry: %.pre = load i32* @G, align 4 ; [#uses=1] br label %bb bb: ; preds = %bb, %entry ; CHECK: LBB1_1: -; CHECK: subs r3, #1 -; CHECK: cmp r3, #0 -; CHECK: sub.w r2, r2, #4 +; CHECK: cmp r2, #0 +; CHECK: sub.w r9, r2, #1 +; CHECK: mov r2, r9 + %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ] ; [#uses=1] %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=2] %tmp5 = sub i32 1000, %indvar ; [#uses=1] diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll index 71199abc572..1d267565e06 100644 --- a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll +++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s -define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) { +define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; CHECK: t1: ; CHECK: it ne ; CHECK: cmpne @@ -20,12 +20,12 @@ cond_next: } ; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt. -define i32 @t2(i32 %a, i32 %b) { +define i32 @t2(i32 %a, i32 %b) nounwind { entry: ; CHECK: t2: -; CHECK: ite le -; CHECK: suble +; CHECK: ite gt ; CHECK: subgt +; CHECK: suble %tmp1434 = icmp eq i32 %a, %b ; [#uses=1] br i1 %tmp1434, label %bb17, label %bb.outer @@ -60,14 +60,14 @@ bb17: ; preds = %cond_false, %cond_true, %entry @x = external global i32* ; [#uses=1] -define void @foo(i32 %a) { +define void @foo(i32 %a) nounwind { entry: %tmp = load i32** @x ; [#uses=1] store i32 %a, i32* %tmp ret void } -define void @t3(i32 %a, i32 %b) { +define void @t3(i32 %a, i32 %b) nounwind { entry: ; CHECK: t3: ; CHECK: it lt diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll index bdbe713a295..56d6aa960e2 100644 --- a/test/CodeGen/X86/2006-05-11-InstrSched.ll +++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\ -; RUN: grep {asm-printer} | grep 31 +; RUN: grep {asm-printer} | grep 34 target datalayout = "e-p:32:32" define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind { @@ -40,7 +40,7 @@ cond_true: ; preds = %cond_true, %entry %tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>* ; <<2 x i64>*> [#uses=1] store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7 %tmp147 = add nsw i32 %tmp.10, 8 ; [#uses=1] - %tmp.upgrd.8 = icmp slt i32 %tmp147, %M ; [#uses=1] + %tmp.upgrd.8 = icmp ne i32 %tmp147, %M ; [#uses=1] %indvar.next = add i32 %indvar, 1 ; [#uses=1] br i1 %tmp.upgrd.8, label %cond_true, label %return diff --git a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll b/test/CodeGen/X86/2007-08-13-SpillerReuse.ll deleted file mode 100644 index d6ea5109d1f..00000000000 --- a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll +++ /dev/null @@ -1,102 +0,0 @@ -; RUN: llc < %s -mtriple=i686-apple-darwin | grep "48(%esp)" | count 5 - - %struct..0anon = type { i32 } - %struct.rtvec_def = type { i32, [1 x %struct..0anon] } - %struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] } -@rtx_format = external global [116 x i8*] ; <[116 x i8*]*> [#uses=1] -@rtx_length = external global [117 x i32] ; <[117 x i32]*> [#uses=1] - -declare %struct.rtx_def* @fixup_memory_subreg(%struct.rtx_def*, %struct.rtx_def*, i32) - -define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) { -entry: - %tmp2 = icmp eq %struct.rtx_def* %x, null ; [#uses=1] - br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next - -cond_next: ; preds = %entry - %tmp6 = getelementptr %struct.rtx_def* %x, i32 0, i32 0 ; [#uses=1] - %tmp7 = load i16* %tmp6 ; [#uses=2] - %tmp78 = zext i16 %tmp7 to i32 ; [#uses=2] - %tmp10 = icmp eq i16 %tmp7, 54 ; [#uses=1] - br i1 %tmp10, label %cond_true13, label %cond_next32 - -cond_true13: ; preds = %cond_next - %tmp15 = getelementptr %struct.rtx_def* %x, i32 0, i32 3 ; <[1 x %struct..0anon]*> [#uses=1] - %tmp1718 = bitcast [1 x %struct..0anon]* %tmp15 to %struct.rtx_def** ; <%struct.rtx_def**> [#uses=1] - %tmp19 = load %struct.rtx_def** %tmp1718 ; <%struct.rtx_def*> [#uses=1] - %tmp20 = getelementptr %struct.rtx_def* %tmp19, i32 0, i32 0 ; [#uses=1] - %tmp21 = load i16* %tmp20 ; [#uses=1] - %tmp22 = icmp eq i16 %tmp21, 57 ; [#uses=1] - br i1 %tmp22, label %cond_true25, label %cond_next32 - -cond_true25: ; preds = %cond_true13 - %tmp29 = tail call %struct.rtx_def* @fixup_memory_subreg( %struct.rtx_def* %x, %struct.rtx_def* %insn, i32 1 ) ; <%struct.rtx_def*> [#uses=1] - ret %struct.rtx_def* %tmp29 - -cond_next32: ; preds = %cond_true13, %cond_next - %tmp34 = getelementptr [116 x i8*]* @rtx_format, i32 0, i32 %tmp78 ; [#uses=1] - %tmp35 = load i8** %tmp34, align 4 ; [#uses=1] - %tmp37 = getelementptr [117 x i32]* @rtx_length, i32 0, i32 %tmp78 ; [#uses=1] - %tmp38 = load i32* %tmp37, align 4 ; [#uses=1] - %i.011 = add i32 %tmp38, -1 ; [#uses=2] - %tmp12513 = icmp sgt i32 %i.011, -1 ; [#uses=1] - br i1 %tmp12513, label %bb, label %UnifiedReturnBlock - -bb: ; preds = %bb123, %cond_next32 - %indvar = phi i32 [ %indvar.next26, %bb123 ], [ 0, %cond_next32 ] ; [#uses=2] - %i.01.0 = sub i32 %i.011, %indvar ; [#uses=5] - %tmp42 = getelementptr i8* %tmp35, i32 %i.01.0 ; [#uses=2] - %tmp43 = load i8* %tmp42 ; [#uses=1] - switch i8 %tmp43, label %bb123 [ - i8 101, label %cond_true47 - i8 69, label %bb105.preheader - ] - -cond_true47: ; preds = %bb - %tmp52 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0 ; <%struct..0anon*> [#uses=1] - %tmp5354 = bitcast %struct..0anon* %tmp52 to %struct.rtx_def** ; <%struct.rtx_def**> [#uses=1] - %tmp55 = load %struct.rtx_def** %tmp5354 ; <%struct.rtx_def*> [#uses=1] - %tmp58 = tail call %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp55, %struct.rtx_def* %insn ) ; <%struct.rtx_def*> [#uses=1] - %tmp62 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0, i32 0 ; [#uses=1] - %tmp58.c = ptrtoint %struct.rtx_def* %tmp58 to i32 ; [#uses=1] - store i32 %tmp58.c, i32* %tmp62 - %tmp6816 = load i8* %tmp42 ; [#uses=1] - %tmp6917 = icmp eq i8 %tmp6816, 69 ; [#uses=1] - br i1 %tmp6917, label %bb105.preheader, label %bb123 - -bb105.preheader: ; preds = %cond_true47, %bb - %tmp11020 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0 ; <%struct..0anon*> [#uses=1] - %tmp11111221 = bitcast %struct..0anon* %tmp11020 to %struct.rtvec_def** ; <%struct.rtvec_def**> [#uses=3] - %tmp11322 = load %struct.rtvec_def** %tmp11111221 ; <%struct.rtvec_def*> [#uses=1] - %tmp11423 = getelementptr %struct.rtvec_def* %tmp11322, i32 0, i32 0 ; [#uses=1] - %tmp11524 = load i32* %tmp11423 ; [#uses=1] - %tmp11625 = icmp eq i32 %tmp11524, 0 ; [#uses=1] - br i1 %tmp11625, label %bb123, label %bb73 - -bb73: ; preds = %bb73, %bb105.preheader - %j.019 = phi i32 [ %tmp104, %bb73 ], [ 0, %bb105.preheader ] ; [#uses=3] - %tmp81 = load %struct.rtvec_def** %tmp11111221 ; <%struct.rtvec_def*> [#uses=2] - %tmp92 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019 ; <%struct..0anon*> [#uses=1] - %tmp9394 = bitcast %struct..0anon* %tmp92 to %struct.rtx_def** ; <%struct.rtx_def**> [#uses=1] - %tmp95 = load %struct.rtx_def** %tmp9394 ; <%struct.rtx_def*> [#uses=1] - %tmp98 = tail call %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp95, %struct.rtx_def* %insn ) ; <%struct.rtx_def*> [#uses=1] - %tmp101 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019, i32 0 ; [#uses=1] - %tmp98.c = ptrtoint %struct.rtx_def* %tmp98 to i32 ; [#uses=1] - store i32 %tmp98.c, i32* %tmp101 - %tmp104 = add i32 %j.019, 1 ; [#uses=2] - %tmp113 = load %struct.rtvec_def** %tmp11111221 ; <%struct.rtvec_def*> [#uses=1] - %tmp114 = getelementptr %struct.rtvec_def* %tmp113, i32 0, i32 0 ; [#uses=1] - %tmp115 = load i32* %tmp114 ; [#uses=1] - %tmp116 = icmp ult i32 %tmp104, %tmp115 ; [#uses=1] - br i1 %tmp116, label %bb73, label %bb123 - -bb123: ; preds = %bb73, %bb105.preheader, %cond_true47, %bb - %i.0 = add i32 %i.01.0, -1 ; [#uses=1] - %tmp125 = icmp sgt i32 %i.0, -1 ; [#uses=1] - %indvar.next26 = add i32 %indvar, 1 ; [#uses=1] - br i1 %tmp125, label %bb, label %UnifiedReturnBlock - -UnifiedReturnBlock: ; preds = %bb123, %cond_next32, %entry - %UnifiedRetVal = phi %struct.rtx_def* [ null, %entry ], [ %x, %cond_next32 ], [ %x, %bb123 ] ; <%struct.rtx_def*> [#uses=1] - ret %struct.rtx_def* %UnifiedRetVal -} diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll index 721d4c945b1..8e315f4d80f 100644 --- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll +++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll @@ -35,7 +35,7 @@ cond_next36.i: ; preds = %cond_next.i bb.i28.i: ; preds = %bb.i28.i, %cond_next36.i ; CHECK: %bb.i28.i ; CHECK: addl $2 -; CHECK: addl $2 +; CHECK: addl $-2 %j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ] ; [#uses=2] %din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ] ; [#uses=1] %tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32 ; [#uses=2] diff --git a/test/CodeGen/X86/2009-09-10-SpillComments.ll b/test/CodeGen/X86/2009-09-10-SpillComments.ll index 1dd9990e71a..f9ca861c558 100644 --- a/test/CodeGen/X86/2009-09-10-SpillComments.ll +++ b/test/CodeGen/X86/2009-09-10-SpillComments.ll @@ -1,5 +1,11 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s +; This test shouldn't require spills. + +; CHECK: subq $8, %rsp +; CHECK-NOT: $rsp +; CHECK: addq $8, %rsp + %struct..0anon = type { i32 } %struct.rtvec_def = type { i32, [1 x %struct..0anon] } %struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] } @@ -10,9 +16,6 @@ declare %struct.rtx_def* @fixup_memory_subreg(%struct.rtx_def*, %struct.rtx_def* define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) { entry: -; CHECK: Spill -; CHECK: Folded Spill -; CHECK: Reload %tmp2 = icmp eq %struct.rtx_def* %x, null ; [#uses=1] br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll index 68575bc401d..3bd58b65be4 100644 --- a/test/CodeGen/X86/full-lsr.ll +++ b/test/CodeGen/X86/full-lsr.ll @@ -1,6 +1,12 @@ -; RUN: llc < %s -march=x86 -enable-full-lsr >%t -; RUN: grep {addl \\\$4,} %t | count 3 -; RUN: not grep {,%} %t +; RUN: llc < %s -march=x86 >%t + +; TODO: Enhance full lsr mode to get this: +; RUNX: grep {addl \\\$4,} %t | count 3 +; RUNX: not grep {,%} %t + +; For now, it should find this, which is still pretty good: +; RUN: not grep {addl \\\$4,} %t +; RUN: grep {,%} %t | count 6 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind { entry: diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll index c695c29e068..0410bc0d9a9 100644 --- a/test/CodeGen/X86/iv-users-in-other-loops.ll +++ b/test/CodeGen/X86/iv-users-in-other-loops.ll @@ -1,11 +1,11 @@ ; RUN: llc < %s -march=x86-64 -o %t -; RUN: grep inc %t | count 1 +; RUN: not grep inc %t ; RUN: grep dec %t | count 2 -; RUN: grep addq %t | count 13 +; RUN: grep addq %t | count 10 ; RUN: not grep addb %t ; RUN: grep leaq %t | count 9 -; RUN: grep leal %t | count 3 -; RUN: grep movq %t | count 5 +; RUN: grep leal %t | count 2 +; RUN: grep movq %t | count 10 ; IV users in each of the loops from other loops shouldn't cause LSR ; to insert new induction variables. Previously it would create a diff --git a/test/CodeGen/X86/loop-strength-reduce4.ll b/test/CodeGen/X86/loop-strength-reduce4.ll index 07e46eca75e..6c0eb8c0df9 100644 --- a/test/CodeGen/X86/loop-strength-reduce4.ll +++ b/test/CodeGen/X86/loop-strength-reduce4.ll @@ -1,5 +1,19 @@ -; RUN: llc < %s -march=x86 | grep cmp | grep 64 -; RUN: llc < %s -march=x86 | not grep inc +; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC +; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC + +; By starting the IV at -64 instead of 0, a cmp is eliminated, +; as the flags from the add can be used directly. + +; STATIC: movl $-64, %ecx + +; STATIC: movl %eax, _state+76(%ecx) +; STATIC: addl $16, %ecx +; STATIC: jne + +; In PIC mode the symbol can't be folded, so the change-compare-stride +; trick applies. + +; PIC: cmpl $64 @state = external global [0 x i32] ; <[0 x i32]*> [#uses=4] @S = external global [0 x i32] ; <[0 x i32]*> [#uses=4] diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll index e14cd8a99e3..6b2247d1d61 100644 --- a/test/CodeGen/X86/loop-strength-reduce8.ll +++ b/test/CodeGen/X86/loop-strength-reduce8.ll @@ -1,4 +1,10 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16 +; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s + +; CHECK: leal 16(%eax), %edx +; CHECK: align +; CHECK: addl $4, %edx +; CHECK: decl %ecx +; CHECK: jne LBB1_2 %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 } %struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] } diff --git a/test/CodeGen/X86/lsr-reuse.ll b/test/CodeGen/X86/lsr-reuse.ll new file mode 100644 index 00000000000..a1919bab38a --- /dev/null +++ b/test/CodeGen/X86/lsr-reuse.ll @@ -0,0 +1,159 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s +target datalayout = "e-p:64:64:64" +target triple = "x86_64-unknown-unknown" + +; Full strength reduction reduces register pressure from 5 to 4 here. + +; CHECK: full_me: +; CHECK: movsd (%rsi), %xmm0 +; CHECK: mulsd (%rdx), %xmm0 +; CHECK: movsd %xmm0, (%rdi) +; CHECK: addq $8, %rsi +; CHECK: addq $8, %rdx +; CHECK: addq $8, %rdi +; CHECK: decq %rcx +; CHECK: jne + +define void @full_me(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { +entry: + %t0 = icmp sgt i64 %n, 0 + br i1 %t0, label %loop, label %return + +loop: + %i = phi i64 [ %i.next, %loop ], [ 0, %entry ] + %Ai = getelementptr inbounds double* %A, i64 %i + %Bi = getelementptr inbounds double* %B, i64 %i + %Ci = getelementptr inbounds double* %C, i64 %i + %t1 = load double* %Bi + %t2 = load double* %Ci + %m = fmul double %t1, %t2 + store double %m, double* %Ai + %i.next = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, %n + br i1 %exitcond, label %return, label %loop + +return: + ret void +} + +; In this test, the counting IV exit value is used, so full strength reduction +; would not reduce register pressure. IndVarSimplify ought to simplify such +; cases away, but it's useful here to verify that LSR's register pressure +; heuristics are working as expected. + +; CHECK: count_me_0: +; CHECK: movsd (%rsi,%rax,8), %xmm0 +; CHECK: mulsd (%rdx,%rax,8), %xmm0 +; CHECK: movsd %xmm0, (%rdi,%rax,8) +; CHECK: incq %rax +; CHECK: cmpq %rax, %rcx +; CHECK: jne + +define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { +entry: + %t0 = icmp sgt i64 %n, 0 + br i1 %t0, label %loop, label %return + +loop: + %i = phi i64 [ %i.next, %loop ], [ 0, %entry ] + %Ai = getelementptr inbounds double* %A, i64 %i + %Bi = getelementptr inbounds double* %B, i64 %i + %Ci = getelementptr inbounds double* %C, i64 %i + %t1 = load double* %Bi + %t2 = load double* %Ci + %m = fmul double %t1, %t2 + store double %m, double* %Ai + %i.next = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, %n + br i1 %exitcond, label %return, label %loop + +return: + %q = phi i64 [ 0, %entry ], [ %i.next, %loop ] + ret i64 %q +} + +; In this test, the trip count value is used, so full strength reduction +; would not reduce register pressure. +; (though it would reduce register pressure inside the loop...) + +; CHECK: count_me_1: +; CHECK: movsd (%rsi,%rax,8), %xmm0 +; CHECK: mulsd (%rdx,%rax,8), %xmm0 +; CHECK: movsd %xmm0, (%rdi,%rax,8) +; CHECK: incq %rax +; CHECK: cmpq %rax, %rcx +; CHECK: jne + +define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind { +entry: + %t0 = icmp sgt i64 %n, 0 + br i1 %t0, label %loop, label %return + +loop: + %i = phi i64 [ %i.next, %loop ], [ 0, %entry ] + %Ai = getelementptr inbounds double* %A, i64 %i + %Bi = getelementptr inbounds double* %B, i64 %i + %Ci = getelementptr inbounds double* %C, i64 %i + %t1 = load double* %Bi + %t2 = load double* %Ci + %m = fmul double %t1, %t2 + store double %m, double* %Ai + %i.next = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, %n + br i1 %exitcond, label %return, label %loop + +return: + %q = phi i64 [ 0, %entry ], [ %n, %loop ] + ret i64 %q +} + +; This should be fully strength-reduced to reduce register pressure, however +; the current heuristics get distracted by all the reuse with the stride-1 +; induction variable first. + +; But even so, be clever and start the stride-1 variable at a non-zero value +; to eliminate an in-loop immediate value. + +; CHECK: count_me_2: +; CHECK: movl $5, %eax +; CHECK: align +; CHECK: BB4_1: +; CHECK: movsd (%rdi,%rax,8), %xmm0 +; CHECK: addsd (%rsi,%rax,8), %xmm0 +; CHECK: movsd %xmm0, (%rdx,%rax,8) +; CHECK: movsd 40(%rdi,%rax,8), %xmm0 +; CHECK: addsd 40(%rsi,%rax,8), %xmm0 +; CHECK: movsd %xmm0, 40(%rdx,%rax,8) +; CHECK: incq %rax +; CHECK: cmpq $5005, %rax +; CHECK: jne + +define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %i5 = add i64 %i, 5 + %Ai = getelementptr double* %A, i64 %i5 + %t2 = load double* %Ai + %Bi = getelementptr double* %B, i64 %i5 + %t4 = load double* %Bi + %t5 = fadd double %t2, %t4 + %Ci = getelementptr double* %C, i64 %i5 + store double %t5, double* %Ci + %i10 = add i64 %i, 10 + %Ai10 = getelementptr double* %A, i64 %i10 + %t9 = load double* %Ai10 + %Bi10 = getelementptr double* %B, i64 %i10 + %t11 = load double* %Bi10 + %t12 = fadd double %t9, %t11 + %Ci10 = getelementptr double* %C, i64 %i10 + store double %t12, double* %Ci10 + %i.next = add i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 5000 + br i1 %exitcond, label %return, label %loop + +return: + ret void +} diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll index bc493bd8f72..7111d687ed4 100644 --- a/test/CodeGen/X86/masked-iv-safe.ll +++ b/test/CodeGen/X86/masked-iv-safe.ll @@ -4,9 +4,9 @@ ; RUN: not grep sar %t ; RUN: not grep shl %t ; RUN: grep add %t | count 2 -; RUN: grep inc %t | count 4 +; RUN: grep inc %t | count 3 ; RUN: grep dec %t | count 2 -; RUN: grep lea %t | count 2 +; RUN: grep lea %t | count 3 ; Optimize away zext-inreg and sext-inreg on the loop induction ; variable using trip-count information. @@ -127,6 +127,9 @@ return: ret void } +; TODO: If we could handle all the loads and stores as post-inc users, we could +; use {-1,+,1} in the induction variable register, and we'd get another inc, +; one fewer add, and a comparison with zero. define void @another_count_up(double* %d, i64 %n) nounwind { entry: br label %loop diff --git a/test/CodeGen/X86/pr3495-2.ll b/test/CodeGen/X86/pr3495-2.ll index 1372a1522bd..71aa5a04881 100644 --- a/test/CodeGen/X86/pr3495-2.ll +++ b/test/CodeGen/X86/pr3495-2.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -march=x86 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of reloads omited} +target datalayout = "e-p:32:32:32" target triple = "i386-apple-darwin9.6" %struct.constraintVCGType = type { i32, i32, i32, i32 } %struct.nodeVCGType = type { %struct.constraintVCGType*, i32, i32, i32, %struct.constraintVCGType*, i32, i32, i32 } diff --git a/test/CodeGen/X86/remat-mov-0.ll b/test/CodeGen/X86/remat-mov-0.ll index c4f768ca529..5fb445c9357 100644 --- a/test/CodeGen/X86/remat-mov-0.ll +++ b/test/CodeGen/X86/remat-mov-0.ll @@ -1,13 +1,33 @@ -; RUN: llc < %s -march=x86-64 | grep {xorl %edi, %edi} | count 4 +; RUN: llc < %s -march=x86-64 | FileCheck %s ; CodeGen should remat the zero instead of spilling it. declare void @foo(i64 %p) +; CHECK: bar: +; CHECK: xorl %edi, %edi +; CHECK: xorl %edi, %edi define void @bar() nounwind { call void @foo(i64 0) call void @foo(i64 0) - call void @foo(i64 0) - call void @foo(i64 0) ret void } + +; CHECK: bat: +; CHECK: movq $-1, %rdi +; CHECK: movq $-1, %rdi +define void @bat() nounwind { + call void @foo(i64 -1) + call void @foo(i64 -1) + ret void +} + +; CHECK: bau: +; CHECK: movl $1, %edi +; CHECK: movl $1, %edi +define void @bau() nounwind { + call void @foo(i64 1) + call void @foo(i64 1) + ret void +} + diff --git a/test/CodeGen/X86/remat-mov-1.ll b/test/CodeGen/X86/remat-mov-1.ll deleted file mode 100644 index d71b7a5b910..00000000000 --- a/test/CodeGen/X86/remat-mov-1.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc < %s -march=x86 | grep -- -1 | grep mov | count 2 - - %struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } - %struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* } - %struct._CompT = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, i8, %struct._PixT*, %struct._CompT*, i8, %struct._CompT* } - %struct._PixT = type { i32, i32, %struct._PixT* } - %struct.__sFILEX = type opaque - %struct.__sbuf = type { i8*, i32 } - -declare fastcc void @MergeComponents(%struct._CompT*, %struct._CompT*, %struct._CompT*, %struct._CompT**, %struct.ImgT*) nounwind - -define fastcc void @MergeToLeft(%struct._CompT* %comp, %struct._CompT** %head, %struct.ImgT* %img) nounwind { -entry: - br label %bb208 - -bb105: ; preds = %bb200 - br i1 false, label %bb197, label %bb149 - -bb149: ; preds = %bb105 - %tmp151 = getelementptr %struct._CompT* %comp, i32 0, i32 0 ; [#uses=1] - br label %bb193 - -bb193: ; preds = %bb184, %bb149 - %tmp196 = load i32* %tmp151, align 4 ; [#uses=1] - br label %bb197 - -bb197: ; preds = %bb193, %bb105 - %last_comp.0 = phi i32 [ %tmp196, %bb193 ], [ 0, %bb105 ] ; [#uses=0] - %indvar.next = add i32 %indvar, 1 ; [#uses=1] - br label %bb200 - -bb200: ; preds = %bb208, %bb197 - %indvar = phi i32 [ 0, %bb208 ], [ %indvar.next, %bb197 ] ; [#uses=2] - %xm.0 = sub i32 %indvar, 0 ; [#uses=1] - %tmp202 = icmp slt i32 %xm.0, 1 ; [#uses=1] - br i1 %tmp202, label %bb105, label %bb208 - -bb208: ; preds = %bb200, %entry - br label %bb200 -} diff --git a/test/CodeGen/X86/subreg-to-reg-5.ll b/test/CodeGen/X86/subreg-to-reg-5.ll deleted file mode 100644 index ba4c307d109..00000000000 --- a/test/CodeGen/X86/subreg-to-reg-5.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=x86-64 > %t -; RUN: grep addl %t -; RUN: not egrep {movl|movq} %t - -define float @foo(float* %B) nounwind { -entry: - br label %bb2 - -bb2: ; preds = %bb3, %entry - %B_addr.0.rec = phi i64 [ %indvar.next154, %bb3 ], [ 0, %entry ] ; [#uses=2] - %z = icmp slt i64 %B_addr.0.rec, 20000 - br i1 %z, label %bb3, label %bb4 - -bb3: ; preds = %bb2 - %indvar.next154 = add i64 %B_addr.0.rec, 1 ; [#uses=1] - br label %bb2 - -bb4: ; preds = %bb2 - %B_addr.0 = getelementptr float* %B, i64 %B_addr.0.rec ; [#uses=1] - %t1 = ptrtoint float* %B_addr.0 to i64 ; [#uses=1] - %t2 = and i64 %t1, 4294967295 ; [#uses=1] - %t3 = icmp eq i64 %t2, 0 ; [#uses=1] - br i1 %t3, label %bb5, label %bb10.preheader - -bb10.preheader: ; preds = %bb4 - br label %bb9 - -bb5: ; preds = %bb4 - ret float 7.0 - -bb9: ; preds = %bb10.preheader - %t5 = getelementptr float* %B, i64 0 ; [#uses=1] - %t7 = load float* %t5 ; [#uses=1] - ret float %t7 -} diff --git a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll b/test/Transforms/IndVarSimplify/gep-with-mul-base.ll index 78095940763..19d54ff2a22 100644 --- a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll +++ b/test/Transforms/IndVarSimplify/gep-with-mul-base.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -indvars -S > %t -; RUN: grep add %t | count 8 -; RUN: grep mul %t | count 7 +; RUN: grep add %t | count 6 +; RUN: grep sub %t | count 2 +; RUN: grep mul %t | count 6 define void @foo(i64 %n, i64 %m, i64 %o, double* nocapture %p) nounwind { entry: diff --git a/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll b/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll index 7c7a21c013f..99cb8569b3f 100644 --- a/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll +++ b/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-reduce -S | grep ugt -; PR2535 +; RUN: llc -march=x86-64 < %s -o - | grep {cmpl \\$\[1\], %} @.str = internal constant [4 x i8] c"%d\0A\00" @@ -16,7 +15,7 @@ forbody: %add166 = or i32 %mul15, 1 ; [#uses=1] * call i32 (i8*, ...)* @printf( i8* noalias getelementptr ([4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind %inc = add i32 %i.0, 1 ; [#uses=3] - %cmp = icmp ult i32 %inc, 1027 ; [#uses=1] + %cmp = icmp ne i32 %inc, 1027 ; [#uses=1] br i1 %cmp, label %forbody, label %afterfor afterfor: ; preds = %forcond diff --git a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll index 36941ad6d36..d9abc8ba665 100644 --- a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll +++ b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll @@ -1,10 +1,9 @@ -; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpl \$4} +; RUN: llc < %s -o - | grep {testl %ecx, %ecx} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin9" -; This is like change-compare-stride-trickiness-1.ll except the comparison -; happens before the relevant use, so the comparison stride can't be -; easily changed. +; The comparison happens before the relevant use, but it can still be rewritten +; to compare with zero. define void @foo() nounwind { entry: diff --git a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll index 8a3978bb2ee..ea8a259ecd8 100644 --- a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll +++ b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll @@ -1,10 +1,10 @@ -; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpq \$8} +; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp. \$8} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin9" -; This is like change-compare-stride-trickiness-0.ll except the comparison -; happens after the relevant use, so the comparison stride can be -; easily changed. +; The comparison happens after the relevant use, so the stride can easily +; be changed. The comparison can be done in a narrower mode than the +; induction variable. define void @foo() nounwind { entry: diff --git a/test/Transforms/LoopStrengthReduce/count-to-zero.ll b/test/Transforms/LoopStrengthReduce/count-to-zero.ll index 8cc3b5c1034..feb79f8a0c7 100644 --- a/test/Transforms/LoopStrengthReduce/count-to-zero.ll +++ b/test/Transforms/LoopStrengthReduce/count-to-zero.ll @@ -19,7 +19,7 @@ bb3: ; preds = %bb1 %tmp4 = add i32 %c_addr.1, -1 ; [#uses=1] %c_addr.1.be = select i1 %tmp2, i32 %tmp3, i32 %tmp4 ; [#uses=1] %indvar.next = add i32 %indvar, 1 ; [#uses=1] -; CHECK: sub i32 %lsr.iv, 1 +; CHECK: add i32 %lsr.iv, -1 br label %bb6 bb6: ; preds = %bb3, %entry diff --git a/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll b/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll deleted file mode 100644 index 4ad5d1478d6..00000000000 --- a/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: opt < %s -loop-reduce -S | FileCheck %s - -define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp { -entry: - br i1 undef, label %bb4.preheader, label %bb.nph8 - -bb4.preheader: ; preds = %entry - br label %bb4 - -bb1: ; preds = %bb4 - br i1 undef, label %bb.nph8, label %bb3 - -bb3: ; preds = %bb1 - %phitmp = add i32 %indvar, 1 ; [#uses=1] - br label %bb4 - -bb4: ; preds = %bb3, %bb4.preheader -; CHECK: %lsr.iv = phi -; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1 -; CHECK: %0 = icmp slt i32 %lsr.iv.next, %argc - %indvar = phi i32 [ 1, %bb4.preheader ], [ %phitmp, %bb3 ] ; [#uses=2] - %0 = icmp slt i32 %indvar, %argc ; [#uses=1] - br i1 %0, label %bb1, label %bb.nph8 - -bb.nph8: ; preds = %bb4, %bb1, %entry - unreachable -} -- 2.34.1