X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FLoopStrengthReduce.cpp;h=689ea20868965e47b889fe23528c75ab7287cb6f;hb=c6a669b6e7fb13669abefa62c6ac4f871b84bb0b;hp=73d3f9db8965402347df6678473aff42b38b0a92;hpb=ae086250853cb901d59943bd98aa9a0ad9a85b4d;p=oota-llvm.git

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 73d3f9db896..689ea208689 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -337,14 +337,42 @@ void Formula::dump() const {
   print(errs()); errs() << '\n';
 }
 
-/// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
-/// or null otherwise. If IgnoreSignificantBits is true, expressions like
-/// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may
-/// overflow, which is useful when the result will be used in a context where
-/// the most significant bits are ignored.
-static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
-                           ScalarEvolution &SE,
-                           bool IgnoreSignificantBits = false) {
+/// isAddRecSExtable - Return true if the given addrec can be sign-extended
+/// without changing its value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(AR->getType()) + 1);
+  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// isAddSExtable - Return true if the given add can be sign-extended
+/// without changing its value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// isMulSExtable - Return true if the given mul can be sign-extended
+/// without changing its value.
+static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
+/// and if the remainder is known to be zero, or null otherwise. If
+/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
+/// to Y, ignoring that the multiplication may overflow, which is useful when
+/// the result will be used in a context where the most significant bits are
+/// ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE,
+                                bool IgnoreSignificantBits = false) {
   // Handle the trivial case, which works for any SCEV type.
   if (LHS == RHS)
     return SE.getIntegerSCEV(1, LHS->getType());
@@ -365,39 +393,44 @@ static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
                            .sdiv(RC->getValue()->getValue()));
   }
 
-  // Distribute the sdiv over addrec operands.
+  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
-    const SCEV *Start = getSDiv(AR->getStart(), RHS, SE,
-                                IgnoreSignificantBits);
-    if (!Start) return 0;
-    const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE,
-                               IgnoreSignificantBits);
-    if (!Step) return 0;
-    return SE.getAddRecExpr(Start, Step, AR->getLoop());
+    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return 0;
+      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+                                      IgnoreSignificantBits);
+      if (!Step) return 0;
+      return SE.getAddRecExpr(Start, Step, AR->getLoop());
+    }
   }
 
-  // Distribute the sdiv over add operands.
+  // Distribute the sdiv over add operands, if the add doesn't overflow.
   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
-    SmallVector<const SCEV *, 8> Ops;
-    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
-         I != E; ++I) {
-      const SCEV *Op = getSDiv(*I, RHS, SE,
-                               IgnoreSignificantBits);
-      if (!Op) return 0;
-      Ops.push_back(Op);
+    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+      SmallVector<const SCEV *, 8> Ops;
+      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+           I != E; ++I) {
+        const SCEV *Op = getExactSDiv(*I, RHS, SE,
+                                      IgnoreSignificantBits);
+        if (!Op) return 0;
+        Ops.push_back(Op);
+      }
+      return SE.getAddExpr(Ops);
     }
-    return SE.getAddExpr(Ops);
   }
 
   // Check for a multiply operand that we can pull RHS out of.
   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
-    if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) {
+    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
       SmallVector<const SCEV *, 4> Ops;
       bool Found = false;
       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
            I != E; ++I) {
         if (!Found)
-          if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) {
+          if (const SCEV *Q = getExactSDiv(*I, RHS, SE,
+                                           IgnoreSignificantBits)) {
             Ops.push_back(Q);
             Found = true;
             continue;
@@ -579,6 +612,10 @@ private:
                     SmallPtrSet<const SCEV *, 16> &Regs,
                     const Loop *L,
                     ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg,
+                           SmallPtrSet<const SCEV *, 16> &Regs,
+                           const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT);
 };
 
 }
@@ -588,49 +625,59 @@ void Cost::RateRegister(const SCEV *Reg,
                         SmallPtrSet<const SCEV *, 16> &Regs,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT) {
-  if (Regs.insert(Reg)) {
-    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
-      if (AR->getLoop() == L)
-        AddRecCost += 1; /// TODO: This should be a function of the stride.
-
-      // If this is an addrec for a loop that's already been visited by LSR,
-      // don't second-guess its addrec phi nodes. LSR isn't currently smart
-      // enough to reason about more than one loop at a time. Consider these
-      // registers free and leave them alone.
-      else if (L->contains(AR->getLoop()) ||
-               (!AR->getLoop()->contains(L) &&
-                DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
-        for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
-             PHINode *PN = dyn_cast<PHINode>(I); ++I)
-          if (SE.isSCEVable(PN->getType()) &&
-              (SE.getEffectiveSCEVType(PN->getType()) ==
-               SE.getEffectiveSCEVType(AR->getType())) &&
-              SE.getSCEV(PN) == AR)
-            goto no_cost;
-
-        // If this isn't one of the addrecs that the loop already has, it
-        // would require a costly new phi and add.
-        ++NumBaseAdds;
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    if (AR->getLoop() == L)
+      AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // If this is an addrec for a loop that's already been visited by LSR,
+    // don't second-guess its addrec phi nodes. LSR isn't currently smart
+    // enough to reason about more than one loop at a time. Consider these
+    // registers free and leave them alone.
+    else if (L->contains(AR->getLoop()) ||
+             (!AR->getLoop()->contains(L) &&
+              DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
+      for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+           PHINode *PN = dyn_cast<PHINode>(I); ++I)
+        if (SE.isSCEVable(PN->getType()) &&
+            (SE.getEffectiveSCEVType(PN->getType()) ==
+             SE.getEffectiveSCEVType(AR->getType())) &&
+            SE.getSCEV(PN) == AR)
+          return;
+
+      // If this isn't one of the addrecs that the loop already has, it
+      // would require a costly new phi and add. TODO: This isn't
+      // precisely modeled right now.
+      ++NumBaseAdds;
+      if (!Regs.count(AR->getStart()))
         RateRegister(AR->getStart(), Regs, L, SE, DT);
-      }
-
-      // Add the step value register, if it needs one.
-      // TODO: The non-affine case isn't precisely modeled here.
-      if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
     }
-    ++NumRegs;
-    // Rough heuristic; favor registers which don't require extra setup
-    // instructions in the preheader.
-    if (!isa<SCEVUnknown>(Reg) &&
-        !isa<SCEVConstant>(Reg) &&
-        !(isa<SCEVAddRecExpr>(Reg) &&
-          (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-           isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-      ++SetupCost;
-  no_cost:;
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
+      if (!Regs.count(AR->getStart()))
+        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
   }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) &&
+      !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+}
+
+/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
+/// before, rate it.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+                               SmallPtrSet<const SCEV *, 16> &Regs,
+                               const Loop *L,
+                               ScalarEvolution &SE, DominatorTree &DT) {
+  if (Regs.insert(Reg))
+    RateRegister(Reg, Regs, L, SE, DT);
 }
 
 void Cost::RateFormula(const Formula &F,
@@ -645,7 +692,7 @@ void Cost::RateFormula(const Formula &F,
       Loose();
       return;
     }
-    RateRegister(ScaledReg, Regs, L, SE, DT);
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
   }
   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
        E = F.BaseRegs.end(); I != E; ++I) {
@@ -654,7 +701,7 @@ void Cost::RateFormula(const Formula &F,
       Loose();
       return;
     }
-    RateRegister(BaseReg, Regs, L, SE, DT);
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
 
     NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
                  BaseReg->hasComputableLoopEvolution(L);
@@ -865,7 +912,7 @@ public:
                       MaxOffset(INT64_MIN),
                       AllFixupsOutsideLoop(true) {}
 
-  bool InsertFormula(size_t LUIdx, const Formula &F);
+  bool InsertFormula(const Formula &F);
 
   void check() const;
 
@@ -875,7 +922,7 @@ public:
 
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
-bool LSRUse::InsertFormula(size_t LUIdx, const Formula &F) {
+bool LSRUse::InsertFormula(const Formula &F) {
   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   // Unstable sort by host order ok, because this is only used for uniquifying.
@@ -911,7 +958,7 @@ void LSRUse::print(raw_ostream &OS) const {
   case ICmpZero: OS << "ICmpZero"; break;
   case Address:
     OS << "Address of ";
-    if (isa<PointerType>(AccessTy))
+    if (AccessTy->isPointerTy())
       OS << "pointer"; // the full pointer type could be really verbose
     else
       OS << *AccessTy;
@@ -1010,8 +1057,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
                              GlobalValue *BaseGV,
                              bool HasBaseReg,
                              LSRUse::KindType Kind, const Type *AccessTy,
-                             const TargetLowering *TLI,
-                             ScalarEvolution &SE) {
+                             const TargetLowering *TLI) {
   // Fast-path: zero is always foldable.
   if (BaseOffs == 0 && !BaseGV) return true;
 
@@ -1139,7 +1185,7 @@ class LSRInstance {
                           const Type *AccessTy);
 
 public:
-  void InsertInitialFormula(const SCEV *S, Loop *L, LSRUse &LU, size_t LUIdx);
+  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
@@ -1170,16 +1216,18 @@ public:
   Value *Expand(const LSRFixup &LF,
                 const Formula &F,
-                BasicBlock::iterator IP, Loop *L, Instruction *IVIncInsertPos,
+                BasicBlock::iterator IP,
                 SCEVExpander &Rewriter,
-                SmallVectorImpl<WeakVH> &DeadInsts,
-                ScalarEvolution &SE, DominatorTree &DT) const;
+                SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+                     const Formula &F,
+                     SCEVExpander &Rewriter,
+                     SmallVectorImpl<WeakVH> &DeadInsts,
+                     Pass *P) const;
   void Rewrite(const LSRFixup &LF,
                const Formula &F,
-               Loop *L, Instruction *IVIncInsertPos,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
-               ScalarEvolution &SE, DominatorTree &DT,
               Pass *P) const;
 
   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                          Pass *P);
@@ -1508,7 +1556,7 @@ LSRInstance::OptimizeLoopTermCond() {
               A = SE.getSignExtendExpr(A, B->getType());
           }
           if (const SCEVConstant *D =
-                dyn_cast_or_null<SCEVConstant>(getSDiv(B, A, SE))) {
+                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
             // Stride of one or negative one can have reuse with non-addresses.
             if (D->getValue()->isOne() || D->getValue()->isAllOnesValue())
               goto decline_post_inc;
@@ -1517,6 +1565,10 @@ LSRInstance::OptimizeLoopTermCond() {
             if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
                 D->getValue()->getValue().isMinSignedValue())
              goto decline_post_inc;
+            // Without TLI, assume that any stride might be valid, and so any
+            // use might be shared.
+            if (!TLI)
+              goto decline_post_inc;
             // Check for possible scaled-address reuse.
             const Type *AccessTy = getAccessType(UI->getUser());
             TargetLowering::AddrMode AM;
@@ -1597,12 +1649,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
   // Conservatively assume HasBaseReg is true for now.
   if (NewOffset < LU.MinOffset) {
     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
-                          Kind, AccessTy, TLI, SE))
+                          Kind, AccessTy, TLI))
       return false;
     NewMinOffset = NewOffset;
   } else if (NewOffset > LU.MaxOffset) {
     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
-                          Kind, AccessTy, TLI, SE))
+                          Kind, AccessTy, TLI))
       return false;
    NewMaxOffset = NewOffset;
   }
@@ -1629,8 +1681,7 @@ LSRInstance::getUse(const SCEV *&Expr,
   int64_t Offset = ExtractImmediate(Expr, SE);
 
   // Basic uses can't accept any offset, for example.
-  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true,
-                        Kind, AccessTy, TLI, SE)) {
+  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
     Expr = Copy;
     Offset = 0;
   }
@@ -1665,21 +1716,29 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
   SmallSetVector<const SCEV *, 4> Strides;
 
-  // Collect interesting types and factors.
+  // Collect interesting types and strides.
   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
     const SCEV *Stride = UI->getStride();
 
     // Collect interesting types.
     Types.insert(SE.getEffectiveSCEVType(Stride->getType()));
 
-    // Collect interesting factors.
+    // Add the stride for this loop.
+    Strides.insert(Stride);
+
+    // Add strides for other mentioned loops.
+    for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset());
+         AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart()))
+      Strides.insert(AR->getStepRecurrence(SE));
+  }
+
+  // Compute interesting factors from the set of interesting strides.
+  for (SmallSetVector<const SCEV *, 4>::const_iterator
+       I = Strides.begin(), E = Strides.end(); I != E; ++I)
     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
-         Strides.begin(), SEnd = Strides.end(); NewStrideIter != SEnd;
-         ++NewStrideIter) {
-      const SCEV *OldStride = Stride;
+         next(I); NewStrideIter != E; ++NewStrideIter) {
+      const SCEV *OldStride = *I;
       const SCEV *NewStride = *NewStrideIter;
-      if (OldStride == NewStride)
-        continue;
 
       if (SE.getTypeSizeInBits(OldStride->getType()) !=
           SE.getTypeSizeInBits(NewStride->getType())) {
@@ -1690,19 +1749,18 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
       }
       if (const SCEVConstant *Factor =
-            dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
-                                                   SE, true))) {
+            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+                                                        SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
       } else if (const SCEVConstant *Factor =
-                   dyn_cast_or_null<SCEVConstant>(getSDiv(OldStride, NewStride,
-                                                          SE, true))) {
+                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+                                                               NewStride,
+                                                               SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
       }
     }
-    Strides.insert(Stride);
-  }
 
   // If all uses use the same type, don't bother looking for truncation-based
   // reuse.
@@ -1770,7 +1828,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 
     // If this is the first use of this LSRUse, give it a formula.
     if (LU.Formulae.empty()) {
-      InsertInitialFormula(S, L, LU, LF.LUIdx);
+      InsertInitialFormula(S, LU, LF.LUIdx);
       CountRegisters(LU.Formulae.back(), LF.LUIdx);
     }
   }
@@ -1779,8 +1837,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 }
 
 void
-LSRInstance::InsertInitialFormula(const SCEV *S, Loop *L,
-                                  LSRUse &LU, size_t LUIdx) {
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
   Formula F;
   F.InitialMatch(S, L, SE, DT);
   bool Inserted = InsertFormula(LU, LUIdx, F);
@@ -1810,7 +1867,7 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
 
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
-  if (!LU.InsertFormula(LUIdx, F))
+  if (!LU.InsertFormula(F))
     return false;
 
   CountRegisters(F, LUIdx);
@@ -1902,10 +1959,10 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     // Split a non-zero base out of an addrec.
     if (!AR->getStart()->isZero()) {
-      CollectSubexprs(AR->getStart(), C, Ops, SE);
       CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
                                        AR->getStepRecurrence(SE),
                                        AR->getLoop()), C, Ops, SE);
+      CollectSubexprs(AR->getStart(), C, Ops, SE);
       return;
     }
   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
@@ -1977,7 +2034,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
 /// GenerateCombinations - Generate a formula consisting of all of the
 /// loop-dominating registers added into a single register.
 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
-                                        Formula Base) {
+                                       Formula Base) {
   // This method is only interesting on a plurality of registers.
   if (Base.BaseRegs.size() <= 1) return;
 
@@ -1994,8 +2051,14 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
       F.BaseRegs.push_back(BaseReg);
   }
   if (Ops.size() > 1) {
-    F.BaseRegs.push_back(SE.getAddExpr(Ops));
-    (void)InsertFormula(LU, LUIdx, F);
+    const SCEV *Sum = SE.getAddExpr(Ops);
+    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+    // opportunity to fold something. For now, just ignore such cases
+    // rather than proceed with zero in a register.
+    if (!Sum->isZero()) {
+      F.BaseRegs.push_back(Sum);
+      (void)InsertFormula(LU, LUIdx, F);
+    }
   }
 }
 
@@ -2081,14 +2144,18 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     Formula F = Base;
 
     // Check that the multiplication doesn't overflow.
+    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
+      continue;
     F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
-    if ((int64_t)F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
+    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
      continue;
 
     // Check that multiplying with the use offset doesn't overflow.
     int64_t Offset = LU.MinOffset;
+    if (Offset == INT64_MIN && Factor == -1)
+      continue;
     Offset = (uint64_t)Offset * Factor;
-    if ((int64_t)Offset / Factor != LU.MinOffset)
+    if (Offset / Factor != LU.MinOffset)
      continue;
 
     // Check that this scale is legal.
@@ -2103,14 +2170,14 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     // Check that multiplying with each base register doesn't overflow.
     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
-      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
         goto next;
     }
 
     // Check that multiplying with the scaled register doesn't overflow.
     if (F.ScaledReg) {
       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
-      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
         continue;
     }
 
@@ -2165,7 +2232,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
         continue;
       // Divide out the factor, ignoring high bits, since we'll be
       // scaling the value back up in the end.
-      if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
+      if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
         // TODO: This could be optimized to avoid all the copying.
         Formula F = Base;
         F.ScaledReg = Quotient;
@@ -2214,7 +2281,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
 
 namespace {
 
-/// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
+/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
 /// defer modifications so that the search phase doesn't have to worry about
 /// the data structures moving underneath it.
 struct WorkItem {
@@ -2271,6 +2338,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
     const SCEV *Reg = *I;
     const ImmMapTy &Imms = Map.find(Reg)->second;
 
+    // It's not worthwhile looking for reuse if there's only one offset.
+    if (Imms.size() == 1)
+      continue;
+
     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
                J != JE; ++J)
@@ -2299,7 +2370,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       };
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
-        if (M == J) continue;
+        if (M == J || M == JE) continue;
 
         // Compute the difference between the two.
         int64_t Imm = (uint64_t)JImm - M->first;
@@ -2399,7 +2470,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
 /// GenerateAllReuseFormulae - Generate formulae for each use.
 void
 LSRInstance::GenerateAllReuseFormulae() {
-  // This is split into two loops so that hasRegsUsedByUsesOtherThan
+  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
   // queries are more precise.
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
@@ -2418,6 +2489,9 @@ LSRInstance::GenerateAllReuseFormulae() {
       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateScales(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
@@ -2489,7 +2563,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   }
 
   DEBUG(if (Changed) {
-          dbgs() << "After filtering out undesirable candidates:\n";
+          dbgs() << "\n"
+                    "After filtering out undesirable candidates:\n";
           print_uses(dbgs());
         });
 }
@@ -2606,13 +2681,13 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   SmallSetVector<const SCEV *, 4> ReqRegs;
   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
        E = CurRegs.end(); I != E; ++I)
-    if (LU.Regs.count(*I)) {
+    if (LU.Regs.count(*I))
       ReqRegs.insert(*I);
-      break;
-    }
 
+  bool AnySatisfiedReqRegs = false;
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost;
+retry:
   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
        E = LU.Formulae.end(); I != E; ++I) {
     const Formula &F = *I;
@@ -2626,6 +2701,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
             F.BaseRegs.end())
           goto skip;
     }
+    AnySatisfiedReqRegs = true;
 
     // Evaluate the cost of the current formula. If it's already worse than
     // the current best, prune the search at that point.
@@ -2654,6 +2730,13 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     }
   skip:;
   }
+
+  // If none of the formulae had all of the required registers, relax the
+  // constraint so that we don't exclude all formulae.
+  if (!AnySatisfiedReqRegs) {
+    ReqRegs.clear();
+    goto retry;
+  }
 }
 
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
@@ -2696,10 +2779,8 @@ static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
 Value *LSRInstance::Expand(const LSRFixup &LF,
                            const Formula &F,
                            BasicBlock::iterator IP,
-                           Loop *L, Instruction *IVIncInsertPos,
                            SCEVExpander &Rewriter,
-                           SmallVectorImpl<WeakVH> &DeadInsts,
-                           ScalarEvolution &SE, DominatorTree &DT) const {
+                           SmallVectorImpl<WeakVH> &DeadInsts) const {
   const LSRUse &LU = Uses[LF.LUIdx];
 
   // Then, collect some instructions which we will remain dominated by when
@@ -2776,8 +2857,10 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
       if (AR->getLoop() == LF.PostIncLoop) {
         Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
         // If the user is inside the loop, insert the code after the increment
-        // so that it is dominated by its operand.
+        // so that it is dominated by its operand. If the original insert point
+        // was already dominated by the increment, keep it, because there may
+        // be loop-variant operands that need to be respected also.
-        if (L->contains(LF.UserInst))
+        if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP))
           IP = IVIncInsertPos;
         break;
       }
@@ -2881,73 +2964,81 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
   return FullV;
 }
 
+/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
+/// of their operands effectively happens in their predecessor blocks, so the
+/// expression may need to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRFixup &LF,
+                                const Formula &F,
+                                SCEVExpander &Rewriter,
+                                SmallVectorImpl<WeakVH> &DeadInsts,
+                                Pass *P) const {
+  DenseMap<BasicBlock *, Value *> Inserted;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      BasicBlock *BB = PN->getIncomingBlock(i);
+
+      // If this is a critical edge, split the edge so that we do not insert
+      // the code on all predecessor/successor paths. We do this unless this
+      // is the canonical backedge for this loop, which complicates post-inc
+      // users.
+      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+          !isa<IndirectBrInst>(BB->getTerminator()) &&
+          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
+        // Split the critical edge.
+        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
+
+        // If PN is outside of the loop and BB is in the loop, we want to
+        // move the block to be immediately before the PHI block, not
+        // immediately after BB.
+        if (L->contains(BB) && !L->contains(PN))
+          NewBB->moveBefore(PN->getParent());
+
+        // Splitting the edge can reduce the number of PHI entries we have.
+        e = PN->getNumIncomingValues();
+        BB = NewBB;
+        i = PN->getBasicBlockIndex(BB);
+      }
+
+      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
+      if (!Pair.second)
+        PN->setIncomingValue(i, Pair.first->second);
+      else {
+        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
+
+        // If this is reuse-by-noop-cast, insert the noop cast.
+        const Type *OpTy = LF.OperandValToReplace->getType();
+        if (FullV->getType() != OpTy)
+          FullV =
+            CastInst::Create(CastInst::getCastOpcode(FullV, false,
+                                                     OpTy, false),
+                             FullV, LF.OperandValToReplace->getType(),
+                             "tmp", BB->getTerminator());
+
+        PN->setIncomingValue(i, FullV);
+        Pair.first->second = FullV;
+      }
+    }
+}
+
 /// Rewrite - Emit instructions for the leading candidate expression for this
 /// LSRUse (this is called "expanding"), and update the UserInst to reference
 /// the newly expanded value.
 void LSRInstance::Rewrite(const LSRFixup &LF,
                           const Formula &F,
-                          Loop *L, Instruction *IVIncInsertPos,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts,
-                          ScalarEvolution &SE, DominatorTree &DT,
                           Pass *P) const {
-  const Type *OpTy = LF.OperandValToReplace->getType();
-
   // First, find an insertion point that dominates UserInst. For PHI nodes,
   // find the nearest block which dominates all the relevant uses.
   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
-    DenseMap<BasicBlock *, Value *> Inserted;
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
-        BasicBlock *BB = PN->getIncomingBlock(i);
-
-        // If this is a critical edge, split the edge so that we do not insert
-        // the code on all predecessor/successor paths. We do this unless this
-        // is the canonical backedge for this loop, which complicates post-inc
-        // users.
-        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
-            !isa<IndirectBrInst>(BB->getTerminator()) &&
-            (PN->getParent() != L->getHeader() || !L->contains(BB))) {
-          // Split the critical edge.
-          BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
-
-          // If PN is outside of the loop and BB is in the loop, we want to
-          // move the block to be immediately before the PHI block, not
-          // immediately after BB.
-          if (L->contains(BB) && !L->contains(PN))
-            NewBB->moveBefore(PN->getParent());
-
-          // Splitting the edge can reduce the number of PHI entries we have.
-          e = PN->getNumIncomingValues();
-          BB = NewBB;
-          i = PN->getBasicBlockIndex(BB);
-        }
-
-        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
-          Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
-        if (!Pair.second)
-          PN->setIncomingValue(i, Pair.first->second);
-        else {
-          Value *FullV = Expand(LF, F, BB->getTerminator(), L, IVIncInsertPos,
-                                Rewriter, DeadInsts, SE, DT);
-
-          // If this is reuse-by-noop-cast, insert the noop cast.
-          if (FullV->getType() != OpTy)
-            FullV =
-              CastInst::Create(CastInst::getCastOpcode(FullV, false,
-                                                       OpTy, false),
-                               FullV, LF.OperandValToReplace->getType(),
-                               "tmp", BB->getTerminator());
-
-          PN->setIncomingValue(i, FullV);
-          Pair.first->second = FullV;
-        }
-      }
+    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
   } else {
-    Value *FullV = Expand(LF, F, LF.UserInst, L, IVIncInsertPos,
-                          Rewriter, DeadInsts, SE, DT);
+    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
 
     // If this is reuse-by-noop-cast, insert the noop cast.
+    const Type *OpTy = LF.OperandValToReplace->getType();
     if (FullV->getType() != OpTy) {
       Instruction *Cast =
         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
@@ -2984,8 +3075,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
 
   for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
     size_t LUIdx = Fixups[i].LUIdx;
-    Rewrite(Fixups[i], *Solution[LUIdx], L, IVIncInsertPos, Rewriter,
-            DeadInsts, SE, DT, P);
+    Rewrite(Fixups[i], *Solution[LUIdx], Rewriter, DeadInsts, P);
     Changed = true;
   }
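
Note (not part of the patch): the INT64_MIN / Factor == -1 guards added in GenerateICmpZeroScales above follow a common pattern for detecting signed multiplication overflow without invoking undefined behavior: handle the one case that division cannot verify (INT64_MIN * -1) up front, do the multiply in unsigned arithmetic, and then check that dividing the product by the factor recovers the original value. Below is a minimal standalone sketch of that check; mulOverflows and the example values are illustrative only and are not LLVM APIs.

#include <cstdint>
#include <cassert>

// Returns true if a * factor would overflow int64_t, using the same
// guard-then-divide-back check as the patch: INT64_MIN * -1 is the one
// case the division check cannot catch, so it is tested explicitly.
static bool mulOverflows(int64_t a, int64_t factor) {
  assert(factor != 0 && "scaling factor must be nonzero");
  if (a == INT64_MIN && factor == -1)
    return true;  // -INT64_MIN is not representable in int64_t
  // Multiply in unsigned arithmetic (wraps modulo 2^64 instead of UB),
  // then verify the result divides back to the original operand.
  int64_t Product = (int64_t)((uint64_t)a * (uint64_t)factor);
  return Product / factor != a;
}

int main() {
  assert(!mulOverflows(1000, 4));               // exact: no overflow
  assert(mulOverflows(INT64_MAX / 2 + 1, 2));   // wraps past INT64_MAX
  assert(mulOverflows(INT64_MIN, -1));          // caught by the explicit guard
  return 0;
}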