X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FLoopStrengthReduce.cpp;h=689ea20868965e47b889fe23528c75ab7287cb6f;hb=c6a669b6e7fb13669abefa62c6ac4f871b84bb0b;hp=73d3f9db8965402347df6678473aff42b38b0a92;hpb=ae086250853cb901d59943bd98aa9a0ad9a85b4d;p=oota-llvm.git

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 73d3f9db896..689ea208689 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -337,14 +337,42 @@ void Formula::dump() const {
   print(errs()); errs() << '\n';
 }
 
-/// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
-/// or null otherwise. If IgnoreSignificantBits is true, expressions like
-/// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may
-/// overflow, which is useful when the result will be used in a context where
-/// the most significant bits are ignored.
-static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
-                           ScalarEvolution &SE,
-                           bool IgnoreSignificantBits = false) {
+/// isAddRecSExtable - Return true if the given addrec can be sign-extended
+/// without changing its value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(AR->getType()) + 1);
+  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// isAddSExtable - Return true if the given add can be sign-extended
+/// without changing its value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// isMulSExtable - Return true if the given mul can be sign-extended
+/// without changing its value.
+static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
+  const Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
+/// and if the remainder is known to be zero, or null otherwise. If
+/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
+/// to Y, ignoring that the multiplication may overflow, which is useful when
+/// the result will be used in a context where the most significant bits are
+/// ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE,
+                                bool IgnoreSignificantBits = false) {
   // Handle the trivial case, which works for any SCEV type.
   if (LHS == RHS)
     return SE.getIntegerSCEV(1, LHS->getType());
@@ -365,39 +393,44 @@ static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
                            .sdiv(RC->getValue()->getValue()));
   }
 
-  // Distribute the sdiv over addrec operands.
+  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
-    const SCEV *Start = getSDiv(AR->getStart(), RHS, SE,
-                                IgnoreSignificantBits);
-    if (!Start) return 0;
-    const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE,
-                               IgnoreSignificantBits);
-    if (!Step) return 0;
-    return SE.getAddRecExpr(Start, Step, AR->getLoop());
+    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return 0;
+      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+                                      IgnoreSignificantBits);
+      if (!Step) return 0;
+      return SE.getAddRecExpr(Start, Step, AR->getLoop());
+    }
   }
 
-  // Distribute the sdiv over add operands.
+  // Distribute the sdiv over add operands, if the add doesn't overflow.
   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
-    SmallVector<const SCEV *, 8> Ops;
-    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
-         I != E; ++I) {
-      const SCEV *Op = getSDiv(*I, RHS, SE,
-                               IgnoreSignificantBits);
-      if (!Op) return 0;
-      Ops.push_back(Op);
+    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+      SmallVector<const SCEV *, 8> Ops;
+      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+           I != E; ++I) {
+        const SCEV *Op = getExactSDiv(*I, RHS, SE,
+                                      IgnoreSignificantBits);
+        if (!Op) return 0;
+        Ops.push_back(Op);
+      }
+      return SE.getAddExpr(Ops);
     }
-    return SE.getAddExpr(Ops);
   }
 
   // Check for a multiply operand that we can pull RHS out of.
   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
-    if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) {
+    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
       SmallVector<const SCEV *, 4> Ops;
       bool Found = false;
       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
            I != E; ++I) {
         if (!Found)
-          if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) {
+          if (const SCEV *Q = getExactSDiv(*I, RHS, SE,
+                                           IgnoreSignificantBits)) {
             Ops.push_back(Q);
             Found = true;
             continue;
@@ -579,6 +612,10 @@ private:
                     SmallPtrSet<const SCEV *, 16> &Regs,
                     const Loop *L,
                     ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg,
+                           SmallPtrSet<const SCEV *, 16> &Regs,
+                           const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT);
 };
 
 }
@@ -588,49 +625,59 @@ void Cost::RateRegister(const SCEV *Reg,
                         SmallPtrSet<const SCEV *, 16> &Regs,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT) {
-  if (Regs.insert(Reg)) {
-    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
-      if (AR->getLoop() == L)
-        AddRecCost += 1; /// TODO: This should be a function of the stride.
-
-      // If this is an addrec for a loop that's already been visited by LSR,
-      // don't second-guess its addrec phi nodes. LSR isn't currently smart
-      // enough to reason about more than one loop at a time. Consider these
-      // registers free and leave them alone.
-      else if (L->contains(AR->getLoop()) ||
-               (!AR->getLoop()->contains(L) &&
-                DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
-        for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
-             PHINode *PN = dyn_cast<PHINode>(I); ++I)
-          if (SE.isSCEVable(PN->getType()) &&
-              (SE.getEffectiveSCEVType(PN->getType()) ==
-               SE.getEffectiveSCEVType(AR->getType())) &&
-              SE.getSCEV(PN) == AR)
-            goto no_cost;
-
-        // If this isn't one of the addrecs that the loop already has, it
-        // would require a costly new phi and add.
-        ++NumBaseAdds;
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    if (AR->getLoop() == L)
+      AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // If this is an addrec for a loop that's already been visited by LSR,
+    // don't second-guess its addrec phi nodes. LSR isn't currently smart
+    // enough to reason about more than one loop at a time. Consider these
+    // registers free and leave them alone.
+    else if (L->contains(AR->getLoop()) ||
+             (!AR->getLoop()->contains(L) &&
+              DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
+      for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+           PHINode *PN = dyn_cast<PHINode>(I); ++I)
+        if (SE.isSCEVable(PN->getType()) &&
+            (SE.getEffectiveSCEVType(PN->getType()) ==
+             SE.getEffectiveSCEVType(AR->getType())) &&
+            SE.getSCEV(PN) == AR)
+          return;
+
+      // If this isn't one of the addrecs that the loop already has, it
+      // would require a costly new phi and add. TODO: This isn't
+      // precisely modeled right now.
+      ++NumBaseAdds;
+      if (!Regs.count(AR->getStart()))
         RateRegister(AR->getStart(), Regs, L, SE, DT);
-      }
-
-      // Add the step value register, if it needs one.
-      // TODO: The non-affine case isn't precisely modeled here.
-      if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
     }
-    ++NumRegs;
-    // Rough heuristic; favor registers which don't require extra setup
-    // instructions in the preheader.
-    if (!isa<SCEVUnknown>(Reg) &&
-        !isa<SCEVConstant>(Reg) &&
-        !(isa<SCEVAddRecExpr>(Reg) &&
-          (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-           isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-      ++SetupCost;
-  no_cost:;
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
+      if (!Regs.count(AR->getStart()))
+        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
   }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) &&
+      !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+}
+
+/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
+/// before, rate it.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+                               SmallPtrSet<const SCEV *, 16> &Regs,
+                               const Loop *L,
+                               ScalarEvolution &SE, DominatorTree &DT) {
+  if (Regs.insert(Reg))
+    RateRegister(Reg, Regs, L, SE, DT);
 }
 
 void Cost::RateFormula(const Formula &F,
@@ -645,7 +692,7 @@ void Cost::RateFormula(const Formula &F,
       Loose();
       return;
     }
-    RateRegister(ScaledReg, Regs, L, SE, DT);
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
   }
   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
        E = F.BaseRegs.end(); I != E; ++I) {
@@ -654,7 +701,7 @@ void Cost::RateFormula(const Formula &F,
       Loose();
       return;
     }
-    RateRegister(BaseReg, Regs, L, SE, DT);
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
 
     NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
                  BaseReg->hasComputableLoopEvolution(L);
@@ -865,7 +912,7 @@ public:
                       MaxOffset(INT64_MIN),
                       AllFixupsOutsideLoop(true) {}
 
-  bool InsertFormula(size_t LUIdx, const Formula &F);
+  bool InsertFormula(const Formula &F);
 
   void check() const;
 
@@ -875,7 +922,7 @@ public:
 
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
-bool LSRUse::InsertFormula(size_t LUIdx, const Formula &F) {
+bool LSRUse::InsertFormula(const Formula &F) {
   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   // Unstable sort by host order ok, because this is only used for uniquifying.
@@ -911,7 +958,7 @@ void LSRUse::print(raw_ostream &OS) const {
   case ICmpZero: OS << "ICmpZero"; break;
   case Address:
     OS << "Address of ";
-    if (isa<PointerType>(AccessTy))
+    if (AccessTy->isPointerTy())
       OS << "pointer"; // the full pointer type could be really verbose
     else
       OS << *AccessTy;
@@ -1010,8 +1057,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
                              GlobalValue *BaseGV,
                              bool HasBaseReg,
                              LSRUse::KindType Kind, const Type *AccessTy,
-                             const TargetLowering *TLI,
-                             ScalarEvolution &SE) {
+                             const TargetLowering *TLI) {
   // Fast-path: zero is always foldable.
   if (BaseOffs == 0 && !BaseGV) return true;
 
@@ -1139,7 +1185,7 @@ class LSRInstance {
                           const Type *AccessTy);
 
 public:
-  void InsertInitialFormula(const SCEV *S, Loop *L, LSRUse &LU, size_t LUIdx);
+  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   void CountRegisters(const Formula &F, size_t LUIdx);
   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
@@ -1170,16 +1216,18 @@ public:
   Value *Expand(const LSRFixup &LF,
                 const Formula &F,
-                BasicBlock::iterator IP, Loop *L, Instruction *IVIncInsertPos,
+                BasicBlock::iterator IP,
                 SCEVExpander &Rewriter,
-                SmallVectorImpl<WeakVH> &DeadInsts,
-                ScalarEvolution &SE, DominatorTree &DT) const;
+                SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+                     const Formula &F,
+                     SCEVExpander &Rewriter,
+                     SmallVectorImpl<WeakVH> &DeadInsts,
+                     Pass *P) const;
   void Rewrite(const LSRFixup &LF,
                const Formula &F,
-               Loop *L, Instruction *IVIncInsertPos,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
-               ScalarEvolution &SE, DominatorTree &DT,
               Pass *P) const;
 
   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                          Pass *P);
@@ -1508,7 +1556,7 @@ LSRInstance::OptimizeLoopTermCond() {
               A = SE.getSignExtendExpr(A, B->getType());
           }
           if (const SCEVConstant *D =
-                dyn_cast_or_null<SCEVConstant>(getSDiv(B, A, SE))) {
+                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
             // Stride of one or negative one can have reuse with non-addresses.
             if (D->getValue()->isOne() || D->getValue()->isAllOnesValue())
               goto decline_post_inc;
@@ -1517,6 +1565,10 @@ LSRInstance::OptimizeLoopTermCond() {
             if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
                 D->getValue()->getValue().isMinSignedValue())
              goto decline_post_inc;
+            // Without TLI, assume that any stride might be valid, and so any
+            // use might be shared.
+            if (!TLI)
+              goto decline_post_inc;
             // Check for possible scaled-address reuse.
             const Type *AccessTy = getAccessType(UI->getUser());
             TargetLowering::AddrMode AM;
@@ -1597,12 +1649,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
   // Conservatively assume HasBaseReg is true for now.
   if (NewOffset < LU.MinOffset) {
     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
-                          Kind, AccessTy, TLI, SE))
+                          Kind, AccessTy, TLI))
       return false;
     NewMinOffset = NewOffset;
   } else if (NewOffset > LU.MaxOffset) {
     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
-                          Kind, AccessTy, TLI, SE))
+                          Kind, AccessTy, TLI))
       return false;
    NewMaxOffset = NewOffset;
   }
@@ -1629,8 +1681,7 @@ LSRInstance::getUse(const SCEV *&Expr,
   int64_t Offset = ExtractImmediate(Expr, SE);
 
   // Basic uses can't accept any offset, for example.
-  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true,
-                        Kind, AccessTy, TLI, SE)) {
+  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
     Expr = Copy;
     Offset = 0;
   }
@@ -1665,21 +1716,29 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
   SmallSetVector<const SCEV *, 4> Strides;
 
-  // Collect interesting types and factors.
+  // Collect interesting types and strides.
   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
     const SCEV *Stride = UI->getStride();
 
     // Collect interesting types.
     Types.insert(SE.getEffectiveSCEVType(Stride->getType()));
 
-    // Collect interesting factors.
+    // Add the stride for this loop.
+    Strides.insert(Stride);
+
+    // Add strides for other mentioned loops.
+    for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset());
+         AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart()))
+      Strides.insert(AR->getStepRecurrence(SE));
+  }
+
+  // Compute interesting factors from the set of interesting strides.
+  for (SmallSetVector<const SCEV *, 4>::const_iterator
+       I = Strides.begin(), E = Strides.end(); I != E; ++I)
     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
-         Strides.begin(), SEnd = Strides.end(); NewStrideIter != SEnd;
-         ++NewStrideIter) {
-      const SCEV *OldStride = Stride;
+         next(I); NewStrideIter != E; ++NewStrideIter) {
+      const SCEV *OldStride = *I;
       const SCEV *NewStride = *NewStrideIter;
-      if (OldStride == NewStride)
-        continue;
 
       if (SE.getTypeSizeInBits(OldStride->getType()) !=
           SE.getTypeSizeInBits(NewStride->getType())) {
@@ -1690,19 +1749,18 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
       }
       if (const SCEVConstant *Factor =
-            dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
-                                                   SE, true))) {
+            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+                                                        SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
       } else if (const SCEVConstant *Factor =
-                   dyn_cast_or_null<SCEVConstant>(getSDiv(OldStride, NewStride,
-                                                          SE, true))) {
+                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+                                                               NewStride,
+                                                               SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
       }
     }
-    Strides.insert(Stride);
-  }
 
   // If all uses use the same type, don't bother looking for truncation-based
   // reuse.
@@ -1770,7 +1828,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 
     // If this is the first use of this LSRUse, give it a formula.
     if (LU.Formulae.empty()) {
-      InsertInitialFormula(S, L, LU, LF.LUIdx);
+      InsertInitialFormula(S, LU, LF.LUIdx);
       CountRegisters(LU.Formulae.back(), LF.LUIdx);
     }
   }
@@ -1779,8 +1837,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 }
 
 void
-LSRInstance::InsertInitialFormula(const SCEV *S, Loop *L,
-                                  LSRUse &LU, size_t LUIdx) {
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
   Formula F;
   F.InitialMatch(S, L, SE, DT);
   bool Inserted = InsertFormula(LU, LUIdx, F);
@@ -1810,7 +1867,7 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
 
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
-  if (!LU.InsertFormula(LUIdx, F))
+  if (!LU.InsertFormula(F))
     return false;
 
   CountRegisters(F, LUIdx);
@@ -1902,10 +1959,10 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     // Split a non-zero base out of an addrec.
     if (!AR->getStart()->isZero()) {
-      CollectSubexprs(AR->getStart(), C, Ops, SE);
       CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
                                        AR->getStepRecurrence(SE),
                                        AR->getLoop()), C, Ops, SE);
+      CollectSubexprs(AR->getStart(), C, Ops, SE);
       return;
     }
   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
@@ -1977,7 +2034,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
 /// GenerateCombinations - Generate a formula consisting of all of the
 /// loop-dominating registers added into a single register.
 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
-                                        Formula Base) {
+                                       Formula Base) {
   // This method is only interesting on a plurality of registers.
   if (Base.BaseRegs.size() <= 1) return;
 
@@ -1994,8 +2051,14 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
       F.BaseRegs.push_back(BaseReg);
   }
   if (Ops.size() > 1) {
-    F.BaseRegs.push_back(SE.getAddExpr(Ops));
-    (void)InsertFormula(LU, LUIdx, F);
+    const SCEV *Sum = SE.getAddExpr(Ops);
+    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+    // opportunity to fold something. For now, just ignore such cases
+    // rather than proceed with zero in a register.
+    if (!Sum->isZero()) {
+      F.BaseRegs.push_back(Sum);
+      (void)InsertFormula(LU, LUIdx, F);
+    }
   }
 }
 
@@ -2081,14 +2144,18 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     Formula F = Base;
 
     // Check that the multiplication doesn't overflow.
+    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
+      continue;
     F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
-    if ((int64_t)F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
+    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
      continue;
 
     // Check that multiplying with the use offset doesn't overflow.
     int64_t Offset = LU.MinOffset;
+    if (Offset == INT64_MIN && Factor == -1)
+      continue;
     Offset = (uint64_t)Offset * Factor;
-    if ((int64_t)Offset / Factor != LU.MinOffset)
+    if (Offset / Factor != LU.MinOffset)
      continue;
 
     // Check that this scale is legal.
@@ -2103,14 +2170,14 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     // Check that multiplying with each base register doesn't overflow.
     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
-      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
         goto next;
     }
 
     // Check that multiplying with the scaled register doesn't overflow.
     if (F.ScaledReg) {
       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
-      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
         continue;
     }
 
@@ -2165,7 +2232,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
         continue;
       // Divide out the factor, ignoring high bits, since we'll be
       // scaling the value back up in the end.
-      if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
+      if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
         // TODO: This could be optimized to avoid all the copying.
         Formula F = Base;
         F.ScaledReg = Quotient;
@@ -2214,7 +2281,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
 
 namespace {
 
-/// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
+/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
 /// defer modifications so that the search phase doesn't have to worry about
 /// the data structures moving underneath it.
 struct WorkItem {
@@ -2271,6 +2338,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
     const SCEV *Reg = *I;
     const ImmMapTy &Imms = Map.find(Reg)->second;
 
+    // It's not worthwhile looking for reuse if there's only one offset.
+    if (Imms.size() == 1)
+      continue;
+
     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
                J != JE; ++J)
@@ -2299,7 +2370,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       };
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
-        if (M == J) continue;
+        if (M == J || M == JE) continue;
 
         // Compute the difference between the two.
         int64_t Imm = (uint64_t)JImm - M->first;
@@ -2399,7 +2470,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
 
 /// GenerateAllReuseFormulae - Generate formulae for each use.
 void
 LSRInstance::GenerateAllReuseFormulae() {
-  // This is split into two loops so that hasRegsUsedByUsesOtherThan
+  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
   // queries are more precise.
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
@@ -2418,6 +2489,9 @@ LSRInstance::GenerateAllReuseFormulae() {
       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateScales(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
@@ -2489,7 +2563,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   }
 
   DEBUG(if (Changed) {
-          dbgs() << "After filtering out undesirable candidates:\n";
+          dbgs() << "\n"
+                    "After filtering out undesirable candidates:\n";
           print_uses(dbgs());
         });
 }
@@ -2606,13 +2681,13 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   SmallSetVector<const SCEV *, 4> ReqRegs;
   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
        E = CurRegs.end(); I != E; ++I)
-    if (LU.Regs.count(*I)) {
+    if (LU.Regs.count(*I))
       ReqRegs.insert(*I);
-      break;
-    }
 
+  bool AnySatisfiedReqRegs = false;
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost;
+retry:
   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
        E = LU.Formulae.end(); I != E; ++I) {
     const Formula &F = *I;
@@ -2626,6 +2701,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
             F.BaseRegs.end())
           goto skip;
     }
+    AnySatisfiedReqRegs = true;
 
     // Evaluate the cost of the current formula. If it's already worse than
     // the current best, prune the search at that point.
@@ -2654,6 +2730,13 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     }
   skip:;
   }
+
+  // If none of the formulae had all of the required registers, relax the
+  // constraint so that we don't exclude all formulae.
+  if (!AnySatisfiedReqRegs) {
+    ReqRegs.clear();
+    goto retry;
+  }
 }
 
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
@@ -2696,10 +2779,8 @@ static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
 Value *LSRInstance::Expand(const LSRFixup &LF,
                            const Formula &F,
                            BasicBlock::iterator IP,
-                           Loop *L, Instruction *IVIncInsertPos,
                            SCEVExpander &Rewriter,
-                           SmallVectorImpl<WeakVH> &DeadInsts,
-                           ScalarEvolution &SE, DominatorTree &DT) const {
+                           SmallVectorImpl<WeakVH> &DeadInsts) const {
   const LSRUse &LU = Uses[LF.LUIdx];
 
   // Then, collect some instructions which we will remain dominated by when
@@ -2776,8 +2857,10 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
       if (AR->getLoop() == LF.PostIncLoop) {
         Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
         // If the user is inside the loop, insert the code after the increment
-        // so that it is dominated by its operand.
+        // so that it is dominated by its operand. If the original insert point
+        // was already dominated by the increment, keep it, because there may
+        // be loop-variant operands that need to be respected also.
-        if (L->contains(LF.UserInst))
+        if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP))
           IP = IVIncInsertPos;
         break;
       }
@@ -2881,73 +2964,81 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
   return FullV;
 }
 
+/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
+/// of their operands effectively happens in their predecessor blocks, so the
+/// expression may need to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRFixup &LF,
+                                const Formula &F,
+                                SCEVExpander &Rewriter,
+                                SmallVectorImpl<WeakVH> &DeadInsts,
+                                Pass *P) const {
+  DenseMap<BasicBlock *, Value *> Inserted;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      BasicBlock *BB = PN->getIncomingBlock(i);
+
+      // If this is a critical edge, split the edge so that we do not insert
+      // the code on all predecessor/successor paths. We do this unless this
+      // is the canonical backedge for this loop, which complicates post-inc
+      // users.
+      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+          !isa<IndirectBrInst>(BB->getTerminator()) &&
+          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
+        // Split the critical edge.
+        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
+
+        // If PN is outside of the loop and BB is in the loop, we want to
+        // move the block to be immediately before the PHI block, not
+        // immediately after BB.
+        if (L->contains(BB) && !L->contains(PN))
+          NewBB->moveBefore(PN->getParent());
+
+        // Splitting the edge can reduce the number of PHI entries we have.
+        e = PN->getNumIncomingValues();
+        BB = NewBB;
+        i = PN->getBasicBlockIndex(BB);
+      }
+
+      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
+      if (!Pair.second)
+        PN->setIncomingValue(i, Pair.first->second);
+      else {
+        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
+
+        // If this is reuse-by-noop-cast, insert the noop cast.
+        const Type *OpTy = LF.OperandValToReplace->getType();
+        if (FullV->getType() != OpTy)
+          FullV =
+            CastInst::Create(CastInst::getCastOpcode(FullV, false,
+                                                     OpTy, false),
+                             FullV, LF.OperandValToReplace->getType(),
+                             "tmp", BB->getTerminator());
+
+        PN->setIncomingValue(i, FullV);
+        Pair.first->second = FullV;
+      }
+    }
+}
+
 /// Rewrite - Emit instructions for the leading candidate expression for this
 /// LSRUse (this is called "expanding"), and update the UserInst to reference
 /// the newly expanded value.
 void LSRInstance::Rewrite(const LSRFixup &LF,
                           const Formula &F,
-                          Loop *L, Instruction *IVIncInsertPos,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts,
-                          ScalarEvolution &SE, DominatorTree &DT,
                           Pass *P) const {
-  const Type *OpTy = LF.OperandValToReplace->getType();
-
   // First, find an insertion point that dominates UserInst. For PHI nodes,
   // find the nearest block which dominates all the relevant uses.
   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
-    DenseMap<BasicBlock *, Value *> Inserted;
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
-        BasicBlock *BB = PN->getIncomingBlock(i);
-
-        // If this is a critical edge, split the edge so that we do not insert
-        // the code on all predecessor/successor paths. We do this unless this
-        // is the canonical backedge for this loop, which complicates post-inc
-        // users.
-        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
-            !isa<IndirectBrInst>(BB->getTerminator()) &&
-            (PN->getParent() != L->getHeader() || !L->contains(BB))) {
-          // Split the critical edge.
-          BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
-
-          // If PN is outside of the loop and BB is in the loop, we want to
-          // move the block to be immediately before the PHI block, not
-          // immediately after BB.
-          if (L->contains(BB) && !L->contains(PN))
-            NewBB->moveBefore(PN->getParent());
-
-          // Splitting the edge can reduce the number of PHI entries we have.
-          e = PN->getNumIncomingValues();
-          BB = NewBB;
-          i = PN->getBasicBlockIndex(BB);
-        }
-
-        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
-          Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
-        if (!Pair.second)
-          PN->setIncomingValue(i, Pair.first->second);
-        else {
-          Value *FullV = Expand(LF, F, BB->getTerminator(), L, IVIncInsertPos,
-                                Rewriter, DeadInsts, SE, DT);
-
-          // If this is reuse-by-noop-cast, insert the noop cast.
-          if (FullV->getType() != OpTy)
-            FullV =
-              CastInst::Create(CastInst::getCastOpcode(FullV, false,
-                                                       OpTy, false),
-                               FullV, LF.OperandValToReplace->getType(),
-                               "tmp", BB->getTerminator());
-
-          PN->setIncomingValue(i, FullV);
-          Pair.first->second = FullV;
-        }
-      }
+    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
   } else {
-    Value *FullV = Expand(LF, F, LF.UserInst, L, IVIncInsertPos,
-                          Rewriter, DeadInsts, SE, DT);
+    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
 
     // If this is reuse-by-noop-cast, insert the noop cast.
+    const Type *OpTy = LF.OperandValToReplace->getType();
     if (FullV->getType() != OpTy) {
       Instruction *Cast =
         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
@@ -2984,8 +3075,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
 
   for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
     size_t LUIdx = Fixups[i].LUIdx;
-    Rewrite(Fixups[i], *Solution[LUIdx], L, IVIncInsertPos, Rewriter,
-            DeadInsts, SE, DT, P);
+    Rewrite(Fixups[i], *Solution[LUIdx], Rewriter, DeadInsts, P);
     Changed = true;
   }
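
Note (not part of the patch): the INT64_MIN / Factor == -1 guards added in GenerateICmpZeroScales above follow a common pattern for detecting signed multiplication overflow without invoking undefined behavior: handle the one case that division cannot verify (INT64_MIN * -1) up front, do the multiply in unsigned arithmetic, and then check that dividing the product by the factor recovers the original value. Below is a minimal standalone sketch of that check; mulOverflows and the example values are illustrative only and are not LLVM APIs.

#include <cstdint>
#include <cassert>

// Returns true if a * factor would overflow int64_t, using the same
// guard-then-divide-back check as the patch: INT64_MIN * -1 is the one
// case the division check cannot catch, so it is tested explicitly.
static bool mulOverflows(int64_t a, int64_t factor) {
  assert(factor != 0 && "scaling factor must be nonzero");
  if (a == INT64_MIN && factor == -1)
    return true;  // -INT64_MIN is not representable in int64_t
  // Multiply in unsigned arithmetic (wraps modulo 2^64 instead of UB),
  // then verify the result divides back to the original operand.
  int64_t Product = (int64_t)((uint64_t)a * (uint64_t)factor);
  return Product / factor != a;
}

int main() {
  assert(!mulOverflows(1000, 4));               // exact: no overflow
  assert(mulOverflows(INT64_MAX / 2 + 1, 2));   // wraps past INT64_MAX
  assert(mulOverflows(INT64_MIN, -1));          // caught by the explicit guard
  return 0;
}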