lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
  31 // the value of the register before the add and some using // it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
  41 //       that {0,+,1}<%bb> is getting picked first because all 7 uses can
  42 //       use it, and while it's a pretty good solution, it means that LSR
  43 //       doesn't look further to find an even better solution.
  44 //
  45 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
  46 //       instead of a GlobalValue?
  47 //
  48 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  49 //       smaller encoding (on x86 at least).
  50 //
  51 // TODO: When a negated register is used by an add (such as in a list of
  52 //       multiple base registers, or as the increment expression in an addrec),
  53 //       we may not actually need both reg and (-1 * reg) in registers; the
  54 //       negation can be implemented by using a sub instead of an add. The
  55 //       lack of support for taking this into consideration when making
  56 //       register pressure decisions is partly worked around by the "Special"
  57 //       use kind.
  58 //
  59 //===----------------------------------------------------------------------===//
  60
  61 #define DEBUG_TYPE "loop-reduce"
  62 #include "llvm/Transforms/Scalar.h"
  63 #include "llvm/Constants.h"
  64 #include "llvm/Instructions.h"
  65 #include "llvm/IntrinsicInst.h"
  66 #include "llvm/DerivedTypes.h"
  67 #include "llvm/Analysis/IVUsers.h"
  68 #include "llvm/Analysis/Dominators.h"
  69 #include "llvm/Analysis/LoopPass.h"
  70 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  71 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  72 #include "llvm/Transforms/Utils/Local.h"
  73 #include "llvm/ADT/SmallBitVector.h"
  74 #include "llvm/ADT/SetVector.h"
  75 #include "llvm/Support/Debug.h"
  76 #include "llvm/Support/ValueHandle.h"
  77 #include "llvm/Support/raw_ostream.h"
  78 #include "llvm/Target/TargetLowering.h"
  79 #include <algorithm>
  80 using namespace llvm;
  81
  82 namespace {
  83
  84 // Constant strides come first which in turns are sorted by their absolute
  85 // values. If absolute values are the same, then positive strides comes first.
  86 // e.g.
  87 // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
  88 struct StrideCompare {
  89   const ScalarEvolution &SE;
  90   explicit StrideCompare(const ScalarEvolution &se) : SE(se) {}
  91
  92   bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const {
  93     const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
  94     const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
  95     if (LHSC && RHSC) {
  96       unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()),
  97                                    SE.getTypeSizeInBits(RHS->getType()));
  98       APInt  LV = LHSC->getValue()->getValue();
  99       APInt  RV = RHSC->getValue()->getValue();
 100       LV.sextOrTrunc(BitWidth);
 101       RV.sextOrTrunc(BitWidth);
 102       APInt ALV = LV.abs();
 103       APInt ARV = RV.abs();
 104       if (ALV == ARV) {
 105         if (LV != RV)
 106           return LV.sgt(RV);
 107       } else {
 108         return ALV.ult(ARV);
 109       }
 110
 111       // If it's the same value but different type, sort by bit width so
 112       // that we emit larger induction variables before smaller
 113       // ones, letting the smaller be re-written in terms of larger ones.
 114       return SE.getTypeSizeInBits(RHS->getType()) <
 115              SE.getTypeSizeInBits(LHS->getType());
 116     }
 117     return LHSC && !RHSC;
 118   }
 119 };
 120
 121 /// RegSortData - This class holds data which is used to order reuse
 122 /// candidates.
 123 class RegSortData {
 124 public:
 125   /// Bits - This represents the set of LSRUses (by index) which reference a
 126   /// particular register.
 127   SmallBitVector Bits;
 128
 129   /// MaxComplexity - This represents the greatest complexity (see the comments
 130   /// on Formula::getComplexity) seen with a particular register.
 131   uint32_t MaxComplexity;
 132
 133   /// Index - This holds an arbitrary value used as a last-resort tie breaker
 134   /// to ensure deterministic behavior.
 135   unsigned Index;
 136
 137   RegSortData() {}
 138
 139   void print(raw_ostream &OS) const;
 140   void dump() const;
 141 };
 142
 143 }
 144
 145 void RegSortData::print(raw_ostream &OS) const {
 146   OS << "[NumUses=" << Bits.count()
 147      << ", MaxComplexity=";
 148   OS.write_hex(MaxComplexity);
 149   OS << ", Index=" << Index << ']';
 150 }
 151
 152 void RegSortData::dump() const {
 153   print(errs()); errs() << '\n';
 154 }
 155
 156 namespace {
 157
 158 /// RegCount - This is a helper class to sort a given set of registers
 159 /// according to associated RegSortData values.
 160 class RegCount {
 161 public:
 162   const SCEV *Reg;
 163   RegSortData Sort;
 164
 165   RegCount(const SCEV *R, const RegSortData &RSD)
 166     : Reg(R), Sort(RSD) {}
 167
 168   // Sort by count. Returning true means the register is preferred.
 169   bool operator<(const RegCount &Other) const {
 170     // Sort by the number of unique uses of this register.
 171     unsigned A = Sort.Bits.count();
 172     unsigned B = Other.Sort.Bits.count();
 173     if (A != B) return A > B;
 174
 175     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 176       const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg);
 177       // AddRecs have higher priority than other things.
 178       if (!BR) return true;
 179
 180       // Prefer affine values.
 181       if (AR->isAffine() != BR->isAffine())
 182         return AR->isAffine();
 183
 184       const Loop *AL = AR->getLoop();
 185       const Loop *BL = BR->getLoop();
 186       if (AL != BL) {
 187         unsigned ADepth = AL->getLoopDepth();
 188         unsigned BDepth = BL->getLoopDepth();
 189         // Prefer a less deeply nested addrec.
 190         if (ADepth != BDepth)
 191           return ADepth < BDepth;
 192
 193         // Different loops at the same depth; do something arbitrary.
 194         BasicBlock *AH = AL->getHeader();
 195         BasicBlock *BH = BL->getHeader();
 196         for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I)
 197           if (&*I == BH) return true;
 198         return false;
 199       }
 200
 201       // Sort addrecs by stride.
 202       const SCEV *AStep = AR->getOperand(1);
 203       const SCEV *BStep = BR->getOperand(1);
 204       if (AStep != BStep) {
 205         if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) {
 206           const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep);
 207           if (!BC) return true;
 208           // Arbitrarily prefer wider registers.
 209           if (AC->getValue()->getValue().getBitWidth() !=
 210               BC->getValue()->getValue().getBitWidth())
 211             return AC->getValue()->getValue().getBitWidth() >
 212                    BC->getValue()->getValue().getBitWidth();
 213           // Ignore the sign bit, assuming that striding by a negative value
 214           // is just as easy as by a positive value.
 215           // Prefer the addrec with the lesser absolute value stride, as it
 216           // will allow uses to have simpler addressing modes.
 217           return AC->getValue()->getValue().abs()
 218             .ult(BC->getValue()->getValue().abs());
 219         }
 220       }
 221
 222       // Then sort by the register which will permit the simplest uses.
 223       // This is a heuristic; currently we only track the most complex use as a
 224       // representative.
 225       if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
 226         return Sort.MaxComplexity < Other.Sort.MaxComplexity;
 227
 228       // Then sort them by their start values.
 229       const SCEV *AStart = AR->getStart();
 230       const SCEV *BStart = BR->getStart();
 231       if (AStart != BStart) {
 232         if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) {
 233           const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart);
 234           if (!BC) return true;
 235           // Arbitrarily prefer wider registers.
 236           if (AC->getValue()->getValue().getBitWidth() !=
 237               BC->getValue()->getValue().getBitWidth())
 238             return AC->getValue()->getValue().getBitWidth() >
 239                    BC->getValue()->getValue().getBitWidth();
 240           // Prefer positive over negative if the absolute values are the same.
 241           if (AC->getValue()->getValue().abs() ==
 242               BC->getValue()->getValue().abs())
 243             return AC->getValue()->getValue().isStrictlyPositive();
 244           // Prefer the addrec with the lesser absolute value start.
 245           return AC->getValue()->getValue().abs()
 246             .ult(BC->getValue()->getValue().abs());
 247         }
 248       }
 249     } else {
 250       // AddRecs have higher priority than other things.
 251       if (isa<SCEVAddRecExpr>(Other.Reg)) return false;
 252       // Sort by the register which will permit the simplest uses.
 253       // This is a heuristic; currently we only track the most complex use as a
 254       // representative.
 255       if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
 256         return Sort.MaxComplexity < Other.Sort.MaxComplexity;
 257     }
 258
 259
 260     // Tie-breaker: the arbitrary index, to ensure a reliable ordering.
 261     return Sort.Index < Other.Sort.Index;
 262   }
 263
 264   void print(raw_ostream &OS) const;
 265   void dump() const;
 266 };
 267
 268 }
 269
 270 void RegCount::print(raw_ostream &OS) const {
 271   OS << *Reg << ':';
 272   Sort.print(OS);
 273 }
 274
 275 void RegCount::dump() const {
 276   print(errs()); errs() << '\n';
 277 }
 278
 279 namespace {
 280
 281 /// Formula - This class holds information that describes a formula for
 282 /// satisfying a use. It may include broken-out immediates and scaled registers.
 283 struct Formula {
 284   /// AM - This is used to represent complex addressing, as well as other kinds
 285   /// of interesting uses.
 286   TargetLowering::AddrMode AM;
 287
 288   /// BaseRegs - The list of "base" registers for this use. When this is
 289   /// non-empty, AM.HasBaseReg should be set to true.
 290   SmallVector<const SCEV *, 2> BaseRegs;
 291
 292   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 293   /// when AM.Scale is not zero.
 294   const SCEV *ScaledReg;
 295
 296   Formula() : ScaledReg(0) {}
 297
 298   unsigned getNumRegs() const;
 299   uint32_t getComplexity() const;
 300   const Type *getType() const;
 301
 302   void InitialMatch(const SCEV *S, Loop *L,
 303                     ScalarEvolution &SE, DominatorTree &DT);
 304
 305   /// referencesReg - Test if this formula references the given register.
 306   bool referencesReg(const SCEV *S) const {
 307     return S == ScaledReg ||
 308            std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 309   }
 310
 311   bool operator==(const Formula &Other) const {
 312     return BaseRegs == Other.BaseRegs &&
 313            ScaledReg == Other.ScaledReg &&
 314            AM.Scale == Other.AM.Scale &&
 315            AM.BaseOffs == Other.AM.BaseOffs &&
 316            AM.BaseGV == Other.AM.BaseGV;
 317   }
 318
 319   // This sorts at least partially based on host pointer values which are
 320   // not deterministic, so it is only usable for uniqification.
 321   bool operator<(const Formula &Other) const {
 322     if (BaseRegs != Other.BaseRegs)
 323       return BaseRegs < Other.BaseRegs;
 324     if (ScaledReg != Other.ScaledReg)
 325       return ScaledReg < Other.ScaledReg;
 326     if (AM.Scale != Other.AM.Scale)
 327       return AM.Scale < Other.AM.Scale;
 328     if (AM.BaseOffs != Other.AM.BaseOffs)
 329       return AM.BaseOffs < Other.AM.BaseOffs;
 330     if (AM.BaseGV != Other.AM.BaseGV)
 331       return AM.BaseGV < Other.AM.BaseGV;
 332     return false;
 333   }
 334
 335   void print(raw_ostream &OS) const;
 336   void dump() const;
 337 };
 338
 339 }
 340
 341 /// getNumRegs - Return the total number of register operands used by this
 342 /// formula. This does not include register uses implied by non-constant
 343 /// addrec strides.
 344 unsigned Formula::getNumRegs() const {
 345   return !!ScaledReg + BaseRegs.size();
 346 }
 347
 348 /// getComplexity - Return an oversimplified value indicating the complexity
 349 /// of this formula. This is used as a tie-breaker in choosing register
 350 /// preferences.
 351 uint32_t Formula::getComplexity() const {
 352   // Encode the information in a uint32_t so that comparing with operator<
 353   // will be interesting.
 354   return
 355     // Most significant, the number of registers. This saturates because we
 356     // need the bits, and because beyond a few hundred it doesn't really matter.
 357     (std::min(getNumRegs(), (1u<<15)-1) << 17) |
 358     // Having multiple base regs is worse than having a base reg and a scale.
 359     ((BaseRegs.size() > 1) << 16) |
 360     // Scale absolute value.
 361     ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) |
 362     // Scale sign, which is less significant than the absolute value.
 363     ((AM.Scale < 0) << 8) |
 364     // Offset absolute value.
 365     ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) |
 366     // If a GV is present, treat it like a maximal offset.
 367     ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) |
 368     // Offset sign, which is less significant than the absolute offset.
 369     ((AM.BaseOffs < 0) << 0);
 370 }
 371
 372 /// getType - Return the type of this formula, if it has one, or null
 373 /// otherwise. This type is meaningless except for the bit size.
 374 const Type *Formula::getType() const {
 375   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 376          ScaledReg ? ScaledReg->getType() :
 377          AM.BaseGV ? AM.BaseGV->getType() :
 378          0;
 379 }
 380
 381 namespace {
 382
 383 /// ComplexitySorter - A predicate which orders Formulae by the number of
 384 /// registers they contain.
 385 struct ComplexitySorter {
 386   bool operator()(const Formula &LHS, const Formula &RHS) const {
 387     unsigned L = LHS.getNumRegs();
 388     unsigned R = RHS.getNumRegs();
 389     if (L != R) return L < R;
 390
 391     return LHS.getComplexity() < RHS.getComplexity();
 392   }
 393 };
 394
 395 }
 396
 397 /// DoInitialMatch - Recurrsion helper for InitialMatch.
 398 static void DoInitialMatch(const SCEV *S, Loop *L,
 399                            SmallVectorImpl<const SCEV *> &Good,
 400                            SmallVectorImpl<const SCEV *> &Bad,
 401                            ScalarEvolution &SE, DominatorTree &DT) {
 402   // Collect expressions which properly dominate the loop header.
 403   if (S->properlyDominates(L->getHeader(), &DT)) {
 404     Good.push_back(S);
 405     return;
 406   }
 407
 408   // Look at add operands.
 409   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 410     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 411          I != E; ++I)
 412       DoInitialMatch(*I, L, Good, Bad, SE, DT);
 413     return;
 414   }
 415
 416   // Look at addrec operands.
 417   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 418     if (!AR->getStart()->isZero()) {
 419       DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
 420       DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
 421                                       AR->getStepRecurrence(SE),
 422                                       AR->getLoop()),
 423                      L, Good, Bad, SE, DT);
 424       return;
 425     }
 426   }
 427
 428   // Handle a multiplication by -1 (negation) if it didn't fold.
 429   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 430     if (Mul->getOperand(0)->isAllOnesValue()) {
 431       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 432       const SCEV *NewMul = SE.getMulExpr(Ops);
 433
 434       SmallVector<const SCEV *, 4> MyGood;
 435       SmallVector<const SCEV *, 4> MyBad;
 436       DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
 437       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 438         SE.getEffectiveSCEVType(NewMul->getType())));
 439       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 440            E = MyGood.end(); I != E; ++I)
 441         Good.push_back(SE.getMulExpr(NegOne, *I));
 442       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 443            E = MyBad.end(); I != E; ++I)
 444         Bad.push_back(SE.getMulExpr(NegOne, *I));
 445       return;
 446     }
 447
 448   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 449   // register and hope for the best.
 450   Bad.push_back(S);
 451 }
 452
 453 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 454 /// attempting to keep all loop-invariant and loop-computable values in a
 455 /// single base register.
 456 void Formula::InitialMatch(const SCEV *S, Loop *L,
 457                            ScalarEvolution &SE, DominatorTree &DT) {
 458   SmallVector<const SCEV *, 4> Good;
 459   SmallVector<const SCEV *, 4> Bad;
 460   DoInitialMatch(S, L, Good, Bad, SE, DT);
 461   if (!Good.empty()) {
 462     BaseRegs.push_back(SE.getAddExpr(Good));
 463     AM.HasBaseReg = true;
 464   }
 465   if (!Bad.empty()) {
 466     BaseRegs.push_back(SE.getAddExpr(Bad));
 467     AM.HasBaseReg = true;
 468   }
 469 }
 470
 471 void Formula::print(raw_ostream &OS) const {
 472   bool First = true;
 473   if (AM.BaseGV) {
 474     if (!First) OS << " + "; else First = false;
 475     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
 476   }
 477   if (AM.BaseOffs != 0) {
 478     if (!First) OS << " + "; else First = false;
 479     OS << AM.BaseOffs;
 480   }
 481   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 482        E = BaseRegs.end(); I != E; ++I) {
 483     if (!First) OS << " + "; else First = false;
 484     OS << "reg(";
 485     OS << **I;
 486     OS << ")";
 487   }
 488   if (AM.Scale != 0) {
 489     if (!First) OS << " + "; else First = false;
 490     OS << AM.Scale << "*reg(";
 491     if (ScaledReg)
 492       OS << *ScaledReg;
 493     else
 494       OS << "<unknown>";
 495     OS << ")";
 496   }
 497 }
 498
 499 void Formula::dump() const {
 500   print(errs()); errs() << '\n';
 501 }
 502
 503 /// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
 504 /// or null otherwise. If IgnoreSignificantBits is true, expressions like
 505 /// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may
 506 /// overflow, which is useful when the result will be used in a context where
 507 /// the most significant bits are ignored.
 508 static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
 509                            ScalarEvolution &SE,
 510                            bool IgnoreSignificantBits = false) {
 511   // Handle the trivial case, which works for any SCEV type.
 512   if (LHS == RHS)
 513     return SE.getIntegerSCEV(1, LHS->getType());
 514
 515   // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
 516   // folding.
 517   if (RHS->isAllOnesValue())
 518     return SE.getMulExpr(LHS, RHS);
 519
 520   // Check for a division of a constant by a constant.
 521   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 522     const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 523     if (!RC)
 524       return 0;
 525     if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
 526       return 0;
 527     return SE.getConstant(C->getValue()->getValue()
 528                .sdiv(RC->getValue()->getValue()));
 529   }
 530
 531   // Distribute the sdiv over addrec operands.
 532   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 533     const SCEV *Start = getSDiv(AR->getStart(), RHS, SE,
 534                                 IgnoreSignificantBits);
 535     if (!Start) return 0;
 536     const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE,
 537                                IgnoreSignificantBits);
 538     if (!Step) return 0;
 539     return SE.getAddRecExpr(Start, Step, AR->getLoop());
 540   }
 541
 542   // Distribute the sdiv over add operands.
 543   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 544     SmallVector<const SCEV *, 8> Ops;
 545     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 546          I != E; ++I) {
 547       const SCEV *Op = getSDiv(*I, RHS, SE,
 548                                IgnoreSignificantBits);
 549       if (!Op) return 0;
 550       Ops.push_back(Op);
 551     }
 552     return SE.getAddExpr(Ops);
 553   }
 554
 555   // Check for a multiply operand that we can pull RHS out of.
 556   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
 557     if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) {
 558       SmallVector<const SCEV *, 4> Ops;
 559       bool Found = false;
 560       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 561            I != E; ++I) {
 562         if (!Found)
 563           if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) {
 564             Ops.push_back(Q);
 565             Found = true;
 566             continue;
 567           }
 568         Ops.push_back(*I);
 569       }
 570       return Found ? SE.getMulExpr(Ops) : 0;
 571     }
 572
 573   // Otherwise we don't know.
 574   return 0;
 575 }
 576
 577 namespace {
 578
 579 /// LSRUse - This class holds the state that LSR keeps for each use in
 580 /// IVUsers, as well as uses invented by LSR itself. It includes information
 581 /// about what kinds of things can be folded into the user, information
 582 /// about the user itself, and information about how the use may be satisfied.
 583 /// TODO: Represent multiple users of the same expression in common?
 584 class LSRUse {
 585   SmallSet<Formula, 8> FormulaeUniquifier;
 586
 587 public:
 588   /// KindType - An enum for a kind of use, indicating what types of
 589   /// scaled and immediate operands it might support.
 590   enum KindType {
 591     Basic,   ///< A normal use, with no folding.
 592     Special, ///< A special case of basic, allowing -1 scales.
 593     Address, ///< An address use; folding according to TargetLowering
 594     ICmpZero ///< An equality icmp with both operands folded into one.
 595     // TODO: Add a generic icmp too?
 596   };
 597
 598   KindType Kind;
 599   const Type *AccessTy;
 600   Instruction *UserInst;
 601   Value *OperandValToReplace;
 602
 603   /// PostIncLoop - If this user is to use the post-incremented value of an
 604   /// induction variable, this variable is non-null and holds the loop
 605   /// associated with the induction variable.
 606   const Loop *PostIncLoop;
 607
 608   /// Formulae - A list of ways to build a value that can satisfy this user.
 609   /// After the list is populated, one of these is selected heuristically and
 610   /// used to formulate a replacement for OperandValToReplace in UserInst.
 611   SmallVector<Formula, 12> Formulae;
 612
 613   LSRUse() : Kind(Basic), AccessTy(0),
 614              UserInst(0), OperandValToReplace(0), PostIncLoop(0) {}
 615
 616   void InsertInitialFormula(const SCEV *S, Loop *L,
 617                             ScalarEvolution &SE, DominatorTree &DT);
 618   void InsertSupplementalFormula(const SCEV *S);
 619
 620   bool InsertFormula(const Formula &F);
 621
 622   void Rewrite(Loop *L, SCEVExpander &Rewriter,
 623                SmallVectorImpl<WeakVH> &DeadInsts,
 624                ScalarEvolution &SE, DominatorTree &DT,
 625                Pass *P) const;
 626
 627   void print(raw_ostream &OS) const;
 628   void dump() const;
 629
 630 private:
 631   Value *Expand(BasicBlock::iterator IP,
 632                 Loop *L, SCEVExpander &Rewriter,
 633                 SmallVectorImpl<WeakVH> &DeadInsts,
 634                 ScalarEvolution &SE, DominatorTree &DT) const;
 635 };
 636
 637 }
 638
 639 /// ExtractImmediate - If S involves the addition of a constant integer value,
 640 /// return that integer value, and mutate S to point to a new SCEV with that
 641 /// value excluded.
 642 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 643   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 644     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 645       S = SE.getIntegerSCEV(0, C->getType());
 646       return C->getValue()->getSExtValue();
 647     }
 648   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 649     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 650     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 651     S = SE.getAddExpr(NewOps);
 652     return Result;
 653   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 654     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 655     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 656     S = SE.getAddRecExpr(NewOps, AR->getLoop());
 657     return Result;
 658   }
 659   return 0;
 660 }
 661
 662 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 663 /// return that symbol, and mutate S to point to a new SCEV with that
 664 /// value excluded.
 665 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 666   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 667     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 668       S = SE.getIntegerSCEV(0, GV->getType());
 669       return GV;
 670     }
 671   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 672     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 673     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 674     S = SE.getAddExpr(NewOps);
 675     return Result;
 676   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 677     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 678     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 679     S = SE.getAddRecExpr(NewOps, AR->getLoop());
 680     return Result;
 681   }
 682   return 0;
 683 }
 684
 685 /// isLegalUse - Test whether the use described by AM is "legal", meaning
 686 /// it can be completely folded into the user instruction at isel time.
 687 /// This includes address-mode folding and special icmp tricks.
 688 static bool isLegalUse(const TargetLowering::AddrMode &AM,
 689                        LSRUse::KindType Kind, const Type *AccessTy,
 690                        const TargetLowering *TLI) {
 691   switch (Kind) {
 692   case LSRUse::Address:
 693     // If we have low-level target information, ask the target if it can
 694     // completely fold this address.
 695     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
 696
 697     // Otherwise, just guess that reg+reg addressing is legal.
 698     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
 699
 700   case LSRUse::ICmpZero:
 701     // There's not even a target hook for querying whether it would be legal
 702     // to fold a GV into an ICmp.
 703     if (AM.BaseGV)
 704       return false;
 705
 706     // ICmp only has two operands; don't allow more than two non-trivial parts.
 707     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
 708       return false;
 709
 710     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale
 711     // by putting the scaled register in the other operand of the icmp.
 712     if (AM.Scale != 0 && AM.Scale != -1)
 713       return false;
 714
 715     // If we have low-level target information, ask the target if it can
 716     // fold an integer immediate on an icmp.
 717     if (AM.BaseOffs != 0) {
 718       if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
 719       return false;
 720     }
 721
 722     return true;
 723
 724   case LSRUse::Basic:
 725     // Only handle single-register values.
 726     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
 727
 728   case LSRUse::Special:
 729     // Only handle -1 scales, or no scale.
 730     return AM.Scale == 0 || AM.Scale == -1;
 731   }
 732
 733   return false;
 734 }
 735
 736 static bool isAlwaysFoldable(const SCEV *S,
 737                              bool HasBaseReg,
 738                              LSRUse::KindType Kind, const Type *AccessTy,
 739                              const TargetLowering *TLI,
 740                              ScalarEvolution &SE) {
 741   // Fast-path: zero is always foldable.
 742   if (S->isZero()) return true;
 743
 744   // Conservatively, create an address with an immediate and a
 745   // base and a scale.
 746   TargetLowering::AddrMode AM;
 747   AM.BaseOffs = ExtractImmediate(S, SE);
 748   AM.BaseGV = ExtractSymbol(S, SE);
 749   AM.HasBaseReg = HasBaseReg;
 750   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
 751
 752   // If there's anything else involved, it's not foldable.
 753   if (!S->isZero()) return false;
 754
 755   return isLegalUse(AM, Kind, AccessTy, TLI);
 756 }
 757
 758 /// InsertFormula - If the given formula has not yet been inserted, add it
 759 /// to the list, and return true. Return false otherwise.
 760 bool LSRUse::InsertFormula(const Formula &F) {
 761   Formula Copy = F;
 762
 763   // Sort the base regs, to avoid adding the same solution twice with
 764   // the base regs in different orders. This uses host pointer values, but
 765   // it doesn't matter since it's only used for uniquifying.
 766   std::sort(Copy.BaseRegs.begin(), Copy.BaseRegs.end());
 767
 768   DEBUG(for (SmallVectorImpl<const SCEV *>::const_iterator I =
 769              F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
 770           assert(!(*I)->isZero() && "Zero allocated in a base register!");
 771         assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
 772                "Zero allocated in a scaled register!"));
 773
 774   if (FormulaeUniquifier.insert(Copy)) {
 775     Formulae.push_back(F);
 776     return true;
 777   }
 778
 779   return false;
 780 }
 781
 782 void
 783 LSRUse::InsertInitialFormula(const SCEV *S, Loop *L,
 784                              ScalarEvolution &SE, DominatorTree &DT) {
 785   Formula F;
 786   F.InitialMatch(S, L, SE, DT);
 787   bool Inserted = InsertFormula(F);
 788   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
 789 }
 790
 791 void
 792 LSRUse::InsertSupplementalFormula(const SCEV *S) {
 793   Formula F;
 794   F.BaseRegs.push_back(S);
 795   F.AM.HasBaseReg = true;
 796   bool Inserted = InsertFormula(F);
 797   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
 798 }
 799
 800 /// getImmediateDominator - A handy utility for the specific DominatorTree
 801 /// query that we need here.
 802 ///
 803 static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
 804   DomTreeNode *Node = DT.getNode(BB);
 805   if (!Node) return 0;
 806   Node = Node->getIDom();
 807   if (!Node) return 0;
 808   return Node->getBlock();
 809 }
 810
 811 Value *LSRUse::Expand(BasicBlock::iterator IP,
 812                       Loop *L, SCEVExpander &Rewriter,
 813                       SmallVectorImpl<WeakVH> &DeadInsts,
 814                       ScalarEvolution &SE, DominatorTree &DT) const {
 815   // Then, collect some instructions which we will remain dominated by when
 816   // expanding the replacement. These must be dominated by any operands that
 817   // will be required in the expansion.
 818   SmallVector<Instruction *, 4> Inputs;
 819   if (Instruction *I = dyn_cast<Instruction>(OperandValToReplace))
 820     Inputs.push_back(I);
 821   if (Kind == ICmpZero)
 822     if (Instruction *I =
 823           dyn_cast<Instruction>(cast<ICmpInst>(UserInst)->getOperand(1)))
 824       Inputs.push_back(I);
 825   if (PostIncLoop && !L->contains(UserInst))
 826     Inputs.push_back(L->getLoopLatch()->getTerminator());
 827
 828   // Then, climb up the immediate dominator tree as far as we can go while
 829   // still being dominated by the input positions.
 830   for (;;) {
 831     bool AllDominate = true;
 832     Instruction *BetterPos = 0;
 833     BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
 834     if (!IDom) break;
 835     Instruction *Tentative = IDom->getTerminator();
 836     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
 837          E = Inputs.end(); I != E; ++I) {
 838       Instruction *Inst = *I;
 839       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
 840         AllDominate = false;
 841         break;
 842       }
 843       if (IDom == Inst->getParent() &&
 844           (!BetterPos || DT.dominates(BetterPos, Inst)))
 845         BetterPos = next(BasicBlock::iterator(Inst));
 846     }
 847     if (!AllDominate)
 848       break;
 849     if (BetterPos)
 850       IP = BetterPos;
 851     else
 852       IP = Tentative;
 853   }
 854   while (isa<PHINode>(IP)) ++IP;
 855
 856   // The first formula in the list is the winner.
 857   const Formula &F = Formulae.front();
 858
 859   // Inform the Rewriter if we have a post-increment use, so that it can
 860   // perform an advantageous expansion.
 861   Rewriter.setPostInc(PostIncLoop);
 862
 863   // This is the type that the user actually needs.
 864   const Type *OpTy = OperandValToReplace->getType();
 865   // This will be the type that we'll initially expand to.
 866   const Type *Ty = F.getType();
 867   if (!Ty)
 868     // No type known; just expand directly to the ultimate type.
 869     Ty = OpTy;
 870   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
 871     // Expand directly to the ultimate type if it's the right size.
 872     Ty = OpTy;
 873   // This is the type to do integer arithmetic in.
 874   const Type *IntTy = SE.getEffectiveSCEVType(Ty);
 875
 876   // Build up a list of operands to add together to form the full base.
 877   SmallVector<const SCEV *, 8> Ops;
 878
 879   // Expand the BaseRegs portion.
 880   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 881        E = F.BaseRegs.end(); I != E; ++I) {
 882     const SCEV *Reg = *I;
 883     assert(!Reg->isZero() && "Zero allocated in a base register!");
 884
 885     // If we're expanding for a post-inc user for the add-rec's loop, make the
 886     // post-inc adjustment.
 887     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg))
 888       if (AR->getLoop() == PostIncLoop)
 889         Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
 890
 891     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
 892   }
 893
 894   // Expand the ScaledReg portion.
 895   Value *ICmpScaledV = 0;
 896   if (F.AM.Scale != 0) {
 897     const SCEV *ScaledS = F.ScaledReg;
 898
 899     // If we're expanding for a post-inc user for the add-rec's loop, make the
 900     // post-inc adjustment.
 901     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS))
 902       if (AR->getLoop() == PostIncLoop)
 903         ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE));
 904
 905     if (Kind == ICmpZero) {
 906       // An interesting way of "folding" with an icmp is to use a negated
 907       // scale, which we'll implement by inserting it into the other operand
 908       // of the icmp.
 909       assert(F.AM.Scale == -1 &&
 910              "The only scale supported by ICmpZero uses is -1!");
 911       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
 912     } else {
 913       // Otherwise just expand the scaled register and an explicit scale,
 914       // which is expected to be matched as part of the address.
 915       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
 916       const Type *ScaledTy = SE.getEffectiveSCEVType(ScaledS->getType());
 917       ScaledS = SE.getMulExpr(ScaledS,
 918                               SE.getSCEV(ConstantInt::get(ScaledTy,
 919                                                           F.AM.Scale)));
 920       Ops.push_back(ScaledS);
 921     }
 922   }
 923
 924   // Expand the immediate portions.
 925   if (F.AM.BaseGV)
 926     Ops.push_back(SE.getSCEV(F.AM.BaseGV));
 927   if (F.AM.BaseOffs != 0) {
 928     if (Kind == ICmpZero) {
 929       // The other interesting way of "folding" with an ICmpZero is to use a
 930       // negated immediate.
 931       if (!ICmpScaledV)
 932         ICmpScaledV = ConstantInt::get(IntTy, -F.AM.BaseOffs);
 933       else {
 934         Ops.push_back(SE.getUnknown(ICmpScaledV));
 935         ICmpScaledV = ConstantInt::get(IntTy, F.AM.BaseOffs);
 936       }
 937     } else {
 938       // Just add the immediate values. These again are expected to be matched
 939       // as part of the address.
 940       Ops.push_back(SE.getSCEV(ConstantInt::get(IntTy, F.AM.BaseOffs)));
 941     }
 942   }
 943
 944   // Emit instructions summing all the operands.
 945   const SCEV *FullS = Ops.empty() ?
 946                       SE.getIntegerSCEV(0, IntTy) :
 947                       SE.getAddExpr(Ops);
 948   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
 949
 950   // We're done expanding now, so reset the rewriter.
 951   Rewriter.setPostInc(0);
 952
 953   // An ICmpZero Formula represents an ICmp which we're handling as a
 954   // comparison against zero. Now that we've expanded an expression for that
 955   // form, update the ICmp's other operand.
 956   if (Kind == ICmpZero) {
 957     ICmpInst *CI = cast<ICmpInst>(UserInst);
 958     DeadInsts.push_back(CI->getOperand(1));
 959     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
 960                            "a scale at the same time!");
 961     if (F.AM.Scale == -1) {
 962       if (ICmpScaledV->getType() != OpTy) {
 963         Instruction *Cast =
 964           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
 965                                                    OpTy, false),
 966                            ICmpScaledV, OpTy, "tmp", CI);
 967         ICmpScaledV = Cast;
 968       }
 969       CI->setOperand(1, ICmpScaledV);
 970     } else {
 971       assert(F.AM.Scale == 0 &&
 972              "ICmp does not support folding a global value and "
 973              "a scale at the same time!");
 974       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
 975                                            -(uint64_t)F.AM.BaseOffs);
 976       if (C->getType() != OpTy)
 977         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
 978                                                           OpTy, false),
 979                                   C, OpTy);
 980
 981       CI->setOperand(1, C);
 982     }
 983   }
 984
 985   return FullV;
 986 }
 987
 988 /// Rewrite - Emit instructions for the leading candidate expression for this
 989 /// LSRUse (this is called "expanding"), and update the UserInst to reference
 990 /// the newly expanded value.
 991 void LSRUse::Rewrite(Loop *L, SCEVExpander &Rewriter,
 992                      SmallVectorImpl<WeakVH> &DeadInsts,
 993                      ScalarEvolution &SE, DominatorTree &DT,
 994                      Pass *P) const {
 995   const Type *OpTy = OperandValToReplace->getType();
 996
 997   // First, find an insertion point that dominates UserInst. For PHI nodes,
 998   // find the nearest block which dominates all the relevant uses.
 999   if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1000     DenseMap<BasicBlock *, Value *> Inserted;
1001     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1002       if (PN->getIncomingValue(i) == OperandValToReplace) {
1003         BasicBlock *BB = PN->getIncomingBlock(i);
1004
1005         // If this is a critical edge, split the edge so that we do not insert
1006         // the code on all predecessor/successor paths.  We do this unless this
1007         // is the canonical backedge for this loop, which complicates post-inc
1008         // users.
1009         if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
1010             !isa<IndirectBrInst>(BB->getTerminator()) &&
1011             (PN->getParent() != L->getHeader() || !L->contains(BB))) {
1012           // Split the critical edge.
1013           BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
1014
1015           // If PN is outside of the loop and BB is in the loop, we want to
1016           // move the block to be immediately before the PHI block, not
1017           // immediately after BB.
1018           if (L->contains(BB) && !L->contains(PN))
1019             NewBB->moveBefore(PN->getParent());
1020
1021           // Splitting the edge can reduce the number of PHI entries we have.
1022           e = PN->getNumIncomingValues();
1023           BB = NewBB;
1024           i = PN->getBasicBlockIndex(BB);
1025         }
1026
1027         std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
1028           Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
1029         if (!Pair.second)
1030           PN->setIncomingValue(i, Pair.first->second);
1031         else {
1032           Value *FullV = Expand(BB->getTerminator(),
1033                                 L, Rewriter, DeadInsts, SE, DT);
1034
1035           // If this is reuse-by-noop-cast, insert the noop cast.
1036           if (FullV->getType() != OpTy)
1037             FullV =
1038               CastInst::Create(CastInst::getCastOpcode(FullV, false,
1039                                                        OpTy, false),
1040                                FullV, OperandValToReplace->getType(),
1041                                "tmp", BB->getTerminator());
1042
1043           PN->setIncomingValue(i, FullV);
1044           Pair.first->second = FullV;
1045         }
1046       }
1047   } else {
1048     Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT);
1049
1050     // If this is reuse-by-noop-cast, insert the noop cast.
1051     if (FullV->getType() != OpTy) {
1052       Instruction *Cast =
1053         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
1054                          FullV, OpTy, "tmp", UserInst);
1055       FullV = Cast;
1056     }
1057
1058     // Update the user.
1059     UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
1060   }
1061
1062   DeadInsts.push_back(OperandValToReplace);
1063 }
1064
1065 void LSRUse::print(raw_ostream &OS) const {
1066   OS << "LSR Use: Kind=";
1067   switch (Kind) {
1068   case Basic:    OS << "Basic"; break;
1069   case Special:  OS << "Special"; break;
1070   case ICmpZero: OS << "ICmpZero"; break;
1071   case Address:
1072     OS << "Address of ";
1073     if (isa<PointerType>(AccessTy))
1074       OS << "pointer"; // the full pointer type could be really verbose
1075     else
1076       OS << *AccessTy;
1077   }
1078
1079   OS << ", UserInst=";
1080   // Store is common and interesting enough to be worth special-casing.
1081   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1082     OS << "store ";
1083     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
1084   } else if (UserInst->getType()->isVoidTy())
1085     OS << UserInst->getOpcodeName();
1086   else
1087     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
1088
1089   OS << ", OperandValToReplace=";
1090   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
1091
1092   if (PostIncLoop) {
1093     OS << ", PostIncLoop=";
1094     WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
1095   }
1096 }
1097
1098 void LSRUse::dump() const {
1099   print(errs()); errs() << '\n';
1100 }
1101
1102 namespace {
1103
1104 /// Score - This class is used to measure and compare candidate formulae.
1105 class Score {
1106   unsigned NumRegs;
1107   unsigned NumPhis;
1108   unsigned NumIVMuls;
1109   unsigned NumBaseAdds;
1110   unsigned NumImms;
1111
1112 public:
1113   Score()
1114     : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}
1115
1116   void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1117                    ScalarEvolution &SE);
1118
1119   void Rate(const SCEV *Reg, const SmallBitVector &Bits,
1120             const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1121             ScalarEvolution &SE);
1122
1123   bool operator<(const Score &Other) const;
1124
1125   void print_details(raw_ostream &OS, const SCEV *Reg,
1126                      const SmallPtrSet<const SCEV *, 8> &Regs) const;
1127
1128   void print(raw_ostream &OS) const;
1129   void dump() const;
1130
1131 private:
1132   void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
1133                     const Loop *L);
1134   void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
1135                    const Loop *L);
1136
1137   void Loose();
1138 };
1139
1140 }
1141
1142 /// RateRegister - Tally up interesting quantities from the given register.
1143 void Score::RateRegister(const SCEV *Reg,
1144                          SmallPtrSet<const SCEV *, 8> &Regs,
1145                          const Loop *L) {
1146   if (Regs.insert(Reg))
1147     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1148       NumPhis += AR->getLoop() == L;
1149
1150       // Add the step value register, if it needs one.
1151       if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
1152         RateRegister(AR->getOperand(1), Regs, L);
1153     }
1154 }
1155
1156 void Score::RateFormula(const Formula &F,
1157                         SmallPtrSet<const SCEV *, 8> &Regs,
1158                         const Loop *L) {
1159   // Tally up the registers.
1160   if (F.ScaledReg)
1161     RateRegister(F.ScaledReg, Regs, L);
1162   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1163        E = F.BaseRegs.end(); I != E; ++I) {
1164     const SCEV *BaseReg = *I;
1165     RateRegister(BaseReg, Regs, L);
1166
1167     NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
1168                  BaseReg->hasComputableLoopEvolution(L);
1169   }
1170
1171   if (F.BaseRegs.size() > 1)
1172     NumBaseAdds += F.BaseRegs.size() - 1;
1173
1174   // Tally up the non-zero immediates.
1175   if (F.AM.BaseGV || F.AM.BaseOffs != 0)
1176     ++NumImms;
1177 }
1178
1179 /// Loose - Set this score to a loosing value.
1180 void Score::Loose() {
1181   NumRegs = ~0u;
1182   NumPhis = ~0u;
1183   NumIVMuls = ~0u;
1184   NumBaseAdds = ~0u;
1185   NumImms = ~0u;
1186 }
1187
1188 /// RateInitial - Compute a score for the initial "fully reduced" solution.
1189 void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1190                         ScalarEvolution &SE) {
1191   SmallPtrSet<const SCEV *, 8> Regs;
1192   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
1193        E = Uses.end(); I != E; ++I)
1194     RateFormula(I->Formulae.front(), Regs, L);
1195   NumRegs += Regs.size();
1196
1197   DEBUG(print_details(dbgs(), 0, Regs));
1198 }
1199
1200 /// Rate - Compute a score for the solution where the reuse associated with
1201 /// putting Reg in a register is selected.
1202 void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
1203                  const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1204                  ScalarEvolution &SE) {
1205   SmallPtrSet<const SCEV *, 8> Regs;
1206   for (size_t i = 0, e = Uses.size(); i != e; ++i) {
1207     const LSRUse &LU = Uses[i];
1208
1209     const Formula *BestFormula = 0;
1210     if (i >= Bits.size() || !Bits.test(i))
1211       // This use doesn't use the current register. Just go with the current
1212       // leading candidate formula.
1213       BestFormula = &LU.Formulae.front();
1214     else
1215       // Find the best formula for this use that uses the current register.
1216       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
1217            E = LU.Formulae.end(); I != E; ++I) {
1218         const Formula &F = *I;
1219         if (F.referencesReg(Reg) &&
1220             (!BestFormula || ComplexitySorter()(F, *BestFormula)))
1221           BestFormula = &F;
1222       }
1223
1224     // If we didn't find *any* forumlae, because earlier we eliminated some
1225     // in greedy fashion, skip the current register's reuse opportunity.
1226     if (!BestFormula) {
1227       DEBUG(dbgs() << "Reuse with reg " << *Reg
1228                    << " wouldn't help any users.\n");
1229       Loose();
1230       return;
1231     }
1232
1233     // For an in-loop post-inc user, don't allow multiple base registers,
1234     // because that would require an awkward in-loop add after the increment.
1235     if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
1236         BestFormula->BaseRegs.size() > 1) {
1237       DEBUG(dbgs() << "Reuse with reg " << *Reg
1238                    << " would require an in-loop post-inc add: ";
1239             BestFormula->dump());
1240       Loose();
1241       return;
1242     }
1243
1244     RateFormula(*BestFormula, Regs, L);
1245   }
1246   NumRegs += Regs.size();
1247
1248   DEBUG(print_details(dbgs(), Reg, Regs));
1249 }
1250
1251 /// operator< - Choose the better score.
1252 bool Score::operator<(const Score &Other) const {
1253   if (NumRegs != Other.NumRegs)
1254     return NumRegs < Other.NumRegs;
1255   if (NumPhis != Other.NumPhis)
1256     return NumPhis < Other.NumPhis;
1257   if (NumIVMuls != Other.NumIVMuls)
1258     return NumIVMuls < Other.NumIVMuls;
1259   if (NumBaseAdds != Other.NumBaseAdds)
1260     return NumBaseAdds < Other.NumBaseAdds;
1261   return NumImms < Other.NumImms;
1262 }
1263
1264 void Score::print_details(raw_ostream &OS,
1265                           const SCEV *Reg,
1266                           const SmallPtrSet<const SCEV *, 8> &Regs) const {
1267   if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
1268   else     OS << "The initial solution would require ";
1269   print(OS);
1270   OS << ". Regs:";
1271   for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
1272        E = Regs.end(); I != E; ++I)
1273     OS << ' ' << **I;
1274   OS << '\n';
1275 }
1276
1277 void Score::print(raw_ostream &OS) const {
1278   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
1279   if (NumPhis != 0)
1280     OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s");
1281   if (NumIVMuls != 0)
1282     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
1283   if (NumBaseAdds != 0)
1284     OS << ", plus " << NumBaseAdds << " base add"
1285        << (NumBaseAdds == 1 ? "" : "s");
1286   if (NumImms != 0)
1287     OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s");
1288 }
1289
1290 void Score::dump() const {
1291   print(errs()); errs() << '\n';
1292 }
1293
1294 /// isAddressUse - Returns true if the specified instruction is using the
1295 /// specified value as an address.
1296 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
1297   bool isAddress = isa<LoadInst>(Inst);
1298   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1299     if (SI->getOperand(1) == OperandVal)
1300       isAddress = true;
1301   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1302     // Addressing modes can also be folded into prefetches and a variety
1303     // of intrinsics.
1304     switch (II->getIntrinsicID()) {
1305       default: break;
1306       case Intrinsic::prefetch:
1307       case Intrinsic::x86_sse2_loadu_dq:
1308       case Intrinsic::x86_sse2_loadu_pd:
1309       case Intrinsic::x86_sse_loadu_ps:
1310       case Intrinsic::x86_sse_storeu_ps:
1311       case Intrinsic::x86_sse2_storeu_pd:
1312       case Intrinsic::x86_sse2_storeu_dq:
1313       case Intrinsic::x86_sse2_storel_dq:
1314         if (II->getOperand(1) == OperandVal)
1315           isAddress = true;
1316         break;
1317     }
1318   }
1319   return isAddress;
1320 }
1321
1322 /// getAccessType - Return the type of the memory being accessed.
1323 static const Type *getAccessType(const Instruction *Inst) {
1324   const Type *AccessTy = Inst->getType();
1325   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
1326     AccessTy = SI->getOperand(0)->getType();
1327   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1328     // Addressing modes can also be folded into prefetches and a variety
1329     // of intrinsics.
1330     switch (II->getIntrinsicID()) {
1331     default: break;
1332     case Intrinsic::x86_sse_storeu_ps:
1333     case Intrinsic::x86_sse2_storeu_pd:
1334     case Intrinsic::x86_sse2_storeu_dq:
1335     case Intrinsic::x86_sse2_storel_dq:
1336       AccessTy = II->getOperand(1)->getType();
1337       break;
1338     }
1339   }
1340   return AccessTy;
1341 }
1342
1343 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
1344 /// specified set are trivially dead, delete them and see if this makes any of
1345 /// their operands subsequently dead.
1346 static bool
1347 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
1348   bool Changed = false;
1349
1350   while (!DeadInsts.empty()) {
1351     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
1352
1353     if (I == 0 || !isInstructionTriviallyDead(I))
1354       continue;
1355
1356     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
1357       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
1358         *OI = 0;
1359         if (U->use_empty())
1360           DeadInsts.push_back(U);
1361       }
1362
1363     I->eraseFromParent();
1364     Changed = true;
1365   }
1366
1367   return Changed;
1368 }
1369
1370 namespace {
1371
1372 /// LSRInstance - This class holds state for the main loop strength
1373 /// reduction logic.
1374 class LSRInstance {
1375   IVUsers &IU;
1376   ScalarEvolution &SE;
1377   DominatorTree &DT;
1378   const TargetLowering *const TLI;
1379   Loop *const L;
1380   bool Changed;
1381
1382   /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
1383   /// arbitrary index value to each register as a sort tie breaker.
1384   unsigned CurrentArbitraryRegIndex;
1385
1386   /// Factors - Interesting factors between use strides.
1387   SmallSetVector<int64_t, 4> Factors;
1388
1389   /// Types - Interesting use types, to facilitate truncation reuse.
1390   SmallSetVector<const Type *, 4> Types;
1391
1392   /// Uses - The list of interesting uses.
1393   SmallVector<LSRUse, 16> Uses;
1394
1395   // TODO: Reorganize these data structures.
1396   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
1397   RegUsesTy RegUses;
1398   SmallVector<const SCEV *, 16> RegSequence;
1399
1400   void OptimizeShadowIV();
1401   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
1402                          const SCEV* &CondStride);
1403   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1404   bool StrideMightBeShared(const SCEV* Stride);
1405   bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos);
1406
1407   LSRUse &getNewUse() {
1408     Uses.push_back(LSRUse());
1409     return Uses.back();
1410   }
1411
1412   void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
1413   void CountRegisters(const Formula &F, size_t LUIdx);
1414
1415   void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1416                                    Formula Base);
1417   void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1418                                    Formula Base);
1419   void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
1420                                            unsigned LUIdx,
1421                                            const Formula &Base, unsigned i,
1422                                            const SmallVectorImpl<const SCEV *>
1423                                              &AddOps);
1424   void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
1425                                   Formula Base);
1426   void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
1427                                 Formula Base);
1428   void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
1429                            Formula Base);
1430   void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
1431                              Formula Base);
1432
1433   void GenerateConstantOffsetReuse();
1434
1435   void GenerateAllReuseFormulae();
1436
1437   void GenerateLoopInvariantRegisterUses();
1438
1439 public:
1440   LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1441
1442   bool getChanged() const { return Changed; }
1443
1444   void print(raw_ostream &OS) const;
1445   void dump() const;
1446 };
1447
1448 }
1449
1450 /// OptimizeShadowIV - If IV is used in a int-to-float cast
1451 /// inside the loop then try to eliminate the cast opeation.
1452 void LSRInstance::OptimizeShadowIV() {
1453   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1454   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1455     return;
1456
1457   for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
1458        StrideIdx != e; ++StrideIdx) {
1459     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1460       IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1461     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
1462     if (!isa<SCEVConstant>(SI->first))
1463       continue;
1464
1465     for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1466            E = SI->second->Users.end(); UI != E; /* empty */) {
1467       ilist<IVStrideUse>::iterator CandidateUI = UI;
1468       ++UI;
1469       Instruction *ShadowUse = CandidateUI->getUser();
1470       const Type *DestTy = NULL;
1471
1472       /* If shadow use is a int->float cast then insert a second IV
1473          to eliminate this cast.
1474
1475            for (unsigned i = 0; i < n; ++i)
1476              foo((double)i);
1477
1478          is transformed into
1479
1480            double d = 0.0;
1481            for (unsigned i = 0; i < n; ++i, ++d)
1482              foo(d);
1483       */
1484       if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
1485         DestTy = UCast->getDestTy();
1486       else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
1487         DestTy = SCast->getDestTy();
1488       if (!DestTy) continue;
1489
1490       if (TLI) {
1491         // If target does not support DestTy natively then do not apply
1492         // this transformation.
1493         EVT DVT = TLI->getValueType(DestTy);
1494         if (!TLI->isTypeLegal(DVT)) continue;
1495       }
1496
1497       PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1498       if (!PH) continue;
1499       if (PH->getNumIncomingValues() != 2) continue;
1500
1501       const Type *SrcTy = PH->getType();
1502       int Mantissa = DestTy->getFPMantissaWidth();
1503       if (Mantissa == -1) continue;
1504       if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1505         continue;
1506
1507       unsigned Entry, Latch;
1508       if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1509         Entry = 0;
1510         Latch = 1;
1511       } else {
1512         Entry = 1;
1513         Latch = 0;
1514       }
1515
1516       ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1517       if (!Init) continue;
1518       Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
1519
1520       BinaryOperator *Incr =
1521         dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1522       if (!Incr) continue;
1523       if (Incr->getOpcode() != Instruction::Add
1524           && Incr->getOpcode() != Instruction::Sub)
1525         continue;
1526
1527       /* Initialize new IV, double d = 0.0 in above example. */
1528       ConstantInt *C = NULL;
1529       if (Incr->getOperand(0) == PH)
1530         C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1531       else if (Incr->getOperand(1) == PH)
1532         C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1533       else
1534         continue;
1535
1536       if (!C) continue;
1537
1538       // Ignore negative constants, as the code below doesn't handle them
1539       // correctly. TODO: Remove this restriction.
1540       if (!C->getValue().isStrictlyPositive()) continue;
1541
1542       /* Add new PHINode. */
1543       PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
1544
1545       /* create new increment. '++d' in above example. */
1546       Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1547       BinaryOperator *NewIncr =
1548         BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1549                                  Instruction::FAdd : Instruction::FSub,
1550                                NewPH, CFP, "IV.S.next.", Incr);
1551
1552       NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1553       NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1554
1555       /* Remove cast operation */
1556       ShadowUse->replaceAllUsesWith(NewPH);
1557       ShadowUse->eraseFromParent();
1558       break;
1559     }
1560   }
1561 }
1562
1563 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1564 /// set the IV user and stride information and return true, otherwise return
1565 /// false.
1566 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
1567                                     IVStrideUse *&CondUse,
1568                                     const SCEV* &CondStride) {
1569   for (unsigned StrideIdx = 0, e = IU.StrideOrder.size();
1570        StrideIdx != e && !CondUse; ++StrideIdx) {
1571     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1572       IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1573     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
1574
1575     for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1576          E = SI->second->Users.end(); UI != E; ++UI)
1577       if (UI->getUser() == Cond) {
1578         // NOTE: we could handle setcc instructions with multiple uses here, but
1579         // InstCombine does it as well for simple uses, it's not clear that it
1580         // occurs enough in real life to handle.
1581         CondUse = UI;
1582         CondStride = SI->first;
1583         return true;
1584       }
1585   }
1586   return false;
1587 }
1588
1589 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1590 /// a max computation.
1591 ///
1592 /// This is a narrow solution to a specific, but acute, problem. For loops
1593 /// like this:
1594 ///
1595 ///   i = 0;
1596 ///   do {
1597 ///     p[i] = 0.0;
1598 ///   } while (++i < n);
1599 ///
1600 /// the trip count isn't just 'n', because 'n' might not be positive. And
1601 /// unfortunately this can come up even for loops where the user didn't use
1602 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1603 /// will commonly be lowered like this:
1604 //
1605 ///   if (n > 0) {
1606 ///     i = 0;
1607 ///     do {
1608 ///       p[i] = 0.0;
1609 ///     } while (++i < n);
1610 ///   }
1611 ///
1612 /// and then it's possible for subsequent optimization to obscure the if
1613 /// test in such a way that indvars can't find it.
1614 ///
1615 /// When indvars can't find the if test in loops like this, it creates a
1616 /// max expression, which allows it to give the loop a canonical
1617 /// induction variable:
1618 ///
1619 ///   i = 0;
1620 ///   max = n < 1 ? 1 : n;
1621 ///   do {
1622 ///     p[i] = 0.0;
1623 ///   } while (++i != max);
1624 ///
1625 /// Canonical induction variables are necessary because the loop passes
1626 /// are designed around them. The most obvious example of this is the
1627 /// LoopInfo analysis, which doesn't remember trip count values. It
1628 /// expects to be able to rediscover the trip count each time it is
1629 /// needed, and it does this using a simple analysis that only succeeds if
1630 /// the loop has a canonical induction variable.
1631 ///
1632 /// However, when it comes time to generate code, the maximum operation
1633 /// can be quite costly, especially if it's inside of an outer loop.
1634 ///
1635 /// This function solves this problem by detecting this type of loop and
1636 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1637 /// the instructions for the maximum computation.
1638 ///
1639 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1640   // Check that the loop matches the pattern we're looking for.
1641   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1642       Cond->getPredicate() != CmpInst::ICMP_NE)
1643     return Cond;
1644
1645   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1646   if (!Sel || !Sel->hasOneUse()) return Cond;
1647
1648   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1649   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1650     return Cond;
1651   const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());
1652
1653   // Add one to the backedge-taken count to get the trip count.
1654   const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
1655
1656   // Check for a max calculation that matches the pattern.
1657   if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
1658     return Cond;
1659   const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
1660   if (Max != SE.getSCEV(Sel)) return Cond;
1661
1662   // To handle a max with more than two operands, this optimization would
1663   // require additional checking and setup.
1664   if (Max->getNumOperands() != 2)
1665     return Cond;
1666
1667   const SCEV *MaxLHS = Max->getOperand(0);
1668   const SCEV *MaxRHS = Max->getOperand(1);
1669   if (!MaxLHS || MaxLHS != One) return Cond;
1670   // Check the relevant induction variable for conformance to
1671   // the pattern.
1672   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1673   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1674   if (!AR || !AR->isAffine() ||
1675       AR->getStart() != One ||
1676       AR->getStepRecurrence(SE) != One)
1677     return Cond;
1678
1679   assert(AR->getLoop() == L &&
1680          "Loop condition operand is an addrec in a different loop!");
1681
1682   // Check the right operand of the select, and remember it, as it will
1683   // be used in the new comparison instruction.
1684   Value *NewRHS = 0;
1685   if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1686     NewRHS = Sel->getOperand(1);
1687   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1688     NewRHS = Sel->getOperand(2);
1689   if (!NewRHS) return Cond;
1690
1691   // Determine the new comparison opcode. It may be signed or unsigned,
1692   // and the original comparison may be either equality or inequality.
1693   CmpInst::Predicate Pred =
1694     isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
1695   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1696     Pred = CmpInst::getInversePredicate(Pred);
1697
1698   // Ok, everything looks ok to change the condition into an SLT or SGE and
1699   // delete the max calculation.
1700   ICmpInst *NewCond =
1701     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1702
1703   // Delete the max calculation instructions.
1704   Cond->replaceAllUsesWith(NewCond);
1705   CondUse->setUser(NewCond);
1706   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1707   Cond->eraseFromParent();
1708   Sel->eraseFromParent();
1709   if (Cmp->use_empty())
1710     Cmp->eraseFromParent();
1711   return NewCond;
1712 }
1713
1714 bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
1715   int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
1716   for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
1717     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1718       IU.IVUsesByStride.find(IU.StrideOrder[i]);
1719     const SCEV *Share = SI->first;
1720     if (!isa<SCEVConstant>(SI->first) || Share == Stride)
1721       continue;
1722     int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
1723     if (SSInt == SInt)
1724       return true; // This can definitely be reused.
1725     if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
1726       continue;
1727     int64_t Scale = SSInt / SInt;
1728
1729     // This AM will be used for conservative queries. At this point in the
1730     // process we don't know which users will have a base reg, immediate,
1731     // etc., so we conservatively assume that it may not, making more
1732     // strides valid, thus erring on the side of assuming that there
1733     // might be sharing.
1734     TargetLowering::AddrMode AM;
1735     AM.Scale = Scale;
1736
1737     // Any pre-inc iv use?
1738     IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
1739     for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1740            E = StrideUses.Users.end(); I != E; ++I) {
1741       bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
1742       if (!I->isUseOfPostIncrementedValue() &&
1743           isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
1744                      isAddress ? getAccessType(I->getUser()) : 0,
1745                      TLI))
1746         return true;
1747     }
1748   }
1749   return false;
1750 }
1751
1752 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1753 /// postinc iv when possible.
1754 bool
1755 LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) {
1756   SmallPtrSet<Instruction *, 4> PostIncs;
1757
1758   BasicBlock *LatchBlock = L->getLoopLatch();
1759   SmallVector<BasicBlock*, 8> ExitingBlocks;
1760   L->getExitingBlocks(ExitingBlocks);
1761
1762   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1763     BasicBlock *ExitingBlock = ExitingBlocks[i];
1764
1765     // Get the terminating condition for the loop if possible.  If we
1766     // can, we want to change it to use a post-incremented version of its
1767     // induction variable, to allow coalescing the live ranges for the IV into
1768     // one register value.
1769
1770     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1771     if (!TermBr)
1772       continue;
1773     // FIXME: Overly conservative, termination condition could be an 'or' etc..
1774     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1775       continue;
1776
1777     // Search IVUsesByStride to find Cond's IVUse if there is one.
1778     IVStrideUse *CondUse = 0;
1779     const SCEV *CondStride = 0;
1780     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1781     if (!FindIVUserForCond(Cond, CondUse, CondStride))
1782       continue;
1783
1784     // If the trip count is computed in terms of a max (due to ScalarEvolution
1785     // being unable to find a sufficient guard, for example), change the loop
1786     // comparison to use SLT or ULT instead of NE.
1787     // One consequence of doing this now is that it disrupts the count-down
1788     // optimization. That's not always a bad thing though, because in such
1789     // cases it may still be worthwhile to avoid a max.
1790     Cond = OptimizeMax(Cond, CondUse);
1791
1792     // If this exiting block is the latch block, and the condition has only
1793     // one use inside the loop (the branch), use the post-incremented value
1794     // of the induction variable
1795     if (ExitingBlock != LatchBlock) {
1796       // If this exiting block dominates the latch block, it may also use
1797       // the post-inc value if it won't be shared with other uses.
1798       // Check for dominance.
1799       if (!DT.dominates(ExitingBlock, LatchBlock))
1800         continue;
1801       // Check for sharing within the same stride.
1802       bool SameStrideSharing = false;
1803       IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
1804       for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1805              E = StrideUses.Users.end(); I != E; ++I) {
1806         if (I->getUser() == Cond)
1807           continue;
1808         if (!I->isUseOfPostIncrementedValue()) {
1809           SameStrideSharing = true;
1810           break;
1811         }
1812       }
1813       if (SameStrideSharing)
1814         continue;
1815       // Check for sharing from a different stride.
1816       if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
1817         continue;
1818     }
1819     if (!Cond->hasOneUse()) {
1820       bool HasOneUseInLoop = true;
1821       for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
1822            UI != UE; ++UI) {
1823         Instruction *U = cast<Instruction>(*UI);
1824         if (U == TermBr)
1825           continue;
1826         if (L->contains(U)) {
1827           HasOneUseInLoop = false;
1828           break;
1829         }
1830       }
1831       if (!HasOneUseInLoop)
1832         continue;
1833     }
1834
1835     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1836                  << *Cond << '\n');
1837
1838     // It's possible for the setcc instruction to be anywhere in the loop, and
1839     // possible for it to have multiple users.  If it is not immediately before
1840     // the exiting block branch, move it.
1841     if (&*++BasicBlock::iterator(Cond) != TermBr) {
1842       if (Cond->hasOneUse()) {   // Condition has a single use, just move it.
1843         Cond->moveBefore(TermBr);
1844       } else {
1845         // Otherwise, clone the terminating condition and insert into the
1846         // loopend.
1847         ICmpInst *OldCond = Cond;
1848         Cond = cast<ICmpInst>(Cond->clone());
1849         Cond->setName(L->getHeader()->getName() + ".termcond");
1850         ExitingBlock->getInstList().insert(TermBr, Cond);
1851
1852         // Clone the IVUse, as the old use still exists!
1853         IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
1854                                              CondUse->getOperandValToReplace());
1855         CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
1856         TermBr->replaceUsesOfWith(OldCond, Cond);
1857       }
1858     }
1859
1860     // If we get to here, we know that we can transform the setcc instruction to
1861     // use the post-incremented version of the IV, allowing us to coalesce the
1862     // live ranges for the IV correctly.
1863     CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
1864     CondUse->setIsUseOfPostIncrementedValue(true);
1865     Changed = true;
1866
1867     PostIncs.insert(Cond);
1868   }
1869
1870   // Determine an insertion point for the loop induction variable increment. It
1871   // must dominate all the post-inc comparisons we just set up, and it must
1872   // dominate the loop latch edge.
1873   IVIncInsertPos = L->getLoopLatch()->getTerminator();
1874   for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
1875        E = PostIncs.end(); I != E; ++I) {
1876     BasicBlock *BB =
1877       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1878                                     (*I)->getParent());
1879     if (BB == (*I)->getParent())
1880       IVIncInsertPos = *I;
1881     else if (BB != IVIncInsertPos->getParent())
1882       IVIncInsertPos = BB->getTerminator();
1883   }
1884
1885   return Changed;
1886 }
1887
1888 /// CountRegisters - Note the given register.
1889 void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
1890                                 size_t LUIdx) {
1891   std::pair<RegUsesTy::iterator, bool> Pair =
1892     RegUses.insert(std::make_pair(Reg, RegSortData()));
1893   RegSortData &BV = Pair.first->second;
1894   if (Pair.second) {
1895     BV.Index = CurrentArbitraryRegIndex++;
1896     BV.MaxComplexity = Complexity;
1897     RegSequence.push_back(Reg);
1898   } else {
1899     BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
1900   }
1901   BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
1902   BV.Bits.set(LUIdx);
1903 }
1904
1905 /// CountRegisters - Note which registers are used by the given formula,
1906 /// updating RegUses.
1907 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
1908   uint32_t Complexity = F.getComplexity();
1909   if (F.ScaledReg)
1910     CountRegister(F.ScaledReg, Complexity, LUIdx);
1911   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1912        E = F.BaseRegs.end(); I != E; ++I)
1913     CountRegister(*I, Complexity, LUIdx);
1914 }
1915
1916 /// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
1917 /// offsets.
1918 void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1919                                               Formula Base) {
1920   // We can't add a symbolic offset if the address already contains one.
1921   if (Base.AM.BaseGV) return;
1922
1923   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
1924     const SCEV *G = Base.BaseRegs[i];
1925     GlobalValue *GV = ExtractSymbol(G, SE);
1926     if (G->isZero())
1927       continue;
1928     Formula F = Base;
1929     F.AM.BaseGV = GV;
1930     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
1931       continue;
1932     F.BaseRegs[i] = G;
1933     if (LU.InsertFormula(F))
1934       CountRegisters(LU.Formulae.back(), LUIdx);
1935   }
1936 }
1937
1938 /// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
1939 /// the comparison. For example, x == y -> x*c == y*c.
1940 void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1941                                               Formula Base) {
1942   if (LU.Kind != LSRUse::ICmpZero) return;
1943
1944   // Determine the integer type for the base formula.
1945   const Type *IntTy = Base.getType();
1946   if (!IntTy) return;
1947   if (SE.getTypeSizeInBits(IntTy) > 64) return;
1948   IntTy = SE.getEffectiveSCEVType(IntTy);
1949
1950   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
1951
1952   // Check each interesting stride.
1953   for (SmallSetVector<int64_t, 4>::const_iterator
1954        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
1955     int64_t Factor = *I;
1956     Formula F = Base;
1957
1958     // Check that the multiplication doesn't overflow.
1959     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
1960     if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)
1961       continue;
1962
1963     // Check that this scale is legal.
1964     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
1965       continue;
1966
1967     const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
1968
1969     // Check that multiplying with each base register doesn't overflow.
1970     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
1971       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
1972       if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
1973         goto next;
1974     }
1975
1976     // Check that multiplying with the scaled register doesn't overflow.
1977     if (F.ScaledReg) {
1978       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
1979       if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
1980         continue;
1981     }
1982
1983     // If we make it here and it's legal, add it.
1984     if (LU.InsertFormula(F))
1985       CountRegisters(LU.Formulae.back(), LUIdx);
1986   next:;
1987   }
1988 }
1989
1990 /// GenerateFormulaeFromReplacedBaseReg - If removing base register with
1991 /// index i from the BaseRegs list and adding the registers in AddOps
1992 /// to the address forms an interesting formula, pursue it.
1993 void
1994 LSRInstance::GenerateFormulaeFromReplacedBaseReg(
1995                                              LSRUse &LU,
1996                                              unsigned LUIdx,
1997                                              const Formula &Base, unsigned i,
1998                                              const SmallVectorImpl<const SCEV *>
1999                                                &AddOps) {
2000   if (AddOps.empty()) return;
2001
2002   Formula F = Base;
2003   std::swap(F.BaseRegs[i], F.BaseRegs.back());
2004   F.BaseRegs.pop_back();
2005   for (SmallVectorImpl<const SCEV *>::const_iterator I = AddOps.begin(),
2006        E = AddOps.end(); I != E; ++I)
2007     F.BaseRegs.push_back(*I);
2008   F.AM.HasBaseReg = !F.BaseRegs.empty();
2009   if (LU.InsertFormula(F)) {
2010     CountRegisters(LU.Formulae.back(), LUIdx);
2011     // Recurse.
2012     GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back());
2013   }
2014 }
2015
2016 /// GenerateReassociationReuse - Split out subexpressions from adds and
2017 /// the bases of addrecs.
2018 void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
2019                                              Formula Base) {
2020   SmallVector<const SCEV *, 8> AddOps;
2021   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2022     const SCEV *BaseReg = Base.BaseRegs[i];
2023     if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BaseReg)) {
2024       for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2025            J != JE; ++J) {
2026         // Don't pull a constant into a register if the constant could be
2027         // folded into an immediate field.
2028         if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue;
2029         SmallVector<const SCEV *, 8> InnerAddOps;
2030         for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2031           if (K != J)
2032             InnerAddOps.push_back(*K);
2033         // Splitting a 2-operand add both ways is redundant. Pruning this
2034         // now saves compile time.
2035         if (InnerAddOps.size() < 2 && next(J) == JE)
2036           continue;
2037         AddOps.push_back(*J);
2038         const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2039         AddOps.push_back(InnerAdd);
2040         GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2041         AddOps.clear();
2042       }
2043     } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BaseReg)) {
2044       if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(AR->getStart())) {
2045         for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2046              J != JE; ++J) {
2047           // Don't pull a constant into a register if the constant could be
2048           // folded into an immediate field.
2049           if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE))
2050             continue;
2051           SmallVector<const SCEV *, 8> InnerAddOps;
2052           for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2053             if (K != J)
2054               InnerAddOps.push_back(*K);
2055           AddOps.push_back(*J);
2056           const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2057           AddOps.push_back(SE.getAddRecExpr(InnerAdd,
2058                                             AR->getStepRecurrence(SE),
2059                                             AR->getLoop()));
2060           GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2061           AddOps.clear();
2062         }
2063       } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1,
2064                                    LU.Kind, LU.AccessTy,
2065                                    TLI, SE)) {
2066         AddOps.push_back(AR->getStart());
2067         AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0,
2068                                                             BaseReg->getType()),
2069                                           AR->getStepRecurrence(SE),
2070                                           AR->getLoop()));
2071         GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2072         AddOps.clear();
2073       }
2074     }
2075   }
2076 }
2077
2078 /// GenerateCombinationReuse - Generate a formula consisting of all of the
2079 /// loop-dominating registers added into a single register.
2080 void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
2081                                            Formula Base) {
2082   if (Base.BaseRegs.size() <= 1) return;
2083
2084   Formula F = Base;
2085   F.BaseRegs.clear();
2086   SmallVector<const SCEV *, 4> Ops;
2087   for (SmallVectorImpl<const SCEV *>::const_iterator
2088        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2089     const SCEV *BaseReg = *I;
2090     if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
2091         !BaseReg->hasComputableLoopEvolution(L))
2092       Ops.push_back(BaseReg);
2093     else
2094       F.BaseRegs.push_back(BaseReg);
2095   }
2096   if (Ops.size() > 1) {
2097     F.BaseRegs.push_back(SE.getAddExpr(Ops));
2098     if (LU.InsertFormula(F))
2099       CountRegisters(LU.Formulae.back(), LUIdx);
2100   }
2101 }
2102
2103 /// GenerateScaledReuse - Generate stride factor reuse formulae by making
2104 /// use of scaled-offset address modes, for example.
2105 void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
2106                                       Formula Base) {
2107   // Determine the integer type for the base formula.
2108   const Type *IntTy = Base.getType();
2109   if (!IntTy) return;
2110   IntTy = SE.getEffectiveSCEVType(IntTy);
2111
2112   // Check each interesting stride.
2113   for (SmallSetVector<int64_t, 4>::const_iterator
2114        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2115     int64_t Factor = *I;
2116
2117     // If this Formula already has a scaled register, we can't add another one.
2118     if (Base.AM.Scale != 0)
2119       continue;
2120     Formula F = Base;
2121     F.AM.Scale = Factor;
2122     // Check whether this scale is going to be legal.
2123     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) {
2124       // As a special-case, handle special out-of-loop Basic users specially.
2125       // TODO: Reconsider this special case.
2126       if (LU.Kind == LSRUse::Basic &&
2127           isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) &&
2128           !L->contains(LU.UserInst))
2129         LU.Kind = LSRUse::Special;
2130       else
2131         continue;
2132     }
2133     // For each addrec base reg, apply the scale, if possible.
2134     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2135       if (const SCEVAddRecExpr *AR =
2136             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2137         const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
2138         // Divide out the factor, ignoring high bits, since we'll be
2139         // scaling the value back up in the end.
2140         if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
2141           // TODO: This could be optimized to avoid all the copying.
2142           Formula NewF = F;
2143           NewF.ScaledReg = Quotient;
2144           std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back());
2145           NewF.BaseRegs.pop_back();
2146           NewF.AM.HasBaseReg = !NewF.BaseRegs.empty();
2147           if (LU.InsertFormula(NewF))
2148             CountRegisters(LU.Formulae.back(), LUIdx);
2149         }
2150       }
2151   }
2152 }
2153
2154 /// GenerateTruncateReuse - Generate reuse formulae from different IV types.
2155 void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
2156                                         Formula Base) {
2157   // This requires TargetLowering to tell us which truncates are free.
2158   if (!TLI) return;
2159
2160   // Don't attempt to truncate symbolic values.
2161   if (Base.AM.BaseGV) return;
2162
2163   // Determine the integer type for the base formula.
2164   const Type *DstTy = Base.getType();
2165   if (!DstTy) return;
2166   DstTy = SE.getEffectiveSCEVType(DstTy);
2167
2168   for (SmallSetVector<const Type *, 4>::const_iterator
2169        I = Types.begin(), E = Types.end(); I != E; ++I) {
2170     const Type *SrcTy = *I;
2171     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2172       Formula F = Base;
2173       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2174       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2175            JE = F.BaseRegs.end(); J != JE; ++J)
2176         *J = SE.getAnyExtendExpr(*J, SrcTy);
2177       if (LU.InsertFormula(F))
2178         CountRegisters(LU.Formulae.back(), LUIdx);
2179     }
2180   }
2181 }
2182
2183 namespace {
2184
2185 /// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
2186 /// defer modifications so that the search phase doesn't have to worry about
2187 /// the data structures moving underneath it.
2188 struct WorkItem {
2189   LSRUse *LU;
2190   size_t LUIdx;
2191   int64_t Imm;
2192   const SCEV *OrigReg;
2193
2194   WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R)
2195     : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {}
2196
2197   void print(raw_ostream &OS) const;
2198   void dump() const;
2199 };
2200
2201 void WorkItem::print(raw_ostream &OS) const {
2202   OS << "in use ";
2203   LU->print(OS);
2204   OS << " (at index " << LUIdx << "), add offset " << Imm
2205      << " and compensate by adjusting refences to " << *OrigReg << "\n";
2206 }
2207
2208 void WorkItem::dump() const {
2209   print(errs()); errs() << '\n';
2210 }
2211
2212 }
2213
2214 /// GenerateConstantOffsetReuse - Look for registers which are a constant
2215 /// distance apart and try to form reuse opportunities between them.
2216 void LSRInstance::GenerateConstantOffsetReuse() {
2217   // Group the registers by their value without any added constant offset.
2218   typedef std::map<int64_t, const SCEV *> ImmMapTy;
2219   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2220   RegMapTy Map;
2221   SmallVector<const SCEV *, 8> Sequence;
2222   for (SmallVectorImpl<const SCEV *>::iterator I = RegSequence.begin(),
2223        E = RegSequence.end(); I != E; ++I) {
2224     const SCEV *Reg = *I;
2225     int64_t Imm = ExtractImmediate(Reg, SE);
2226     std::pair<RegMapTy::iterator, bool> Pair =
2227       Map.insert(std::make_pair(Reg, ImmMapTy()));
2228     if (Pair.second)
2229       Sequence.push_back(Reg);
2230     Pair.first->second.insert(std::make_pair(Imm, *I));
2231   }
2232
2233   // Insert an artificial expression at offset 0 (if there isn't one already),
2234   // as this may lead to more reuse opportunities.
2235   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2236        E = Sequence.end(); I != E; ++I)
2237     Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0));
2238
2239   // Now examine each set of registers with the same base value. Build up
2240   // a list of work to do and do the work in a separate step so that we're
2241   // not adding formulae and register counts while we're searching.
2242   SmallVector<WorkItem, 32> WorkItems;
2243   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2244        E = Sequence.end(); I != E; ++I) {
2245     const SCEV *Reg = *I;
2246     const ImmMapTy &Imms = Map.find(Reg)->second;
2247     // Examine each offset.
2248     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2249          J != JE; ++J) {
2250       const SCEV *OrigReg = J->second;
2251       // Skip the artifical register at offset 0.
2252       if (!OrigReg) continue;
2253
2254       int64_t JImm = J->first;
2255       const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits;
2256
2257       // Examine each other offset associated with the same register. This is
2258       // quadradic in the number of registers with the same base, but it's
2259       // uncommon for this to be a large number.
2260       for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) {
2261         if (M == J) continue;
2262
2263         // Compute the difference between the two.
2264         int64_t Imm = (uint64_t)JImm - M->first;
2265         for (int LUIdx = Bits.find_first(); LUIdx != -1;
2266              LUIdx = Bits.find_next(LUIdx))
2267           // Make a memo of this use, offset, and register tuple.
2268           WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg));
2269       }
2270     }
2271   }
2272
2273   // Now iterate through the worklist and add new formulae.
2274   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2275        E = WorkItems.end(); I != E; ++I) {
2276     const WorkItem &WI = *I;
2277     LSRUse &LU = *WI.LU;
2278     size_t LUIdx = WI.LUIdx;
2279     int64_t Imm = WI.Imm;
2280     const SCEV *OrigReg = WI.OrigReg;
2281
2282     const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2283     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy,
2284                                                       -(uint64_t)Imm));
2285
2286     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2287       Formula F = LU.Formulae[L];
2288       // Use the immediate in the scaled register.
2289       if (F.ScaledReg == OrigReg) {
2290         int64_t Offs = (uint64_t)F.AM.BaseOffs +
2291                        Imm * (uint64_t)F.AM.Scale;
2292         // Don't create 50 + reg(-50).
2293         if (F.referencesReg(SE.getSCEV(
2294                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
2295           continue;
2296         Formula NewF = F;
2297         NewF.AM.BaseOffs = Offs;
2298         if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2299           continue;
2300         const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2301         if (Diff->isZero()) continue;
2302         NewF.ScaledReg = Diff;
2303         if (LU.InsertFormula(NewF))
2304           CountRegisters(LU.Formulae.back(), LUIdx);
2305       }
2306       // Use the immediate in a base register.
2307       for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2308         const SCEV *BaseReg = F.BaseRegs[N];
2309         if (BaseReg != OrigReg)
2310           continue;
2311         Formula NewF = F;
2312         NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2313         if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2314           continue;
2315         const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg);
2316         if (Diff->isZero()) continue;
2317         // Don't create 50 + reg(-50).
2318         if (Diff ==
2319             SE.getSCEV(ConstantInt::get(IntTy,
2320                                         -(uint64_t)NewF.AM.BaseOffs)))
2321           continue;
2322         NewF.BaseRegs[N] = Diff;
2323         if (LU.InsertFormula(NewF))
2324           CountRegisters(LU.Formulae.back(), LUIdx);
2325       }
2326     }
2327   }
2328 }
2329
2330 /// GenerateAllReuseFormulae - Generate formulae for each use.
2331 void
2332 LSRInstance::GenerateAllReuseFormulae() {
2333   SmallVector<Formula, 12> Save;
2334   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2335     LSRUse &LU = Uses[LUIdx];
2336
2337     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2338       GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]);
2339     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2340       GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]);
2341     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2342       GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]);
2343     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2344       GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]);
2345     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2346       GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]);
2347     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2348       GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]);
2349   }
2350
2351   GenerateConstantOffsetReuse();
2352 }
2353
2354 /// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant
2355 /// values which we're tracking. These other uses will pin these values in
2356 /// registers, making them less profitable for elimination.
2357 /// TODO: This currently misses non-constant addrec step registers.
2358 /// TODO: Should this give more weight to users inside the loop?
2359 void
2360 LSRInstance::GenerateLoopInvariantRegisterUses() {
2361   for (size_t i = 0, e = RegSequence.size(); i != e; ++i)
2362     if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(RegSequence[i])) {
2363       const Value *V = U->getValue();
2364       if (const Instruction *Inst = dyn_cast<Instruction>(V))
2365         if (L->contains(Inst)) continue;
2366       for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
2367            UI != UE; ++UI) {
2368         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2369         // Ignore non-instructions.
2370         if (!UserInst)
2371           continue;
2372         // Ignore instructions in other functions (as can happen with
2373         // Constants).
2374         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2375           continue;
2376         // Ignore instructions not dominated by the loop.
2377         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2378           UserInst->getParent() :
2379           cast<PHINode>(UserInst)->getIncomingBlock(
2380             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2381         if (!DT.dominates(L->getHeader(), UseBB))
2382           continue;
2383         // Ignore uses which are part of other SCEV expressions, to avoid
2384         // analyzing them multiple times.
2385         if (SE.isSCEVable(UserInst->getType()) &&
2386             !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
2387           continue;
2388         // Ignore icmp instructions which are already being analyzed.
2389         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2390           unsigned OtherIdx = !UI.getOperandNo();
2391           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2392           if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
2393             continue;
2394         }
2395
2396         LSRUse &LU = getNewUse();
2397         LU.UserInst = const_cast<Instruction *>(UserInst);
2398         LU.OperandValToReplace = UI.getUse();
2399         LU.InsertSupplementalFormula(U);
2400         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2401       }
2402     }
2403 }
2404
2405 #ifndef NDEBUG
2406
2407 static void debug_winner(SmallVector<LSRUse, 16> const &Uses) {
2408   dbgs() << "LSR has selected formulae for each use:\n";
2409   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2410        E = Uses.end(); I != E; ++I) {
2411     const LSRUse &LU = *I;
2412     dbgs() << "  ";
2413     LU.print(dbgs());
2414     dbgs() << '\n';
2415     dbgs() << "    ";
2416     LU.Formulae.front().print(dbgs());
2417     dbgs() << "\n";
2418   }
2419 }
2420
2421 #endif
2422
2423 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
2424   : IU(P->getAnalysis<IVUsers>()),
2425     SE(P->getAnalysis<ScalarEvolution>()),
2426     DT(P->getAnalysis<DominatorTree>()),
2427     TLI(tli), L(l), Changed(false), CurrentArbitraryRegIndex(0) {
2428
2429   // If LoopSimplify form is not available, stay out of trouble.
2430   if (!L->isLoopSimplifyForm()) return;
2431
2432   // If there's no interesting work to be done, bail early.
2433   if (IU.IVUsesByStride.empty()) return;
2434
2435   DEBUG(dbgs() << "\nLSR on loop ";
2436         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
2437         dbgs() << ":\n");
2438
2439   // Sort the StrideOrder so we process larger strides first.
2440   std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(),
2441                    StrideCompare(SE));
2442
2443   /// OptimizeShadowIV - If IV is used in a int-to-float cast
2444   /// inside the loop then try to eliminate the cast opeation.
2445   OptimizeShadowIV();
2446
2447   // Change loop terminating condition to use the postinc iv when possible.
2448   Instruction *IVIncInsertPos;
2449   Changed |= OptimizeLoopTermCond(IVIncInsertPos);
2450
2451   for (SmallVectorImpl<const SCEV *>::const_iterator SIter =
2452        IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end();
2453        SIter != SEnd; ++SIter) {
2454     const SCEV *Stride = *SIter;
2455
2456     // Collect interesting types.
2457     Types.insert(SE.getEffectiveSCEVType(Stride->getType()));
2458
2459     // Collect interesting factors.
2460     for (SmallVectorImpl<const SCEV *>::const_iterator NewStrideIter =
2461          SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) {
2462       const SCEV *OldStride = Stride;
2463       const SCEV *NewStride = *NewStrideIter;
2464
2465       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2466           SE.getTypeSizeInBits(NewStride->getType())) {
2467         if (SE.getTypeSizeInBits(OldStride->getType()) >
2468             SE.getTypeSizeInBits(NewStride->getType()))
2469           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2470         else
2471           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2472       }
2473       if (const SCEVConstant *Factor =
2474             dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
2475                                                    SE, true)))
2476         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2477           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2478     }
2479
2480     std::map<const SCEV *, IVUsersOfOneStride *>::const_iterator SI =
2481       IU.IVUsesByStride.find(Stride);
2482     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
2483     for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
2484          E = SI->second->Users.end(); UI != E; ++UI) {
2485       // Record the uses.
2486       LSRUse &LU = getNewUse();
2487       LU.UserInst = UI->getUser();
2488       LU.OperandValToReplace = UI->getOperandValToReplace();
2489       if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) {
2490         LU.Kind = LSRUse::Address;
2491         LU.AccessTy = getAccessType(LU.UserInst);
2492       }
2493       if (UI->isUseOfPostIncrementedValue())
2494         LU.PostIncLoop = L;
2495
2496       const SCEV *S = IU.getCanonicalExpr(*UI);
2497
2498       // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2499       // (N - i == 0), and this allows (N - i) to be the expression that we
2500       // work with rather than just N or i, so we can consider the register
2501       // requirements for both N and i at the same time. Limiting this code
2502       // to equality icmps is not a problem because all interesting loops
2503       // use equality icmps, thanks to IndVarSimplify.
2504       if (ICmpInst *CI = dyn_cast<ICmpInst>(LU.UserInst))
2505         if (CI->isEquality()) {
2506           // Swap the operands if needed to put the OperandValToReplace on
2507           // the left, for consistency.
2508           Value *NV = CI->getOperand(1);
2509           if (NV == LU.OperandValToReplace) {
2510             CI->setOperand(1, CI->getOperand(0));
2511             CI->setOperand(0, NV);
2512           }
2513
2514           // x == y  -->  x - y == 0
2515           const SCEV *N = SE.getSCEV(NV);
2516           if (N->isLoopInvariant(L)) {
2517             LU.Kind = LSRUse::ICmpZero;
2518             S = SE.getMinusSCEV(N, S);
2519           }
2520
2521           // -1 and the negations of all interesting strides (except the
2522           // negation of -1) are now also interesting.
2523           for (size_t i = 0, e = Factors.size(); i != e; ++i)
2524             if (Factors[i] != -1)
2525               Factors.insert(-(uint64_t)Factors[i]);
2526           Factors.insert(-1);
2527         }
2528
2529       // Ok, now enumerate all the different formulae we can find to compute
2530       // the value for this expression.
2531       LU.InsertInitialFormula(S, L, SE, DT);
2532       CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2533     }
2534   }
2535
2536   // If all uses use the same type, don't bother looking for truncation-based
2537   // reuse.
2538   if (Types.size() == 1)
2539     Types.clear();
2540
2541   // Now use the reuse data to generate a bunch of interesting ways
2542   // to formulate the values needed for the uses.
2543   GenerateAllReuseFormulae();
2544
2545   // If there are any uses of registers that we're tracking that have escaped
2546   // IVUsers' attention, add trivial uses for them, so that the register
2547   // voting process takes the into consideration.
2548   GenerateLoopInvariantRegisterUses();
2549
2550   // Sort the formulae. TODO: This is redundantly sorted below.
2551   for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(), E = Uses.end();
2552        I != E; ++I) {
2553     LSRUse &LU = *I;
2554     std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2555                      ComplexitySorter());
2556   }
2557
2558   // Ok, we've now collected all the uses and noted their register uses. The
2559   // next step is to start looking at register reuse possibilities.
2560   DEBUG(print(dbgs()); dbgs() << '\n');
2561
2562   // Start by assuming we'll assign each use its own register. This is
2563   // sometimes called "full" strength reduction, or "superhero" mode.
2564   // Sometimes this is the best solution, but if there are opportunities for
2565   // reuse we may find a better solution.
2566   Score CurScore;
2567   CurScore.RateInitial(Uses, L, SE);
2568
2569   // Create a sorted list of registers with those with the most uses appearing
2570   // earlier in the list. We'll visit them first, as they're the most likely
2571   // to represent profitable reuse opportunities.
2572   SmallVector<RegCount, 8> RegOrder;
2573   for (SmallVectorImpl<const SCEV *>::const_iterator I =
2574        RegSequence.begin(), E = RegSequence.end(); I != E; ++I)
2575     RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second));
2576   std::stable_sort(RegOrder.begin(), RegOrder.end());
2577
2578   // Visit each register. Determine which ones represent profitable reuse
2579   // opportunities and remember them.
2580   // TODO: Extract this code into a function.
2581   for (SmallVectorImpl<RegCount>::const_iterator I = RegOrder.begin(),
2582        E = RegOrder.end(); I != E; ++I) {
2583     const SCEV *Reg = I->Reg;
2584     const SmallBitVector &Bits = I->Sort.Bits;
2585
2586     // Registers with only one use don't represent reuse opportunities, so
2587     // when we get there, we're done.
2588     if (Bits.count() <= 1) break;
2589
2590     DEBUG(dbgs() << "Reg " << *Reg << ": ";
2591           I->Sort.print(dbgs());
2592           dbgs() << '\n');
2593
2594     // Determine the total number of registers will be needed if we make use
2595     // of the reuse opportunity represented by the current register.
2596     Score NewScore;
2597     NewScore.Rate(Reg, Bits, Uses, L, SE);
2598
2599     // Now decide whether this register's reuse opportunity is an overall win.
2600     // Currently the decision is heavily skewed for register pressure.
2601     if (!(NewScore < CurScore)) {
2602       continue;
2603     }
2604
2605     // Ok, use this opportunity.
2606     DEBUG(dbgs() << "This candidate has been accepted.\n");
2607     CurScore = NewScore;
2608
2609     // Now that we've selected a new reuse opportunity, delete formulae that
2610     // do not participate in that opportunity.
2611     for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) {
2612       LSRUse &LU = Uses[j];
2613       for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) {
2614         Formula &F = LU.Formulae[k];
2615         if (!F.referencesReg(Reg)) {
2616           std::swap(LU.Formulae[k], LU.Formulae.back());
2617           LU.Formulae.pop_back();
2618           --k; --h;
2619         }
2620       }
2621       // Also re-sort the list to put the formulae with the fewest registers
2622       // at the front.
2623       // TODO: Do this earlier, we don't need it each time.
2624       std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2625                        ComplexitySorter());
2626     }
2627   }
2628
2629   // Ok, we've now made all our decisions. The first formula for each use
2630   // will be used.
2631   DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs());
2632         dbgs() << ".\n";
2633         debug_winner(Uses));
2634
2635   // Free memory no longer needed.
2636   RegOrder.clear();
2637   Factors.clear();
2638   Types.clear();
2639   RegUses.clear();
2640   RegSequence.clear();
2641
2642   // Keep track of instructions we may have made dead, so that
2643   // we can remove them after we are done working.
2644   SmallVector<WeakVH, 16> DeadInsts;
2645
2646   SCEVExpander Rewriter(SE);
2647   Rewriter.disableCanonicalMode();
2648   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
2649
2650   // Expand the new value definitions and update the users.
2651   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2652        E = Uses.end(); I != E; ++I) {
2653     // Formulae should be legal.
2654     DEBUG(for (SmallVectorImpl<Formula>::const_iterator J = I->Formulae.begin(),
2655                JE = I->Formulae.end(); J != JE; ++J)
2656             assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) &&
2657                    "Illegal formula generated!"));
2658
2659     // Expand the new code and update the user.
2660     I->Rewrite(L, Rewriter, DeadInsts, SE, DT, P);
2661     Changed = true;
2662   }
2663
2664   // Clean up after ourselves. This must be done before deleting any
2665   // instructions.
2666   Rewriter.clear();
2667
2668   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
2669 }
2670
2671 void LSRInstance::print(raw_ostream &OS) const {
2672   OS << "LSR has identified the following interesting factors and types: ";
2673   bool First = true;
2674
2675   for (SmallSetVector<int64_t, 4>::const_iterator
2676        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2677     if (!First) OS << ", ";
2678     First = false;
2679     OS << '*' << *I;
2680   }
2681
2682   for (SmallSetVector<const Type *, 4>::const_iterator
2683        I = Types.begin(), E = Types.end(); I != E; ++I) {
2684     if (!First) OS << ", ";
2685     First = false;
2686     OS << '(' << **I << ')';
2687   }
2688   OS << '\n';
2689
2690   OS << "LSR is examining the following uses, and candidate formulae:\n";
2691   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2692        E = Uses.end(); I != E; ++I) {
2693     const LSRUse &LU = *I;
2694     dbgs() << "  ";
2695     LU.print(OS);
2696     OS << '\n';
2697     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
2698          JE = LU.Formulae.end(); J != JE; ++J) {
2699       OS << "    ";
2700       J->print(OS);
2701       OS << "\n";
2702     }
2703   }
2704 }
2705
2706 void LSRInstance::dump() const {
2707   print(errs()); errs() << '\n';
2708 }
2709
2710 namespace {
2711
2712 class LoopStrengthReduce : public LoopPass {
2713   /// TLI - Keep a pointer of a TargetLowering to consult for determining
2714   /// transformation profitability.
2715   const TargetLowering *const TLI;
2716
2717 public:
2718   static char ID; // Pass ID, replacement for typeid
2719   explicit LoopStrengthReduce(const TargetLowering *tli = NULL);
2720
2721 private:
2722   bool runOnLoop(Loop *L, LPPassManager &LPM);
2723   void getAnalysisUsage(AnalysisUsage &AU) const;
2724 };
2725
2726 }
2727
2728 char LoopStrengthReduce::ID = 0;
2729 static RegisterPass<LoopStrengthReduce>
2730 X("loop-reduce", "Loop Strength Reduction");
2731
2732 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
2733   return new LoopStrengthReduce(TLI);
2734 }
2735
2736 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
2737   : LoopPass(&ID), TLI(tli) {}
2738
2739 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
2740   // We split critical edges, so we change the CFG.  However, we do update
2741   // many analyses if they are around.
2742   AU.addPreservedID(LoopSimplifyID);
2743   AU.addPreserved<LoopInfo>();
2744   AU.addPreserved("domfrontier");
2745
2746   AU.addRequiredID(LoopSimplifyID);
2747   AU.addRequired<DominatorTree>();
2748   AU.addPreserved<DominatorTree>();
2749   AU.addRequired<ScalarEvolution>();
2750   AU.addPreserved<ScalarEvolution>();
2751   AU.addRequired<IVUsers>();
2752   AU.addPreserved<IVUsers>();
2753 }
2754
2755 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
2756   bool Changed = false;
2757
2758   // Run the main LSR transformation.
2759   Changed |= LSRInstance(TLI, L, this).getChanged();
2760
2761   // At this point, it is worth checking to see if any recurrence PHIs are also
2762   // dead, so that we can remove them as well.
2763   Changed |= DeleteDeadPHIs(L->getHeader());
2764
2765   return Changed;
2766 }