lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
  31 // the value of the register before the add and some using // it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
  41 //       that {0,+,1}<%bb> is getting picked first because all 7 uses can
  42 //       use it, and while it's a pretty good solution, it means that LSR
  43 //       doesn't look further to find an even better solution.
  44 //
  45 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
  46 //       instead of a GlobalValue?
  47 //
  48 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  49 //       smaller encoding (on x86 at least).
  50 //
  51 // TODO: When a negated register is used by an add (such as in a list of
  52 //       multiple base registers, or as the increment expression in an addrec),
  53 //       we may not actually need both reg and (-1 * reg) in registers; the
  54 //       negation can be implemented by using a sub instead of an add. The
  55 //       lack of support for taking this into consideration when making
  56 //       register pressure decisions is partly worked around by the "Special"
  57 //       use kind.
  58 //
  59 //===----------------------------------------------------------------------===//
  60
  61 #define DEBUG_TYPE "loop-reduce"
  62 #include "llvm/Transforms/Scalar.h"
  63 #include "llvm/Constants.h"
  64 #include "llvm/Instructions.h"
  65 #include "llvm/IntrinsicInst.h"
  66 #include "llvm/DerivedTypes.h"
  67 #include "llvm/Analysis/IVUsers.h"
  68 #include "llvm/Analysis/Dominators.h"
  69 #include "llvm/Analysis/LoopPass.h"
  70 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  71 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  72 #include "llvm/Transforms/Utils/Local.h"
  73 #include "llvm/ADT/SmallBitVector.h"
  74 #include "llvm/ADT/SetVector.h"
  75 #include "llvm/Support/Debug.h"
  76 #include "llvm/Support/ValueHandle.h"
  77 #include "llvm/Support/raw_ostream.h"
  78 #include "llvm/Target/TargetLowering.h"
  79 #include <algorithm>
  80 using namespace llvm;
  81
  82 namespace {
  83
  84 // Constant strides come first which in turns are sorted by their absolute
  85 // values. If absolute values are the same, then positive strides comes first.
  86 // e.g.
  87 // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
  88 struct StrideCompare {
  89   const ScalarEvolution &SE;
  90   explicit StrideCompare(const ScalarEvolution &se) : SE(se) {}
  91
  92   bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const {
  93     const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
  94     const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
  95     if (LHSC && RHSC) {
  96       unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()),
  97                                    SE.getTypeSizeInBits(RHS->getType()));
  98       APInt  LV = LHSC->getValue()->getValue();
  99       APInt  RV = RHSC->getValue()->getValue();
 100       LV.sextOrTrunc(BitWidth);
 101       RV.sextOrTrunc(BitWidth);
 102       APInt ALV = LV.abs();
 103       APInt ARV = RV.abs();
 104       if (ALV == ARV) {
 105         if (LV != RV)
 106           return LV.sgt(RV);
 107       } else {
 108         return ALV.ult(ARV);
 109       }
 110
 111       // If it's the same value but different type, sort by bit width so
 112       // that we emit larger induction variables before smaller
 113       // ones, letting the smaller be re-written in terms of larger ones.
 114       return SE.getTypeSizeInBits(RHS->getType()) <
 115              SE.getTypeSizeInBits(LHS->getType());
 116     }
 117     return LHSC && !RHSC;
 118   }
 119 };
 120
 121 /// RegSortData - This class holds data which is used to order reuse
 122 /// candidates.
 123 class RegSortData {
 124 public:
 125   /// Bits - This represents the set of LSRUses (by index) which reference a
 126   /// particular register.
 127   SmallBitVector Bits;
 128
 129   /// MaxComplexity - This represents the greatest complexity (see the comments
 130   /// on Formula::getComplexity) seen with a particular register.
 131   uint32_t MaxComplexity;
 132
 133   /// Index - This holds an arbitrary value used as a last-resort tie breaker
 134   /// to ensure deterministic behavior.
 135   unsigned Index;
 136
 137   RegSortData() {}
 138
 139   void print(raw_ostream &OS) const;
 140   void dump() const;
 141 };
 142
 143 }
 144
 145 void RegSortData::print(raw_ostream &OS) const {
 146   OS << "[NumUses=" << Bits.count()
 147      << ", MaxComplexity=";
 148   OS.write_hex(MaxComplexity);
 149   OS << ", Index=" << Index << ']';
 150 }
 151
 152 void RegSortData::dump() const {
 153   print(errs()); errs() << '\n';
 154 }
 155
 156 namespace {
 157
 158 /// RegCount - This is a helper class to sort a given set of registers
 159 /// according to associated RegSortData values.
 160 class RegCount {
 161 public:
 162   const SCEV *Reg;
 163   RegSortData Sort;
 164
 165   RegCount(const SCEV *R, const RegSortData &RSD)
 166     : Reg(R), Sort(RSD) {}
 167
 168   // Sort by count. Returning true means the register is preferred.
 169   bool operator<(const RegCount &Other) const {
 170     // Sort by the number of unique uses of this register.
 171     unsigned A = Sort.Bits.count();
 172     unsigned B = Other.Sort.Bits.count();
 173     if (A != B) return A > B;
 174
 175     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 176       const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg);
 177       // AddRecs have higher priority than other things.
 178       if (!BR) return true;
 179
 180       // Prefer affine values.
 181       if (AR->isAffine() != BR->isAffine())
 182         return AR->isAffine();
 183
 184       const Loop *AL = AR->getLoop();
 185       const Loop *BL = BR->getLoop();
 186       if (AL != BL) {
 187         unsigned ADepth = AL->getLoopDepth();
 188         unsigned BDepth = BL->getLoopDepth();
 189         // Prefer a less deeply nested addrec.
 190         if (ADepth != BDepth)
 191           return ADepth < BDepth;
 192
 193         // Different loops at the same depth; do something arbitrary.
 194         BasicBlock *AH = AL->getHeader();
 195         BasicBlock *BH = BL->getHeader();
 196         for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I)
 197           if (&*I == BH) return true;
 198         return false;
 199       }
 200
 201       // Sort addrecs by stride.
 202       const SCEV *AStep = AR->getOperand(1);
 203       const SCEV *BStep = BR->getOperand(1);
 204       if (AStep != BStep) {
 205         if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) {
 206           const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep);
 207           if (!BC) return true;
 208           // Arbitrarily prefer wider registers.
 209           if (AC->getValue()->getValue().getBitWidth() !=
 210               BC->getValue()->getValue().getBitWidth())
 211             return AC->getValue()->getValue().getBitWidth() >
 212                    BC->getValue()->getValue().getBitWidth();
 213           // Ignore the sign bit, assuming that striding by a negative value
 214           // is just as easy as by a positive value.
 215           // Prefer the addrec with the lesser absolute value stride, as it
 216           // will allow uses to have simpler addressing modes.
 217           return AC->getValue()->getValue().abs()
 218             .ult(BC->getValue()->getValue().abs());
 219         }
 220       }
 221
 222       // Then sort by the register which will permit the simplest uses.
 223       // This is a heuristic; currently we only track the most complex use as a
 224       // representative.
 225       if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
 226         return Sort.MaxComplexity < Other.Sort.MaxComplexity;
 227
 228       // Then sort them by their start values.
 229       const SCEV *AStart = AR->getStart();
 230       const SCEV *BStart = BR->getStart();
 231       if (AStart != BStart) {
 232         if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) {
 233           const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart);
 234           if (!BC) return true;
 235           // Arbitrarily prefer wider registers.
 236           if (AC->getValue()->getValue().getBitWidth() !=
 237               BC->getValue()->getValue().getBitWidth())
 238             return AC->getValue()->getValue().getBitWidth() >
 239                    BC->getValue()->getValue().getBitWidth();
 240           // Prefer positive over negative if the absolute values are the same.
 241           if (AC->getValue()->getValue().abs() ==
 242               BC->getValue()->getValue().abs())
 243             return AC->getValue()->getValue().isStrictlyPositive();
 244           // Prefer the addrec with the lesser absolute value start.
 245           return AC->getValue()->getValue().abs()
 246             .ult(BC->getValue()->getValue().abs());
 247         }
 248       }
 249     } else {
 250       // AddRecs have higher priority than other things.
 251       if (isa<SCEVAddRecExpr>(Other.Reg)) return false;
 252       // Sort by the register which will permit the simplest uses.
 253       // This is a heuristic; currently we only track the most complex use as a
 254       // representative.
 255       if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
 256         return Sort.MaxComplexity < Other.Sort.MaxComplexity;
 257     }
 258
 259
 260     // Tie-breaker: the arbitrary index, to ensure a reliable ordering.
 261     return Sort.Index < Other.Sort.Index;
 262   }
 263
 264   void print(raw_ostream &OS) const;
 265   void dump() const;
 266 };
 267
 268 }
 269
 270 void RegCount::print(raw_ostream &OS) const {
 271   OS << *Reg << ':';
 272   Sort.print(OS);
 273 }
 274
 275 void RegCount::dump() const {
 276   print(errs()); errs() << '\n';
 277 }
 278
 279 namespace {
 280
 281 /// Formula - This class holds information that describes a formula for
 282 /// satisfying a use. It may include broken-out immediates and scaled registers.
 283 struct Formula {
 284   /// AM - This is used to represent complex addressing, as well as other kinds
 285   /// of interesting uses.
 286   TargetLowering::AddrMode AM;
 287
 288   /// BaseRegs - The list of "base" registers for this use. When this is
 289   /// non-empty, AM.HasBaseReg should be set to true.
 290   SmallVector<const SCEV *, 2> BaseRegs;
 291
 292   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 293   /// when AM.Scale is not zero.
 294   const SCEV *ScaledReg;
 295
 296   Formula() : ScaledReg(0) {}
 297
 298   unsigned getNumRegs() const;
 299   uint32_t getComplexity() const;
 300   const Type *getType() const;
 301
 302   void InitialMatch(const SCEV *S, Loop *L,
 303                     ScalarEvolution &SE, DominatorTree &DT);
 304
 305   /// referencesReg - Test if this formula references the given register.
 306   bool referencesReg(const SCEV *S) const {
 307     return S == ScaledReg ||
 308            std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 309   }
 310
 311   bool operator==(const Formula &Other) const {
 312     return BaseRegs == Other.BaseRegs &&
 313            ScaledReg == Other.ScaledReg &&
 314            AM.Scale == Other.AM.Scale &&
 315            AM.BaseOffs == Other.AM.BaseOffs &&
 316            AM.BaseGV == Other.AM.BaseGV;
 317   }
 318
 319   // This sorts at least partially based on host pointer values which are
 320   // not deterministic, so it is only usable for uniqification.
 321   bool operator<(const Formula &Other) const {
 322     if (BaseRegs != Other.BaseRegs)
 323       return BaseRegs < Other.BaseRegs;
 324     if (ScaledReg != Other.ScaledReg)
 325       return ScaledReg < Other.ScaledReg;
 326     if (AM.Scale != Other.AM.Scale)
 327       return AM.Scale < Other.AM.Scale;
 328     if (AM.BaseOffs != Other.AM.BaseOffs)
 329       return AM.BaseOffs < Other.AM.BaseOffs;
 330     if (AM.BaseGV != Other.AM.BaseGV)
 331       return AM.BaseGV < Other.AM.BaseGV;
 332     return false;
 333   }
 334
 335   void print(raw_ostream &OS) const;
 336   void dump() const;
 337 };
 338
 339 }
 340
 341 /// getNumRegs - Return the total number of register operands used by this
 342 /// formula. This does not include register uses implied by non-constant
 343 /// addrec strides.
 344 unsigned Formula::getNumRegs() const {
 345   return !!ScaledReg + BaseRegs.size();
 346 }
 347
 348 /// getComplexity - Return an oversimplified value indicating the complexity
 349 /// of this formula. This is used as a tie-breaker in choosing register
 350 /// preferences.
 351 uint32_t Formula::getComplexity() const {
 352   // Encode the information in a uint32_t so that comparing with operator<
 353   // will be interesting.
 354   return
 355     // Most significant, the number of registers. This saturates because we
 356     // need the bits, and because beyond a few hundred it doesn't really matter.
 357     (std::min(getNumRegs(), (1u<<15)-1) << 17) |
 358     // Having multiple base regs is worse than having a base reg and a scale.
 359     ((BaseRegs.size() > 1) << 16) |
 360     // Scale absolute value.
 361     ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) |
 362     // Scale sign, which is less significant than the absolute value.
 363     ((AM.Scale < 0) << 8) |
 364     // Offset absolute value.
 365     ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) |
 366     // If a GV is present, treat it like a maximal offset.
 367     ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) |
 368     // Offset sign, which is less significant than the absolute offset.
 369     ((AM.BaseOffs < 0) << 0);
 370 }
 371
 372 /// getType - Return the type of this formula, if it has one, or null
 373 /// otherwise. This type is meaningless except for the bit size.
 374 const Type *Formula::getType() const {
 375   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 376          ScaledReg ? ScaledReg->getType() :
 377          AM.BaseGV ? AM.BaseGV->getType() :
 378          0;
 379 }
 380
 381 namespace {
 382
 383 /// ComplexitySorter - A predicate which orders Formulae by the number of
 384 /// registers they contain.
 385 struct ComplexitySorter {
 386   bool operator()(const Formula &LHS, const Formula &RHS) const {
 387     unsigned L = LHS.getNumRegs();
 388     unsigned R = RHS.getNumRegs();
 389     if (L != R) return L < R;
 390
 391     return LHS.getComplexity() < RHS.getComplexity();
 392   }
 393 };
 394
 395 }
 396
 397 /// DoInitialMatch - Recurrsion helper for InitialMatch.
 398 static void DoInitialMatch(const SCEV *S, Loop *L,
 399                            SmallVectorImpl<const SCEV *> &Good,
 400                            SmallVectorImpl<const SCEV *> &Bad,
 401                            ScalarEvolution &SE, DominatorTree &DT) {
 402   // Collect expressions which properly dominate the loop header.
 403   if (S->properlyDominates(L->getHeader(), &DT)) {
 404     Good.push_back(S);
 405     return;
 406   }
 407
 408   // Look at add operands.
 409   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 410     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 411          I != E; ++I)
 412       DoInitialMatch(*I, L, Good, Bad, SE, DT);
 413     return;
 414   }
 415
 416   // Look at addrec operands.
 417   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 418     if (!AR->getStart()->isZero()) {
 419       DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
 420       DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
 421                                       AR->getStepRecurrence(SE),
 422                                       AR->getLoop()),
 423                      L, Good, Bad, SE, DT);
 424       return;
 425     }
 426   }
 427
 428   // Handle a multiplication by -1 (negation) if it didn't fold.
 429   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 430     if (Mul->getOperand(0)->isAllOnesValue()) {
 431       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 432       const SCEV *NewMul = SE.getMulExpr(Ops);
 433
 434       SmallVector<const SCEV *, 4> MyGood;
 435       SmallVector<const SCEV *, 4> MyBad;
 436       DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
 437       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 438         SE.getEffectiveSCEVType(NewMul->getType())));
 439       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 440            E = MyGood.end(); I != E; ++I)
 441         Good.push_back(SE.getMulExpr(NegOne, *I));
 442       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 443            E = MyBad.end(); I != E; ++I)
 444         Bad.push_back(SE.getMulExpr(NegOne, *I));
 445       return;
 446     }
 447
 448   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 449   // register and hope for the best.
 450   Bad.push_back(S);
 451 }
 452
 453 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 454 /// attempting to keep all loop-invariant and loop-computable values in a
 455 /// single base register.
 456 void Formula::InitialMatch(const SCEV *S, Loop *L,
 457                            ScalarEvolution &SE, DominatorTree &DT) {
 458   SmallVector<const SCEV *, 4> Good;
 459   SmallVector<const SCEV *, 4> Bad;
 460   DoInitialMatch(S, L, Good, Bad, SE, DT);
 461   if (!Good.empty()) {
 462     BaseRegs.push_back(SE.getAddExpr(Good));
 463     AM.HasBaseReg = true;
 464   }
 465   if (!Bad.empty()) {
 466     BaseRegs.push_back(SE.getAddExpr(Bad));
 467     AM.HasBaseReg = true;
 468   }
 469 }
 470
 471 void Formula::print(raw_ostream &OS) const {
 472   bool First = true;
 473   if (AM.BaseGV) {
 474     if (!First) OS << " + "; else First = false;
 475     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
 476   }
 477   if (AM.BaseOffs != 0) {
 478     if (!First) OS << " + "; else First = false;
 479     OS << AM.BaseOffs;
 480   }
 481   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 482        E = BaseRegs.end(); I != E; ++I) {
 483     if (!First) OS << " + "; else First = false;
 484     OS << "reg(";
 485     OS << **I;
 486     OS << ")";
 487   }
 488   if (AM.Scale != 0) {
 489     if (!First) OS << " + "; else First = false;
 490     OS << AM.Scale << "*reg(";
 491     if (ScaledReg)
 492       OS << *ScaledReg;
 493     else
 494       OS << "<unknown>";
 495     OS << ")";
 496   }
 497 }
 498
 499 void Formula::dump() const {
 500   print(errs()); errs() << '\n';
 501 }
 502
 503 /// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
 504 /// or null otherwise. If IgnoreSignificantBits is true, expressions like
 505 /// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may
 506 /// overflow, which is useful when the result will be used in a context where
 507 /// the most significant bits are ignored.
 508 static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
 509                            ScalarEvolution &SE,
 510                            bool IgnoreSignificantBits = false) {
 511   // Handle the trivial case, which works for any SCEV type.
 512   if (LHS == RHS)
 513     return SE.getIntegerSCEV(1, LHS->getType());
 514
 515   // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
 516   // folding.
 517   if (RHS->isAllOnesValue())
 518     return SE.getMulExpr(LHS, RHS);
 519
 520   // Check for a division of a constant by a constant.
 521   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 522     const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 523     if (!RC)
 524       return 0;
 525     if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
 526       return 0;
 527     return SE.getConstant(C->getValue()->getValue()
 528                .sdiv(RC->getValue()->getValue()));
 529   }
 530
 531   // Distribute the sdiv over addrec operands.
 532   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 533     const SCEV *Start = getSDiv(AR->getStart(), RHS, SE,
 534                                 IgnoreSignificantBits);
 535     if (!Start) return 0;
 536     const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE,
 537                                IgnoreSignificantBits);
 538     if (!Step) return 0;
 539     return SE.getAddRecExpr(Start, Step, AR->getLoop());
 540   }
 541
 542   // Distribute the sdiv over add operands.
 543   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 544     SmallVector<const SCEV *, 8> Ops;
 545     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 546          I != E; ++I) {
 547       const SCEV *Op = getSDiv(*I, RHS, SE,
 548                                IgnoreSignificantBits);
 549       if (!Op) return 0;
 550       Ops.push_back(Op);
 551     }
 552     return SE.getAddExpr(Ops);
 553   }
 554
 555   // Check for a multiply operand that we can pull RHS out of.
 556   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
 557     if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) {
 558       SmallVector<const SCEV *, 4> Ops;
 559       bool Found = false;
 560       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 561            I != E; ++I) {
 562         if (!Found)
 563           if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) {
 564             Ops.push_back(Q);
 565             Found = true;
 566             continue;
 567           }
 568         Ops.push_back(*I);
 569       }
 570       return Found ? SE.getMulExpr(Ops) : 0;
 571     }
 572
 573   // Otherwise we don't know.
 574   return 0;
 575 }
 576
 577 namespace {
 578
 579 /// LSRUse - This class holds the state that LSR keeps for each use in
 580 /// IVUsers, as well as uses invented by LSR itself. It includes information
 581 /// about what kinds of things can be folded into the user, information
 582 /// about the user itself, and information about how the use may be satisfied.
 583 /// TODO: Represent multiple users of the same expression in common?
 584 class LSRUse {
 585   SmallSet<Formula, 8> FormulaeUniquifier;
 586
 587 public:
 588   /// KindType - An enum for a kind of use, indicating what types of
 589   /// scaled and immediate operands it might support.
 590   enum KindType {
 591     Basic,   ///< A normal use, with no folding.
 592     Special, ///< A special case of basic, allowing -1 scales.
 593     Address, ///< An address use; folding according to TargetLowering
 594     ICmpZero ///< An equality icmp with both operands folded into one.
 595     // TODO: Add a generic icmp too?
 596   };
 597
 598   KindType Kind;
 599   const Type *AccessTy;
 600   Instruction *UserInst;
 601   Value *OperandValToReplace;
 602
 603   /// PostIncLoop - If this user is to use the post-incremented value of an
 604   /// induction variable, this variable is non-null and holds the loop
 605   /// associated with the induction variable.
 606   const Loop *PostIncLoop;
 607
 608   /// Formulae - A list of ways to build a value that can satisfy this user.
 609   /// After the list is populated, one of these is selected heuristically and
 610   /// used to formulate a replacement for OperandValToReplace in UserInst.
 611   SmallVector<Formula, 12> Formulae;
 612
 613   LSRUse() : Kind(Basic), AccessTy(0),
 614              UserInst(0), OperandValToReplace(0), PostIncLoop(0) {}
 615
 616   void InsertInitialFormula(const SCEV *S, Loop *L,
 617                             ScalarEvolution &SE, DominatorTree &DT);
 618   void InsertSupplementalFormula(const SCEV *S);
 619
 620   bool InsertFormula(const Formula &F);
 621
 622   void Rewrite(Loop *L, SCEVExpander &Rewriter,
 623                SmallVectorImpl<WeakVH> &DeadInsts,
 624                ScalarEvolution &SE, DominatorTree &DT,
 625                Pass *P) const;
 626
 627   void print(raw_ostream &OS) const;
 628   void dump() const;
 629
 630 private:
 631   Value *Expand(BasicBlock::iterator IP,
 632                 Loop *L, SCEVExpander &Rewriter,
 633                 SmallVectorImpl<WeakVH> &DeadInsts,
 634                 ScalarEvolution &SE, DominatorTree &DT) const;
 635 };
 636
 637 }
 638
 639 /// ExtractImmediate - If S involves the addition of a constant integer value,
 640 /// return that integer value, and mutate S to point to a new SCEV with that
 641 /// value excluded.
 642 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 643   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 644     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 645       S = SE.getIntegerSCEV(0, C->getType());
 646       return C->getValue()->getSExtValue();
 647     }
 648   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 649     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 650     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 651     S = SE.getAddExpr(NewOps);
 652     return Result;
 653   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 654     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 655     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 656     S = SE.getAddRecExpr(NewOps, AR->getLoop());
 657     return Result;
 658   }
 659   return 0;
 660 }
 661
 662 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 663 /// return that symbol, and mutate S to point to a new SCEV with that
 664 /// value excluded.
 665 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 666   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 667     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 668       S = SE.getIntegerSCEV(0, GV->getType());
 669       return GV;
 670     }
 671   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 672     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 673     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 674     S = SE.getAddExpr(NewOps);
 675     return Result;
 676   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 677     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 678     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 679     S = SE.getAddRecExpr(NewOps, AR->getLoop());
 680     return Result;
 681   }
 682   return 0;
 683 }
 684
 685 /// isLegalUse - Test whether the use described by AM is "legal", meaning
 686 /// it can be completely folded into the user instruction at isel time.
 687 /// This includes address-mode folding and special icmp tricks.
 688 static bool isLegalUse(const TargetLowering::AddrMode &AM,
 689                        LSRUse::KindType Kind, const Type *AccessTy,
 690                        const TargetLowering *TLI) {
 691   switch (Kind) {
 692   case LSRUse::Address:
 693     // If we have low-level target information, ask the target if it can
 694     // completely fold this address.
 695     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
 696
 697     // Otherwise, just guess that reg+reg addressing is legal.
 698     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
 699
 700   case LSRUse::ICmpZero:
 701     // There's not even a target hook for querying whether it would be legal
 702     // to fold a GV into an ICmp.
 703     if (AM.BaseGV)
 704       return false;
 705
 706     // ICmp only has two operands; don't allow more than two non-trivial parts.
 707     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
 708       return false;
 709
 710     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale
 711     // by putting the scaled register in the other operand of the icmp.
 712     if (AM.Scale != 0 && AM.Scale != -1)
 713       return false;
 714
 715     // If we have low-level target information, ask the target if it can
 716     // fold an integer immediate on an icmp.
 717     if (AM.BaseOffs != 0) {
 718       if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
 719       return false;
 720     }
 721
 722     return true;
 723
 724   case LSRUse::Basic:
 725     // Only handle single-register values.
 726     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
 727
 728   case LSRUse::Special:
 729     // Only handle -1 scales, or no scale.
 730     return AM.Scale == 0 || AM.Scale == -1;
 731   }
 732
 733   return false;
 734 }
 735
 736 static bool isAlwaysFoldable(const SCEV *S,
 737                              bool HasBaseReg,
 738                              LSRUse::KindType Kind, const Type *AccessTy,
 739                              const TargetLowering *TLI,
 740                              ScalarEvolution &SE) {
 741   // Fast-path: zero is always foldable.
 742   if (S->isZero()) return true;
 743
 744   // Conservatively, create an address with an immediate and a
 745   // base and a scale.
 746   TargetLowering::AddrMode AM;
 747   AM.BaseOffs = ExtractImmediate(S, SE);
 748   AM.BaseGV = ExtractSymbol(S, SE);
 749   AM.HasBaseReg = HasBaseReg;
 750   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
 751
 752   // If there's anything else involved, it's not foldable.
 753   if (!S->isZero()) return false;
 754
 755   return isLegalUse(AM, Kind, AccessTy, TLI);
 756 }
 757
 758 /// InsertFormula - If the given formula has not yet been inserted, add it
 759 /// to the list, and return true. Return false otherwise.
 760 bool LSRUse::InsertFormula(const Formula &F) {
 761   Formula Copy = F;
 762
 763   // Sort the base regs, to avoid adding the same solution twice with
 764   // the base regs in different orders. This uses host pointer values, but
 765   // it doesn't matter since it's only used for uniquifying.
 766   std::sort(Copy.BaseRegs.begin(), Copy.BaseRegs.end());
 767
 768   DEBUG(for (SmallVectorImpl<const SCEV *>::const_iterator I =
 769              F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
 770           assert(!(*I)->isZero() && "Zero allocated in a base register!");
 771         assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
 772                "Zero allocated in a scaled register!"));
 773
 774   if (FormulaeUniquifier.insert(Copy)) {
 775     Formulae.push_back(F);
 776     return true;
 777   }
 778
 779   return false;
 780 }
 781
 782 void
 783 LSRUse::InsertInitialFormula(const SCEV *S, Loop *L,
 784                              ScalarEvolution &SE, DominatorTree &DT) {
 785   Formula F;
 786   F.InitialMatch(S, L, SE, DT);
 787   bool Inserted = InsertFormula(F);
 788   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
 789 }
 790
 791 void
 792 LSRUse::InsertSupplementalFormula(const SCEV *S) {
 793   Formula F;
 794   F.BaseRegs.push_back(S);
 795   F.AM.HasBaseReg = true;
 796   bool Inserted = InsertFormula(F);
 797   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
 798 }
 799
 800 /// getImmediateDominator - A handy utility for the specific DominatorTree
 801 /// query that we need here.
 802 ///
 803 static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
 804   DomTreeNode *Node = DT.getNode(BB);
 805   if (!Node) return 0;
 806   Node = Node->getIDom();
 807   if (!Node) return 0;
 808   return Node->getBlock();
 809 }
 810
 811 Value *LSRUse::Expand(BasicBlock::iterator IP,
 812                       Loop *L, SCEVExpander &Rewriter,
 813                       SmallVectorImpl<WeakVH> &DeadInsts,
 814                       ScalarEvolution &SE, DominatorTree &DT) const {
 815   // Then, collect some instructions which we will remain dominated by when
 816   // expanding the replacement. These must be dominated by any operands that
 817   // will be required in the expansion.
 818   SmallVector<Instruction *, 4> Inputs;
 819   if (Instruction *I = dyn_cast<Instruction>(OperandValToReplace))
 820     Inputs.push_back(I);
 821   if (Kind == ICmpZero)
 822     if (Instruction *I =
 823           dyn_cast<Instruction>(cast<ICmpInst>(UserInst)->getOperand(1)))
 824       Inputs.push_back(I);
 825   if (PostIncLoop && !L->contains(UserInst))
 826     Inputs.push_back(L->getLoopLatch()->getTerminator());
 827
 828   // Then, climb up the immediate dominator tree as far as we can go while
 829   // still being dominated by the input positions.
 830   for (;;) {
 831     bool AllDominate = true;
 832     Instruction *BetterPos = 0;
 833     BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
 834     if (!IDom) break;
 835     Instruction *Tentative = IDom->getTerminator();
 836     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
 837          E = Inputs.end(); I != E; ++I) {
 838       Instruction *Inst = *I;
 839       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
 840         AllDominate = false;
 841         break;
 842       }
 843       if (IDom == Inst->getParent() &&
 844           (!BetterPos || DT.dominates(BetterPos, Inst)))
 845         BetterPos = next(BasicBlock::iterator(Inst));
 846     }
 847     if (!AllDominate)
 848       break;
 849     if (BetterPos)
 850       IP = BetterPos;
 851     else
 852       IP = Tentative;
 853   }
 854   while (isa<PHINode>(IP)) ++IP;
 855
 856   // The first formula in the list is the winner.
 857   const Formula &F = Formulae.front();
 858
 859   // Inform the Rewriter if we have a post-increment use, so that it can
 860   // perform an advantageous expansion.
 861   Rewriter.setPostInc(PostIncLoop);
 862
 863   // This is the type that the user actually needs.
 864   const Type *OpTy = OperandValToReplace->getType();
 865   // This will be the type that we'll initially expand to.
 866   const Type *Ty = F.getType();
 867   if (!Ty)
 868     // No type known; just expand directly to the ultimate type.
 869     Ty = OpTy;
 870   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
 871     // Expand directly to the ultimate type if it's the right size.
 872     Ty = OpTy;
 873   // This is the type to do integer arithmetic in.
 874   const Type *IntTy = SE.getEffectiveSCEVType(Ty);
 875
 876   // Build up a list of operands to add together to form the full base.
 877   SmallVector<const SCEV *, 8> Ops;
 878
 879   // Expand the BaseRegs portion.
 880   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 881        E = F.BaseRegs.end(); I != E; ++I) {
 882     const SCEV *Reg = *I;
 883     assert(!Reg->isZero() && "Zero allocated in a base register!");
 884
 885     // If we're expanding for a post-inc user for the add-rec's loop, make the
 886     // post-inc adjustment.
 887     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg))
 888       if (AR->getLoop() == PostIncLoop)
 889         Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
 890
 891     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
 892   }
 893
 894   // Expand the ScaledReg portion.
 895   Value *ICmpScaledV = 0;
 896   if (F.AM.Scale != 0) {
 897     const SCEV *ScaledS = F.ScaledReg;
 898
 899     // If we're expanding for a post-inc user for the add-rec's loop, make the
 900     // post-inc adjustment.
 901     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS))
 902       if (AR->getLoop() == PostIncLoop)
 903         ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE));
 904
 905     if (Kind == ICmpZero) {
 906       // An interesting way of "folding" with an icmp is to use a negated
 907       // scale, which we'll implement by inserting it into the other operand
 908       // of the icmp.
 909       assert(F.AM.Scale == -1 &&
 910              "The only scale supported by ICmpZero uses is -1!");
 911       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
 912     } else {
 913       // Otherwise just expand the scaled register and an explicit scale,
 914       // which is expected to be matched as part of the address.
 915       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
 916       const Type *ScaledTy = SE.getEffectiveSCEVType(ScaledS->getType());
 917       ScaledS = SE.getMulExpr(ScaledS,
 918                               SE.getSCEV(ConstantInt::get(ScaledTy,
 919                                                           F.AM.Scale)));
 920       Ops.push_back(ScaledS);
 921     }
 922   }
 923
 924   // Expand the immediate portions.
 925   if (F.AM.BaseGV)
 926     Ops.push_back(SE.getSCEV(F.AM.BaseGV));
 927   if (F.AM.BaseOffs != 0) {
 928     if (Kind == ICmpZero) {
 929       // The other interesting way of "folding" with an ICmpZero is to use a
 930       // negated immediate.
 931       if (!ICmpScaledV)
 932         ICmpScaledV = ConstantInt::get(IntTy, -F.AM.BaseOffs);
 933       else {
 934         Ops.push_back(SE.getUnknown(ICmpScaledV));
 935         ICmpScaledV = ConstantInt::get(IntTy, F.AM.BaseOffs);
 936       }
 937     } else {
 938       // Just add the immediate values. These again are expected to be matched
 939       // as part of the address.
 940       Ops.push_back(SE.getSCEV(ConstantInt::get(IntTy, F.AM.BaseOffs)));
 941     }
 942   }
 943
 944   // Emit instructions summing all the operands.
 945   const SCEV *FullS = Ops.empty() ?
 946                       SE.getIntegerSCEV(0, IntTy) :
 947                       SE.getAddExpr(Ops);
 948   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
 949
 950   // We're done expanding now, so reset the rewriter.
 951   Rewriter.setPostInc(0);
 952
 953   // An ICmpZero Formula represents an ICmp which we're handling as a
 954   // comparison against zero. Now that we've expanded an expression for that
 955   // form, update the ICmp's other operand.
 956   if (Kind == ICmpZero) {
 957     ICmpInst *CI = cast<ICmpInst>(UserInst);
 958     DeadInsts.push_back(CI->getOperand(1));
 959     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
 960                            "a scale at the same time!");
 961     if (F.AM.Scale == -1) {
 962       if (ICmpScaledV->getType() != OpTy) {
 963         Instruction *Cast =
 964           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
 965                                                    OpTy, false),
 966                            ICmpScaledV, OpTy, "tmp", CI);
 967         ICmpScaledV = Cast;
 968       }
 969       CI->setOperand(1, ICmpScaledV);
 970     } else {
 971       assert(F.AM.Scale == 0 &&
 972              "ICmp does not support folding a global value and "
 973              "a scale at the same time!");
 974       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
 975                                            -(uint64_t)F.AM.BaseOffs);
 976       if (C->getType() != OpTy)
 977         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
 978                                                           OpTy, false),
 979                                   C, OpTy);
 980
 981       CI->setOperand(1, C);
 982     }
 983   }
 984
 985   return FullV;
 986 }
 987
 988 /// Rewrite - Emit instructions for the leading candidate expression for this
 989 /// LSRUse (this is called "expanding"), and update the UserInst to reference
 990 /// the newly expanded value.
 991 void LSRUse::Rewrite(Loop *L, SCEVExpander &Rewriter,
 992                      SmallVectorImpl<WeakVH> &DeadInsts,
 993                      ScalarEvolution &SE, DominatorTree &DT,
 994                      Pass *P) const {
 995   const Type *OpTy = OperandValToReplace->getType();
 996
 997   // First, find an insertion point that dominates UserInst. For PHI nodes,
 998   // find the nearest block which dominates all the relevant uses.
 999   if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1000     DenseMap<BasicBlock *, Value *> Inserted;
1001     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1002       if (PN->getIncomingValue(i) == OperandValToReplace) {
1003         BasicBlock *BB = PN->getIncomingBlock(i);
1004
1005         // If this is a critical edge, split the edge so that we do not insert
1006         // the code on all predecessor/successor paths.  We do this unless this
1007         // is the canonical backedge for this loop, which complicates post-inc
1008         // users.
1009         if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
1010             !isa<IndirectBrInst>(BB->getTerminator()) &&
1011             (PN->getParent() != L->getHeader() || !L->contains(BB))) {
1012           // Split the critical edge.
1013           BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
1014
1015           // If PN is outside of the loop and BB is in the loop, we want to
1016           // move the block to be immediately before the PHI block, not
1017           // immediately after BB.
1018           if (L->contains(BB) && !L->contains(PN))
1019             NewBB->moveBefore(PN->getParent());
1020
1021           // Splitting the edge can reduce the number of PHI entries we have.
1022           e = PN->getNumIncomingValues();
1023           BB = NewBB;
1024           i = PN->getBasicBlockIndex(BB);
1025         }
1026
1027         std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
1028           Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
1029         if (!Pair.second)
1030           PN->setIncomingValue(i, Pair.first->second);
1031         else {
1032           Value *FullV = Expand(BB->getTerminator(),
1033                                 L, Rewriter, DeadInsts, SE, DT);
1034
1035           // If this is reuse-by-noop-cast, insert the noop cast.
1036           if (FullV->getType() != OpTy)
1037             FullV =
1038               CastInst::Create(CastInst::getCastOpcode(FullV, false,
1039                                                        OpTy, false),
1040                                FullV, OperandValToReplace->getType(),
1041                                "tmp", BB->getTerminator());
1042
1043           PN->setIncomingValue(i, FullV);
1044           Pair.first->second = FullV;
1045         }
1046       }
1047   } else {
1048     Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT);
1049
1050     // If this is reuse-by-noop-cast, insert the noop cast.
1051     if (FullV->getType() != OpTy) {
1052       Instruction *Cast =
1053         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
1054                          FullV, OpTy, "tmp", UserInst);
1055       FullV = Cast;
1056     }
1057
1058     // Update the user.
1059     UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
1060   }
1061
1062   DeadInsts.push_back(OperandValToReplace);
1063 }
1064
1065 void LSRUse::print(raw_ostream &OS) const {
1066   OS << "LSR Use: Kind=";
1067   switch (Kind) {
1068   case Basic:    OS << "Basic"; break;
1069   case Special:  OS << "Special"; break;
1070   case ICmpZero: OS << "ICmpZero"; break;
1071   case Address:
1072     OS << "Address of ";
1073     if (isa<PointerType>(AccessTy))
1074       OS << "pointer"; // the full pointer type could be really verbose
1075     else
1076       OS << *AccessTy;
1077   }
1078
1079   OS << ", UserInst=";
1080   // Store is common and interesting enough to be worth special-casing.
1081   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1082     OS << "store ";
1083     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
1084   } else if (UserInst->getType()->isVoidTy())
1085     OS << UserInst->getOpcodeName();
1086   else
1087     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
1088
1089   OS << ", OperandValToReplace=";
1090   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
1091
1092   if (PostIncLoop) {
1093     OS << ", PostIncLoop=";
1094     WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
1095   }
1096 }
1097
1098 void LSRUse::dump() const {
1099   print(errs()); errs() << '\n';
1100 }
1101
1102 namespace {
1103
1104 /// Score - This class is used to measure and compare candidate formulae.
1105 class Score {
1106   unsigned NumRegs;
1107   unsigned NumPhis;
1108   unsigned NumIVMuls;
1109   unsigned NumBaseAdds;
1110   unsigned NumImms;
1111
1112 public:
1113   Score()
1114     : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}
1115
1116   void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1117                    ScalarEvolution &SE);
1118
1119   void Rate(const SCEV *Reg, const SmallBitVector &Bits,
1120             const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1121             ScalarEvolution &SE);
1122
1123   unsigned getNumRegs() const { return NumRegs; }
1124
1125   bool operator<(const Score &Other) const;
1126
1127   void print_details(raw_ostream &OS, const SCEV *Reg,
1128                      const SmallPtrSet<const SCEV *, 8> &Regs) const;
1129
1130   void print(raw_ostream &OS) const;
1131   void dump() const;
1132
1133 private:
1134   void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
1135                     const Loop *L);
1136   void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
1137                    const Loop *L);
1138
1139   void Loose();
1140 };
1141
1142 }
1143
1144 /// RateRegister - Tally up interesting quantities from the given register.
1145 void Score::RateRegister(const SCEV *Reg,
1146                          SmallPtrSet<const SCEV *, 8> &Regs,
1147                          const Loop *L) {
1148   if (Regs.insert(Reg))
1149     if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1150       NumPhis += AR->getLoop() == L;
1151
1152       // Add the step value register, if it needs one.
1153       if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
1154         RateRegister(AR->getOperand(1), Regs, L);
1155     }
1156 }
1157
1158 void Score::RateFormula(const Formula &F,
1159                         SmallPtrSet<const SCEV *, 8> &Regs,
1160                         const Loop *L) {
1161   // Tally up the registers.
1162   if (F.ScaledReg)
1163     RateRegister(F.ScaledReg, Regs, L);
1164   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1165        E = F.BaseRegs.end(); I != E; ++I) {
1166     const SCEV *BaseReg = *I;
1167     RateRegister(BaseReg, Regs, L);
1168
1169     NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
1170                  BaseReg->hasComputableLoopEvolution(L);
1171   }
1172
1173   if (F.BaseRegs.size() > 1)
1174     NumBaseAdds += F.BaseRegs.size() - 1;
1175
1176   // Tally up the non-zero immediates.
1177   if (F.AM.BaseGV || F.AM.BaseOffs != 0)
1178     ++NumImms;
1179 }
1180
1181 /// Loose - Set this score to a loosing value.
1182 void Score::Loose() {
1183   NumRegs = ~0u;
1184   NumPhis = ~0u;
1185   NumIVMuls = ~0u;
1186   NumBaseAdds = ~0u;
1187   NumImms = ~0u;
1188 }
1189
1190 /// RateInitial - Compute a score for the initial "fully reduced" solution.
1191 void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1192                         ScalarEvolution &SE) {
1193   SmallPtrSet<const SCEV *, 8> Regs;
1194   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
1195        E = Uses.end(); I != E; ++I)
1196     RateFormula(I->Formulae.front(), Regs, L);
1197   NumRegs += Regs.size();
1198
1199   DEBUG(print_details(dbgs(), 0, Regs));
1200 }
1201
1202 /// Rate - Compute a score for the solution where the reuse associated with
1203 /// putting Reg in a register is selected.
1204 void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
1205                  const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1206                  ScalarEvolution &SE) {
1207   SmallPtrSet<const SCEV *, 8> Regs;
1208   for (size_t i = 0, e = Uses.size(); i != e; ++i) {
1209     const LSRUse &LU = Uses[i];
1210
1211     const Formula *BestFormula = 0;
1212     if (i >= Bits.size() || !Bits.test(i))
1213       // This use doesn't use the current register. Just go with the current
1214       // leading candidate formula.
1215       BestFormula = &LU.Formulae.front();
1216     else
1217       // Find the best formula for this use that uses the current register.
1218       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
1219            E = LU.Formulae.end(); I != E; ++I) {
1220         const Formula &F = *I;
1221         if (F.referencesReg(Reg) &&
1222             (!BestFormula || ComplexitySorter()(F, *BestFormula)))
1223           BestFormula = &F;
1224       }
1225
1226     // If we didn't find *any* forumlae, because earlier we eliminated some
1227     // in greedy fashion, skip the current register's reuse opportunity.
1228     if (!BestFormula) {
1229       DEBUG(dbgs() << "Reuse with reg " << *Reg
1230                    << " wouldn't help any users.\n");
1231       Loose();
1232       return;
1233     }
1234
1235     // For an in-loop post-inc user, don't allow multiple base registers,
1236     // because that would require an awkward in-loop add after the increment.
1237     if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
1238         BestFormula->BaseRegs.size() > 1) {
1239       DEBUG(dbgs() << "Reuse with reg " << *Reg
1240                    << " would require an in-loop post-inc add: ";
1241             BestFormula->dump());
1242       Loose();
1243       return;
1244     }
1245
1246     RateFormula(*BestFormula, Regs, L);
1247   }
1248   NumRegs += Regs.size();
1249
1250   DEBUG(print_details(dbgs(), Reg, Regs));
1251 }
1252
1253 /// operator< - Choose the better score.
1254 bool Score::operator<(const Score &Other) const {
1255   if (NumRegs != Other.NumRegs)
1256     return NumRegs < Other.NumRegs;
1257   if (NumPhis != Other.NumPhis)
1258     return NumPhis < Other.NumPhis;
1259   if (NumIVMuls != Other.NumIVMuls)
1260     return NumIVMuls < Other.NumIVMuls;
1261   if (NumBaseAdds != Other.NumBaseAdds)
1262     return NumBaseAdds < Other.NumBaseAdds;
1263   return NumImms < Other.NumImms;
1264 }
1265
1266 void Score::print_details(raw_ostream &OS,
1267                           const SCEV *Reg,
1268                           const SmallPtrSet<const SCEV *, 8> &Regs) const {
1269   if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
1270   else     OS << "The initial solution would require ";
1271   print(OS);
1272   OS << ". Regs:";
1273   for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
1274        E = Regs.end(); I != E; ++I)
1275     OS << ' ' << **I;
1276   OS << '\n';
1277 }
1278
1279 void Score::print(raw_ostream &OS) const {
1280   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
1281   if (NumPhis != 0)
1282     OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s");
1283   if (NumIVMuls != 0)
1284     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
1285   if (NumBaseAdds != 0)
1286     OS << ", plus " << NumBaseAdds << " base add"
1287        << (NumBaseAdds == 1 ? "" : "s");
1288   if (NumImms != 0)
1289     OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s");
1290 }
1291
1292 void Score::dump() const {
1293   print(errs()); errs() << '\n';
1294 }
1295
1296 /// isAddressUse - Returns true if the specified instruction is using the
1297 /// specified value as an address.
1298 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
1299   bool isAddress = isa<LoadInst>(Inst);
1300   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1301     if (SI->getOperand(1) == OperandVal)
1302       isAddress = true;
1303   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1304     // Addressing modes can also be folded into prefetches and a variety
1305     // of intrinsics.
1306     switch (II->getIntrinsicID()) {
1307       default: break;
1308       case Intrinsic::prefetch:
1309       case Intrinsic::x86_sse2_loadu_dq:
1310       case Intrinsic::x86_sse2_loadu_pd:
1311       case Intrinsic::x86_sse_loadu_ps:
1312       case Intrinsic::x86_sse_storeu_ps:
1313       case Intrinsic::x86_sse2_storeu_pd:
1314       case Intrinsic::x86_sse2_storeu_dq:
1315       case Intrinsic::x86_sse2_storel_dq:
1316         if (II->getOperand(1) == OperandVal)
1317           isAddress = true;
1318         break;
1319     }
1320   }
1321   return isAddress;
1322 }
1323
1324 /// getAccessType - Return the type of the memory being accessed.
1325 static const Type *getAccessType(const Instruction *Inst) {
1326   const Type *AccessTy = Inst->getType();
1327   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
1328     AccessTy = SI->getOperand(0)->getType();
1329   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1330     // Addressing modes can also be folded into prefetches and a variety
1331     // of intrinsics.
1332     switch (II->getIntrinsicID()) {
1333     default: break;
1334     case Intrinsic::x86_sse_storeu_ps:
1335     case Intrinsic::x86_sse2_storeu_pd:
1336     case Intrinsic::x86_sse2_storeu_dq:
1337     case Intrinsic::x86_sse2_storel_dq:
1338       AccessTy = II->getOperand(1)->getType();
1339       break;
1340     }
1341   }
1342   return AccessTy;
1343 }
1344
1345 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
1346 /// specified set are trivially dead, delete them and see if this makes any of
1347 /// their operands subsequently dead.
1348 static bool
1349 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
1350   bool Changed = false;
1351
1352   while (!DeadInsts.empty()) {
1353     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
1354
1355     if (I == 0 || !isInstructionTriviallyDead(I))
1356       continue;
1357
1358     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
1359       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
1360         *OI = 0;
1361         if (U->use_empty())
1362           DeadInsts.push_back(U);
1363       }
1364
1365     I->eraseFromParent();
1366     Changed = true;
1367   }
1368
1369   return Changed;
1370 }
1371
1372 namespace {
1373
1374 /// LSRInstance - This class holds state for the main loop strength
1375 /// reduction logic.
1376 class LSRInstance {
1377   IVUsers &IU;
1378   ScalarEvolution &SE;
1379   DominatorTree &DT;
1380   const TargetLowering *const TLI;
1381   Loop *const L;
1382   bool Changed;
1383
1384   /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
1385   /// arbitrary index value to each register as a sort tie breaker.
1386   unsigned CurrentArbitraryRegIndex;
1387
1388   /// MaxNumRegs - To help prune the search for solutions, identify the number
1389   /// of registers needed by the initial solution. No formula should require
1390   /// more than this.
1391   unsigned MaxNumRegs;
1392
1393   /// Factors - Interesting factors between use strides.
1394   SmallSetVector<int64_t, 4> Factors;
1395
1396   /// Types - Interesting use types, to facilitate truncation reuse.
1397   SmallSetVector<const Type *, 4> Types;
1398
1399   /// Uses - The list of interesting uses.
1400   SmallVector<LSRUse, 16> Uses;
1401
1402   // TODO: Reorganize these data structures.
1403   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
1404   RegUsesTy RegUses;
1405   SmallVector<const SCEV *, 16> RegSequence;
1406
1407   void OptimizeShadowIV();
1408   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
1409                          const SCEV* &CondStride);
1410   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1411   bool StrideMightBeShared(const SCEV* Stride);
1412   bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos);
1413
1414   LSRUse &getNewUse() {
1415     Uses.push_back(LSRUse());
1416     return Uses.back();
1417   }
1418
1419   void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
1420   void CountRegisters(const Formula &F, size_t LUIdx);
1421
1422   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1423
1424   void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1425                                    Formula Base);
1426   void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1427                                    Formula Base);
1428   void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
1429                                            unsigned LUIdx,
1430                                            const Formula &Base, unsigned i,
1431                                            const SmallVectorImpl<const SCEV *>
1432                                              &AddOps);
1433   void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
1434                                   Formula Base);
1435   void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
1436                                 Formula Base);
1437   void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
1438                            Formula Base);
1439   void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
1440                              Formula Base);
1441
1442   void GenerateConstantOffsetReuse();
1443
1444   void GenerateAllReuseFormulae();
1445
1446   void GenerateLoopInvariantRegisterUses();
1447
1448 public:
1449   LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1450
1451   bool getChanged() const { return Changed; }
1452
1453   void print(raw_ostream &OS) const;
1454   void dump() const;
1455 };
1456
1457 }
1458
1459 /// OptimizeShadowIV - If IV is used in a int-to-float cast
1460 /// inside the loop then try to eliminate the cast opeation.
1461 void LSRInstance::OptimizeShadowIV() {
1462   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1463   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1464     return;
1465
1466   for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
1467        StrideIdx != e; ++StrideIdx) {
1468     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1469       IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1470     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
1471     if (!isa<SCEVConstant>(SI->first))
1472       continue;
1473
1474     for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1475            E = SI->second->Users.end(); UI != E; /* empty */) {
1476       ilist<IVStrideUse>::iterator CandidateUI = UI;
1477       ++UI;
1478       Instruction *ShadowUse = CandidateUI->getUser();
1479       const Type *DestTy = NULL;
1480
1481       /* If shadow use is a int->float cast then insert a second IV
1482          to eliminate this cast.
1483
1484            for (unsigned i = 0; i < n; ++i)
1485              foo((double)i);
1486
1487          is transformed into
1488
1489            double d = 0.0;
1490            for (unsigned i = 0; i < n; ++i, ++d)
1491              foo(d);
1492       */
1493       if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
1494         DestTy = UCast->getDestTy();
1495       else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
1496         DestTy = SCast->getDestTy();
1497       if (!DestTy) continue;
1498
1499       if (TLI) {
1500         // If target does not support DestTy natively then do not apply
1501         // this transformation.
1502         EVT DVT = TLI->getValueType(DestTy);
1503         if (!TLI->isTypeLegal(DVT)) continue;
1504       }
1505
1506       PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1507       if (!PH) continue;
1508       if (PH->getNumIncomingValues() != 2) continue;
1509
1510       const Type *SrcTy = PH->getType();
1511       int Mantissa = DestTy->getFPMantissaWidth();
1512       if (Mantissa == -1) continue;
1513       if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1514         continue;
1515
1516       unsigned Entry, Latch;
1517       if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1518         Entry = 0;
1519         Latch = 1;
1520       } else {
1521         Entry = 1;
1522         Latch = 0;
1523       }
1524
1525       ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1526       if (!Init) continue;
1527       Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
1528
1529       BinaryOperator *Incr =
1530         dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1531       if (!Incr) continue;
1532       if (Incr->getOpcode() != Instruction::Add
1533           && Incr->getOpcode() != Instruction::Sub)
1534         continue;
1535
1536       /* Initialize new IV, double d = 0.0 in above example. */
1537       ConstantInt *C = NULL;
1538       if (Incr->getOperand(0) == PH)
1539         C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1540       else if (Incr->getOperand(1) == PH)
1541         C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1542       else
1543         continue;
1544
1545       if (!C) continue;
1546
1547       // Ignore negative constants, as the code below doesn't handle them
1548       // correctly. TODO: Remove this restriction.
1549       if (!C->getValue().isStrictlyPositive()) continue;
1550
1551       /* Add new PHINode. */
1552       PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
1553
1554       /* create new increment. '++d' in above example. */
1555       Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1556       BinaryOperator *NewIncr =
1557         BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1558                                  Instruction::FAdd : Instruction::FSub,
1559                                NewPH, CFP, "IV.S.next.", Incr);
1560
1561       NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1562       NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1563
1564       /* Remove cast operation */
1565       ShadowUse->replaceAllUsesWith(NewPH);
1566       ShadowUse->eraseFromParent();
1567       break;
1568     }
1569   }
1570 }
1571
1572 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1573 /// set the IV user and stride information and return true, otherwise return
1574 /// false.
1575 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
1576                                     IVStrideUse *&CondUse,
1577                                     const SCEV* &CondStride) {
1578   for (unsigned StrideIdx = 0, e = IU.StrideOrder.size();
1579        StrideIdx != e && !CondUse; ++StrideIdx) {
1580     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1581       IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1582     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
1583
1584     for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1585          E = SI->second->Users.end(); UI != E; ++UI)
1586       if (UI->getUser() == Cond) {
1587         // NOTE: we could handle setcc instructions with multiple uses here, but
1588         // InstCombine does it as well for simple uses, it's not clear that it
1589         // occurs enough in real life to handle.
1590         CondUse = UI;
1591         CondStride = SI->first;
1592         return true;
1593       }
1594   }
1595   return false;
1596 }
1597
1598 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1599 /// a max computation.
1600 ///
1601 /// This is a narrow solution to a specific, but acute, problem. For loops
1602 /// like this:
1603 ///
1604 ///   i = 0;
1605 ///   do {
1606 ///     p[i] = 0.0;
1607 ///   } while (++i < n);
1608 ///
1609 /// the trip count isn't just 'n', because 'n' might not be positive. And
1610 /// unfortunately this can come up even for loops where the user didn't use
1611 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1612 /// will commonly be lowered like this:
1613 //
1614 ///   if (n > 0) {
1615 ///     i = 0;
1616 ///     do {
1617 ///       p[i] = 0.0;
1618 ///     } while (++i < n);
1619 ///   }
1620 ///
1621 /// and then it's possible for subsequent optimization to obscure the if
1622 /// test in such a way that indvars can't find it.
1623 ///
1624 /// When indvars can't find the if test in loops like this, it creates a
1625 /// max expression, which allows it to give the loop a canonical
1626 /// induction variable:
1627 ///
1628 ///   i = 0;
1629 ///   max = n < 1 ? 1 : n;
1630 ///   do {
1631 ///     p[i] = 0.0;
1632 ///   } while (++i != max);
1633 ///
1634 /// Canonical induction variables are necessary because the loop passes
1635 /// are designed around them. The most obvious example of this is the
1636 /// LoopInfo analysis, which doesn't remember trip count values. It
1637 /// expects to be able to rediscover the trip count each time it is
1638 /// needed, and it does this using a simple analysis that only succeeds if
1639 /// the loop has a canonical induction variable.
1640 ///
1641 /// However, when it comes time to generate code, the maximum operation
1642 /// can be quite costly, especially if it's inside of an outer loop.
1643 ///
1644 /// This function solves this problem by detecting this type of loop and
1645 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1646 /// the instructions for the maximum computation.
1647 ///
1648 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1649   // Check that the loop matches the pattern we're looking for.
1650   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1651       Cond->getPredicate() != CmpInst::ICMP_NE)
1652     return Cond;
1653
1654   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1655   if (!Sel || !Sel->hasOneUse()) return Cond;
1656
1657   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1658   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1659     return Cond;
1660   const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());
1661
1662   // Add one to the backedge-taken count to get the trip count.
1663   const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
1664
1665   // Check for a max calculation that matches the pattern.
1666   if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
1667     return Cond;
1668   const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
1669   if (Max != SE.getSCEV(Sel)) return Cond;
1670
1671   // To handle a max with more than two operands, this optimization would
1672   // require additional checking and setup.
1673   if (Max->getNumOperands() != 2)
1674     return Cond;
1675
1676   const SCEV *MaxLHS = Max->getOperand(0);
1677   const SCEV *MaxRHS = Max->getOperand(1);
1678   if (!MaxLHS || MaxLHS != One) return Cond;
1679   // Check the relevant induction variable for conformance to
1680   // the pattern.
1681   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1682   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1683   if (!AR || !AR->isAffine() ||
1684       AR->getStart() != One ||
1685       AR->getStepRecurrence(SE) != One)
1686     return Cond;
1687
1688   assert(AR->getLoop() == L &&
1689          "Loop condition operand is an addrec in a different loop!");
1690
1691   // Check the right operand of the select, and remember it, as it will
1692   // be used in the new comparison instruction.
1693   Value *NewRHS = 0;
1694   if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1695     NewRHS = Sel->getOperand(1);
1696   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1697     NewRHS = Sel->getOperand(2);
1698   if (!NewRHS) return Cond;
1699
1700   // Determine the new comparison opcode. It may be signed or unsigned,
1701   // and the original comparison may be either equality or inequality.
1702   CmpInst::Predicate Pred =
1703     isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
1704   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1705     Pred = CmpInst::getInversePredicate(Pred);
1706
1707   // Ok, everything looks ok to change the condition into an SLT or SGE and
1708   // delete the max calculation.
1709   ICmpInst *NewCond =
1710     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1711
1712   // Delete the max calculation instructions.
1713   Cond->replaceAllUsesWith(NewCond);
1714   CondUse->setUser(NewCond);
1715   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1716   Cond->eraseFromParent();
1717   Sel->eraseFromParent();
1718   if (Cmp->use_empty())
1719     Cmp->eraseFromParent();
1720   return NewCond;
1721 }
1722
1723 bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
1724   int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
1725   for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
1726     std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1727       IU.IVUsesByStride.find(IU.StrideOrder[i]);
1728     const SCEV *Share = SI->first;
1729     if (!isa<SCEVConstant>(SI->first) || Share == Stride)
1730       continue;
1731     int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
1732     if (SSInt == SInt)
1733       return true; // This can definitely be reused.
1734     if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
1735       continue;
1736     int64_t Scale = SSInt / SInt;
1737
1738     // This AM will be used for conservative queries. At this point in the
1739     // process we don't know which users will have a base reg, immediate,
1740     // etc., so we conservatively assume that it may not, making more
1741     // strides valid, thus erring on the side of assuming that there
1742     // might be sharing.
1743     TargetLowering::AddrMode AM;
1744     AM.Scale = Scale;
1745
1746     // Any pre-inc iv use?
1747     IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
1748     for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1749            E = StrideUses.Users.end(); I != E; ++I) {
1750       bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
1751       if (!I->isUseOfPostIncrementedValue() &&
1752           isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
1753                      isAddress ? getAccessType(I->getUser()) : 0,
1754                      TLI))
1755         return true;
1756     }
1757   }
1758   return false;
1759 }
1760
1761 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1762 /// postinc iv when possible.
1763 bool
1764 LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) {
1765   SmallPtrSet<Instruction *, 4> PostIncs;
1766
1767   BasicBlock *LatchBlock = L->getLoopLatch();
1768   SmallVector<BasicBlock*, 8> ExitingBlocks;
1769   L->getExitingBlocks(ExitingBlocks);
1770
1771   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1772     BasicBlock *ExitingBlock = ExitingBlocks[i];
1773
1774     // Get the terminating condition for the loop if possible.  If we
1775     // can, we want to change it to use a post-incremented version of its
1776     // induction variable, to allow coalescing the live ranges for the IV into
1777     // one register value.
1778
1779     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1780     if (!TermBr)
1781       continue;
1782     // FIXME: Overly conservative, termination condition could be an 'or' etc..
1783     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1784       continue;
1785
1786     // Search IVUsesByStride to find Cond's IVUse if there is one.
1787     IVStrideUse *CondUse = 0;
1788     const SCEV *CondStride = 0;
1789     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1790     if (!FindIVUserForCond(Cond, CondUse, CondStride))
1791       continue;
1792
1793     // If the trip count is computed in terms of a max (due to ScalarEvolution
1794     // being unable to find a sufficient guard, for example), change the loop
1795     // comparison to use SLT or ULT instead of NE.
1796     // One consequence of doing this now is that it disrupts the count-down
1797     // optimization. That's not always a bad thing though, because in such
1798     // cases it may still be worthwhile to avoid a max.
1799     Cond = OptimizeMax(Cond, CondUse);
1800
1801     // If this exiting block is the latch block, and the condition has only
1802     // one use inside the loop (the branch), use the post-incremented value
1803     // of the induction variable
1804     if (ExitingBlock != LatchBlock) {
1805       // If this exiting block dominates the latch block, it may also use
1806       // the post-inc value if it won't be shared with other uses.
1807       // Check for dominance.
1808       if (!DT.dominates(ExitingBlock, LatchBlock))
1809         continue;
1810       // Check for sharing within the same stride.
1811       bool SameStrideSharing = false;
1812       IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
1813       for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1814              E = StrideUses.Users.end(); I != E; ++I) {
1815         if (I->getUser() == Cond)
1816           continue;
1817         if (!I->isUseOfPostIncrementedValue()) {
1818           SameStrideSharing = true;
1819           break;
1820         }
1821       }
1822       if (SameStrideSharing)
1823         continue;
1824       // Check for sharing from a different stride.
1825       if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
1826         continue;
1827     }
1828     if (!Cond->hasOneUse()) {
1829       bool HasOneUseInLoop = true;
1830       for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
1831            UI != UE; ++UI) {
1832         Instruction *U = cast<Instruction>(*UI);
1833         if (U == TermBr)
1834           continue;
1835         if (L->contains(U)) {
1836           HasOneUseInLoop = false;
1837           break;
1838         }
1839       }
1840       if (!HasOneUseInLoop)
1841         continue;
1842     }
1843
1844     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1845                  << *Cond << '\n');
1846
1847     // It's possible for the setcc instruction to be anywhere in the loop, and
1848     // possible for it to have multiple users.  If it is not immediately before
1849     // the exiting block branch, move it.
1850     if (&*++BasicBlock::iterator(Cond) != TermBr) {
1851       if (Cond->hasOneUse()) {   // Condition has a single use, just move it.
1852         Cond->moveBefore(TermBr);
1853       } else {
1854         // Otherwise, clone the terminating condition and insert into the
1855         // loopend.
1856         ICmpInst *OldCond = Cond;
1857         Cond = cast<ICmpInst>(Cond->clone());
1858         Cond->setName(L->getHeader()->getName() + ".termcond");
1859         ExitingBlock->getInstList().insert(TermBr, Cond);
1860
1861         // Clone the IVUse, as the old use still exists!
1862         IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
1863                                              CondUse->getOperandValToReplace());
1864         CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
1865         TermBr->replaceUsesOfWith(OldCond, Cond);
1866       }
1867     }
1868
1869     // If we get to here, we know that we can transform the setcc instruction to
1870     // use the post-incremented version of the IV, allowing us to coalesce the
1871     // live ranges for the IV correctly.
1872     CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
1873     CondUse->setIsUseOfPostIncrementedValue(true);
1874     Changed = true;
1875
1876     PostIncs.insert(Cond);
1877   }
1878
1879   // Determine an insertion point for the loop induction variable increment. It
1880   // must dominate all the post-inc comparisons we just set up, and it must
1881   // dominate the loop latch edge.
1882   IVIncInsertPos = L->getLoopLatch()->getTerminator();
1883   for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
1884        E = PostIncs.end(); I != E; ++I) {
1885     BasicBlock *BB =
1886       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1887                                     (*I)->getParent());
1888     if (BB == (*I)->getParent())
1889       IVIncInsertPos = *I;
1890     else if (BB != IVIncInsertPos->getParent())
1891       IVIncInsertPos = BB->getTerminator();
1892   }
1893
1894   return Changed;
1895 }
1896
1897 /// CountRegisters - Note the given register.
1898 void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
1899                                 size_t LUIdx) {
1900   std::pair<RegUsesTy::iterator, bool> Pair =
1901     RegUses.insert(std::make_pair(Reg, RegSortData()));
1902   RegSortData &BV = Pair.first->second;
1903   if (Pair.second) {
1904     BV.Index = CurrentArbitraryRegIndex++;
1905     BV.MaxComplexity = Complexity;
1906     RegSequence.push_back(Reg);
1907   } else {
1908     BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
1909   }
1910   BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
1911   BV.Bits.set(LUIdx);
1912 }
1913
1914 /// CountRegisters - Note which registers are used by the given formula,
1915 /// updating RegUses.
1916 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
1917   uint32_t Complexity = F.getComplexity();
1918   if (F.ScaledReg)
1919     CountRegister(F.ScaledReg, Complexity, LUIdx);
1920   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1921        E = F.BaseRegs.end(); I != E; ++I)
1922     CountRegister(*I, Complexity, LUIdx);
1923 }
1924
1925 /// InsertFormula - If the given formula has not yet been inserted, add it
1926 /// to the list, and return true. Return false otherwise.
1927 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
1928   // If a formula by itself would require more registers than the base solution,
1929   // discard it and stop searching from it, as it won't be profitable. This is
1930   // actually more permissive than it could be, because it doesn't include
1931   // registers used by non-constant strides in F.
1932   if (F.getNumRegs() > MaxNumRegs)
1933     return false;
1934
1935   if (!LU.InsertFormula(F))
1936     return false;
1937
1938   CountRegisters(LU.Formulae.back(), LUIdx);
1939   return true;
1940 }
1941
1942 /// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
1943 /// offsets.
1944 void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1945                                               Formula Base) {
1946   // We can't add a symbolic offset if the address already contains one.
1947   if (Base.AM.BaseGV) return;
1948
1949   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
1950     const SCEV *G = Base.BaseRegs[i];
1951     GlobalValue *GV = ExtractSymbol(G, SE);
1952     if (G->isZero())
1953       continue;
1954     Formula F = Base;
1955     F.AM.BaseGV = GV;
1956     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
1957       continue;
1958     F.BaseRegs[i] = G;
1959     (void)InsertFormula(LU, LUIdx, F);
1960   }
1961 }
1962
1963 /// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
1964 /// the comparison. For example, x == y -> x*c == y*c.
1965 void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1966                                               Formula Base) {
1967   if (LU.Kind != LSRUse::ICmpZero) return;
1968
1969   // Determine the integer type for the base formula.
1970   const Type *IntTy = Base.getType();
1971   if (!IntTy) return;
1972   if (SE.getTypeSizeInBits(IntTy) > 64) return;
1973   IntTy = SE.getEffectiveSCEVType(IntTy);
1974
1975   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
1976
1977   // Check each interesting stride.
1978   for (SmallSetVector<int64_t, 4>::const_iterator
1979        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
1980     int64_t Factor = *I;
1981     Formula F = Base;
1982
1983     // Check that the multiplication doesn't overflow.
1984     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
1985     if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)
1986       continue;
1987
1988     // Check that this scale is legal.
1989     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
1990       continue;
1991
1992     const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
1993
1994     // Check that multiplying with each base register doesn't overflow.
1995     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
1996       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
1997       if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
1998         goto next;
1999     }
2000
2001     // Check that multiplying with the scaled register doesn't overflow.
2002     if (F.ScaledReg) {
2003       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
2004       if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2005         continue;
2006     }
2007
2008     // If we make it here and it's legal, add it.
2009     (void)InsertFormula(LU, LUIdx, F);
2010   next:;
2011   }
2012 }
2013
2014 /// GenerateFormulaeFromReplacedBaseReg - If removing base register with
2015 /// index i from the BaseRegs list and adding the registers in AddOps
2016 /// to the address forms an interesting formula, pursue it.
2017 void
2018 LSRInstance::GenerateFormulaeFromReplacedBaseReg(
2019                                              LSRUse &LU,
2020                                              unsigned LUIdx,
2021                                              const Formula &Base, unsigned i,
2022                                              const SmallVectorImpl<const SCEV *>
2023                                                &AddOps) {
2024   if (AddOps.empty()) return;
2025
2026   Formula F = Base;
2027   std::swap(F.BaseRegs[i], F.BaseRegs.back());
2028   F.BaseRegs.pop_back();
2029   for (SmallVectorImpl<const SCEV *>::const_iterator I = AddOps.begin(),
2030        E = AddOps.end(); I != E; ++I)
2031     F.BaseRegs.push_back(*I);
2032   F.AM.HasBaseReg = !F.BaseRegs.empty();
2033   if (InsertFormula(LU, LUIdx, F))
2034     // If that formula hadn't been seen before, recurse to find more like it.
2035     GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back());
2036 }
2037
2038 /// GenerateReassociationReuse - Split out subexpressions from adds and
2039 /// the bases of addrecs.
2040 void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
2041                                              Formula Base) {
2042   SmallVector<const SCEV *, 8> AddOps;
2043   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2044     const SCEV *BaseReg = Base.BaseRegs[i];
2045     if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BaseReg)) {
2046       for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2047            J != JE; ++J) {
2048         // Don't pull a constant into a register if the constant could be
2049         // folded into an immediate field.
2050         if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue;
2051         SmallVector<const SCEV *, 8> InnerAddOps;
2052         for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2053           if (K != J)
2054             InnerAddOps.push_back(*K);
2055         // Splitting a 2-operand add both ways is redundant. Pruning this
2056         // now saves compile time.
2057         if (InnerAddOps.size() < 2 && next(J) == JE)
2058           continue;
2059         AddOps.push_back(*J);
2060         const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2061         AddOps.push_back(InnerAdd);
2062         GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2063         AddOps.clear();
2064       }
2065     } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BaseReg)) {
2066       if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(AR->getStart())) {
2067         for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2068              J != JE; ++J) {
2069           // Don't pull a constant into a register if the constant could be
2070           // folded into an immediate field.
2071           if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE))
2072             continue;
2073           SmallVector<const SCEV *, 8> InnerAddOps;
2074           for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2075             if (K != J)
2076               InnerAddOps.push_back(*K);
2077           AddOps.push_back(*J);
2078           const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2079           AddOps.push_back(SE.getAddRecExpr(InnerAdd,
2080                                             AR->getStepRecurrence(SE),
2081                                             AR->getLoop()));
2082           GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2083           AddOps.clear();
2084         }
2085       } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1,
2086                                    LU.Kind, LU.AccessTy,
2087                                    TLI, SE)) {
2088         AddOps.push_back(AR->getStart());
2089         AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0,
2090                                                             BaseReg->getType()),
2091                                           AR->getStepRecurrence(SE),
2092                                           AR->getLoop()));
2093         GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2094         AddOps.clear();
2095       }
2096     }
2097   }
2098 }
2099
2100 /// GenerateCombinationReuse - Generate a formula consisting of all of the
2101 /// loop-dominating registers added into a single register.
2102 void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
2103                                            Formula Base) {
2104   if (Base.BaseRegs.size() <= 1) return;
2105
2106   Formula F = Base;
2107   F.BaseRegs.clear();
2108   SmallVector<const SCEV *, 4> Ops;
2109   for (SmallVectorImpl<const SCEV *>::const_iterator
2110        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2111     const SCEV *BaseReg = *I;
2112     if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
2113         !BaseReg->hasComputableLoopEvolution(L))
2114       Ops.push_back(BaseReg);
2115     else
2116       F.BaseRegs.push_back(BaseReg);
2117   }
2118   if (Ops.size() > 1) {
2119     F.BaseRegs.push_back(SE.getAddExpr(Ops));
2120     (void)InsertFormula(LU, LUIdx, F);
2121   }
2122 }
2123
2124 /// GenerateScaledReuse - Generate stride factor reuse formulae by making
2125 /// use of scaled-offset address modes, for example.
2126 void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
2127                                       Formula Base) {
2128   // Determine the integer type for the base formula.
2129   const Type *IntTy = Base.getType();
2130   if (!IntTy) return;
2131   IntTy = SE.getEffectiveSCEVType(IntTy);
2132
2133   // Check each interesting stride.
2134   for (SmallSetVector<int64_t, 4>::const_iterator
2135        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2136     int64_t Factor = *I;
2137
2138     // If this Formula already has a scaled register, we can't add another one.
2139     if (Base.AM.Scale != 0)
2140       continue;
2141     Formula F = Base;
2142     F.AM.Scale = Factor;
2143     // Check whether this scale is going to be legal.
2144     if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) {
2145       // As a special-case, handle special out-of-loop Basic users specially.
2146       // TODO: Reconsider this special case.
2147       if (LU.Kind == LSRUse::Basic &&
2148           isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) &&
2149           !L->contains(LU.UserInst))
2150         LU.Kind = LSRUse::Special;
2151       else
2152         continue;
2153     }
2154     // For each addrec base reg, apply the scale, if possible.
2155     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2156       if (const SCEVAddRecExpr *AR =
2157             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2158         const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
2159         // Divide out the factor, ignoring high bits, since we'll be
2160         // scaling the value back up in the end.
2161         if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
2162           // TODO: This could be optimized to avoid all the copying.
2163           Formula NewF = F;
2164           NewF.ScaledReg = Quotient;
2165           std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back());
2166           NewF.BaseRegs.pop_back();
2167           NewF.AM.HasBaseReg = !NewF.BaseRegs.empty();
2168           (void)InsertFormula(LU, LUIdx, NewF);
2169         }
2170       }
2171   }
2172 }
2173
2174 /// GenerateTruncateReuse - Generate reuse formulae from different IV types.
2175 void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
2176                                         Formula Base) {
2177   // This requires TargetLowering to tell us which truncates are free.
2178   if (!TLI) return;
2179
2180   // Don't attempt to truncate symbolic values.
2181   if (Base.AM.BaseGV) return;
2182
2183   // Determine the integer type for the base formula.
2184   const Type *DstTy = Base.getType();
2185   if (!DstTy) return;
2186   DstTy = SE.getEffectiveSCEVType(DstTy);
2187
2188   for (SmallSetVector<const Type *, 4>::const_iterator
2189        I = Types.begin(), E = Types.end(); I != E; ++I) {
2190     const Type *SrcTy = *I;
2191     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2192       Formula F = Base;
2193       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2194       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2195            JE = F.BaseRegs.end(); J != JE; ++J)
2196         *J = SE.getAnyExtendExpr(*J, SrcTy);
2197       (void)InsertFormula(LU, LUIdx, F);
2198     }
2199   }
2200 }
2201
2202 namespace {
2203
2204 /// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
2205 /// defer modifications so that the search phase doesn't have to worry about
2206 /// the data structures moving underneath it.
2207 struct WorkItem {
2208   LSRUse *LU;
2209   size_t LUIdx;
2210   int64_t Imm;
2211   const SCEV *OrigReg;
2212
2213   WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R)
2214     : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {}
2215
2216   void print(raw_ostream &OS) const;
2217   void dump() const;
2218 };
2219
2220 void WorkItem::print(raw_ostream &OS) const {
2221   OS << "in use ";
2222   LU->print(OS);
2223   OS << " (at index " << LUIdx << "), add offset " << Imm
2224      << " and compensate by adjusting refences to " << *OrigReg << "\n";
2225 }
2226
2227 void WorkItem::dump() const {
2228   print(errs()); errs() << '\n';
2229 }
2230
2231 }
2232
2233 /// GenerateConstantOffsetReuse - Look for registers which are a constant
2234 /// distance apart and try to form reuse opportunities between them.
2235 void LSRInstance::GenerateConstantOffsetReuse() {
2236   // Group the registers by their value without any added constant offset.
2237   typedef std::map<int64_t, const SCEV *> ImmMapTy;
2238   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2239   RegMapTy Map;
2240   SmallVector<const SCEV *, 8> Sequence;
2241   for (SmallVectorImpl<const SCEV *>::iterator I = RegSequence.begin(),
2242        E = RegSequence.end(); I != E; ++I) {
2243     const SCEV *Reg = *I;
2244     int64_t Imm = ExtractImmediate(Reg, SE);
2245     std::pair<RegMapTy::iterator, bool> Pair =
2246       Map.insert(std::make_pair(Reg, ImmMapTy()));
2247     if (Pair.second)
2248       Sequence.push_back(Reg);
2249     Pair.first->second.insert(std::make_pair(Imm, *I));
2250   }
2251
2252   // Insert an artificial expression at offset 0 (if there isn't one already),
2253   // as this may lead to more reuse opportunities.
2254   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2255        E = Sequence.end(); I != E; ++I)
2256     Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0));
2257
2258   // Now examine each set of registers with the same base value. Build up
2259   // a list of work to do and do the work in a separate step so that we're
2260   // not adding formulae and register counts while we're searching.
2261   SmallVector<WorkItem, 32> WorkItems;
2262   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2263        E = Sequence.end(); I != E; ++I) {
2264     const SCEV *Reg = *I;
2265     const ImmMapTy &Imms = Map.find(Reg)->second;
2266     // Examine each offset.
2267     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2268          J != JE; ++J) {
2269       const SCEV *OrigReg = J->second;
2270       // Skip the artifical register at offset 0.
2271       if (!OrigReg) continue;
2272
2273       int64_t JImm = J->first;
2274       const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits;
2275
2276       // Examine each other offset associated with the same register. This is
2277       // quadradic in the number of registers with the same base, but it's
2278       // uncommon for this to be a large number.
2279       for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) {
2280         if (M == J) continue;
2281
2282         // Compute the difference between the two.
2283         int64_t Imm = (uint64_t)JImm - M->first;
2284         for (int LUIdx = Bits.find_first(); LUIdx != -1;
2285              LUIdx = Bits.find_next(LUIdx))
2286           // Make a memo of this use, offset, and register tuple.
2287           WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg));
2288       }
2289     }
2290   }
2291
2292   // Now iterate through the worklist and add new formulae.
2293   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2294        E = WorkItems.end(); I != E; ++I) {
2295     const WorkItem &WI = *I;
2296     LSRUse &LU = *WI.LU;
2297     size_t LUIdx = WI.LUIdx;
2298     int64_t Imm = WI.Imm;
2299     const SCEV *OrigReg = WI.OrigReg;
2300
2301     const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2302     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy,
2303                                                       -(uint64_t)Imm));
2304
2305     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2306       Formula F = LU.Formulae[L];
2307       // Use the immediate in the scaled register.
2308       if (F.ScaledReg == OrigReg) {
2309         int64_t Offs = (uint64_t)F.AM.BaseOffs +
2310                        Imm * (uint64_t)F.AM.Scale;
2311         // Don't create 50 + reg(-50).
2312         if (F.referencesReg(SE.getSCEV(
2313                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
2314           continue;
2315         Formula NewF = F;
2316         NewF.AM.BaseOffs = Offs;
2317         if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2318           continue;
2319         const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2320         if (Diff->isZero()) continue;
2321         NewF.ScaledReg = Diff;
2322         (void)InsertFormula(LU, LUIdx, NewF);
2323       }
2324       // Use the immediate in a base register.
2325       for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2326         const SCEV *BaseReg = F.BaseRegs[N];
2327         if (BaseReg != OrigReg)
2328           continue;
2329         Formula NewF = F;
2330         NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2331         if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2332           continue;
2333         const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg);
2334         if (Diff->isZero()) continue;
2335         // Don't create 50 + reg(-50).
2336         if (Diff ==
2337             SE.getSCEV(ConstantInt::get(IntTy,
2338                                         -(uint64_t)NewF.AM.BaseOffs)))
2339           continue;
2340         NewF.BaseRegs[N] = Diff;
2341         (void)InsertFormula(LU, LUIdx, NewF);
2342       }
2343     }
2344   }
2345 }
2346
2347 /// GenerateAllReuseFormulae - Generate formulae for each use.
2348 void
2349 LSRInstance::GenerateAllReuseFormulae() {
2350   SmallVector<Formula, 12> Save;
2351   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2352     LSRUse &LU = Uses[LUIdx];
2353
2354     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2355       GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]);
2356     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2357       GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]);
2358     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2359       GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]);
2360     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2361       GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]);
2362     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2363       GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]);
2364     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2365       GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]);
2366   }
2367
2368   GenerateConstantOffsetReuse();
2369 }
2370
2371 /// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant
2372 /// values which we're tracking. These other uses will pin these values in
2373 /// registers, making them less profitable for elimination.
2374 /// TODO: This currently misses non-constant addrec step registers.
2375 /// TODO: Should this give more weight to users inside the loop?
2376 void
2377 LSRInstance::GenerateLoopInvariantRegisterUses() {
2378   SmallVector<const SCEV *, 8> Worklist(RegSequence.begin(), RegSequence.end());
2379
2380   while (!Worklist.empty()) {
2381     const SCEV *S = Worklist.pop_back_val();
2382
2383     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
2384       Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
2385     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
2386       Worklist.push_back(C->getOperand());
2387     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
2388       Worklist.push_back(D->getLHS());
2389       Worklist.push_back(D->getRHS());
2390     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
2391       const Value *V = U->getValue();
2392       if (const Instruction *Inst = dyn_cast<Instruction>(V))
2393         if (L->contains(Inst)) continue;
2394       for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
2395            UI != UE; ++UI) {
2396         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2397         // Ignore non-instructions.
2398         if (!UserInst)
2399           continue;
2400         // Ignore instructions in other functions (as can happen with
2401         // Constants).
2402         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2403           continue;
2404         // Ignore instructions not dominated by the loop.
2405         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2406           UserInst->getParent() :
2407           cast<PHINode>(UserInst)->getIncomingBlock(
2408             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2409         if (!DT.dominates(L->getHeader(), UseBB))
2410           continue;
2411         // Ignore uses which are part of other SCEV expressions, to avoid
2412         // analyzing them multiple times.
2413         if (SE.isSCEVable(UserInst->getType()) &&
2414             !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
2415           continue;
2416         // Ignore icmp instructions which are already being analyzed.
2417         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2418           unsigned OtherIdx = !UI.getOperandNo();
2419           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2420           if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
2421             continue;
2422         }
2423
2424         LSRUse &LU = getNewUse();
2425         LU.UserInst = const_cast<Instruction *>(UserInst);
2426         LU.OperandValToReplace = UI.getUse();
2427         LU.InsertSupplementalFormula(U);
2428         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2429       }
2430     }
2431   }
2432 }
2433
2434 #ifndef NDEBUG
2435
2436 static void debug_winner(SmallVector<LSRUse, 16> const &Uses) {
2437   dbgs() << "LSR has selected formulae for each use:\n";
2438   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2439        E = Uses.end(); I != E; ++I) {
2440     const LSRUse &LU = *I;
2441     dbgs() << "  ";
2442     LU.print(dbgs());
2443     dbgs() << '\n';
2444     dbgs() << "    ";
2445     LU.Formulae.front().print(dbgs());
2446     dbgs() << "\n";
2447   }
2448 }
2449
2450 #endif
2451
2452 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
2453   : IU(P->getAnalysis<IVUsers>()),
2454     SE(P->getAnalysis<ScalarEvolution>()),
2455     DT(P->getAnalysis<DominatorTree>()),
2456     TLI(tli), L(l), Changed(false), CurrentArbitraryRegIndex(0), MaxNumRegs(0) {
2457
2458   // If LoopSimplify form is not available, stay out of trouble.
2459   if (!L->isLoopSimplifyForm()) return;
2460
2461   // If there's no interesting work to be done, bail early.
2462   if (IU.IVUsesByStride.empty()) return;
2463
2464   DEBUG(dbgs() << "\nLSR on loop ";
2465         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
2466         dbgs() << ":\n");
2467
2468   // Sort the StrideOrder so we process larger strides first.
2469   std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(),
2470                    StrideCompare(SE));
2471
2472   /// OptimizeShadowIV - If IV is used in a int-to-float cast
2473   /// inside the loop then try to eliminate the cast opeation.
2474   OptimizeShadowIV();
2475
2476   // Change loop terminating condition to use the postinc iv when possible.
2477   Instruction *IVIncInsertPos;
2478   Changed |= OptimizeLoopTermCond(IVIncInsertPos);
2479
2480   for (SmallVectorImpl<const SCEV *>::const_iterator SIter =
2481        IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end();
2482        SIter != SEnd; ++SIter) {
2483     const SCEV *Stride = *SIter;
2484
2485     // Collect interesting types.
2486     Types.insert(SE.getEffectiveSCEVType(Stride->getType()));
2487
2488     // Collect interesting factors.
2489     for (SmallVectorImpl<const SCEV *>::const_iterator NewStrideIter =
2490          SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) {
2491       const SCEV *OldStride = Stride;
2492       const SCEV *NewStride = *NewStrideIter;
2493
2494       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2495           SE.getTypeSizeInBits(NewStride->getType())) {
2496         if (SE.getTypeSizeInBits(OldStride->getType()) >
2497             SE.getTypeSizeInBits(NewStride->getType()))
2498           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2499         else
2500           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2501       }
2502       if (const SCEVConstant *Factor =
2503             dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
2504                                                    SE, true)))
2505         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2506           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2507     }
2508
2509     std::map<const SCEV *, IVUsersOfOneStride *>::const_iterator SI =
2510       IU.IVUsesByStride.find(Stride);
2511     assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
2512     for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
2513          E = SI->second->Users.end(); UI != E; ++UI) {
2514       // Record the uses.
2515       LSRUse &LU = getNewUse();
2516       LU.UserInst = UI->getUser();
2517       LU.OperandValToReplace = UI->getOperandValToReplace();
2518       if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) {
2519         LU.Kind = LSRUse::Address;
2520         LU.AccessTy = getAccessType(LU.UserInst);
2521       }
2522       if (UI->isUseOfPostIncrementedValue())
2523         LU.PostIncLoop = L;
2524
2525       const SCEV *S = IU.getCanonicalExpr(*UI);
2526
2527       // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2528       // (N - i == 0), and this allows (N - i) to be the expression that we
2529       // work with rather than just N or i, so we can consider the register
2530       // requirements for both N and i at the same time. Limiting this code
2531       // to equality icmps is not a problem because all interesting loops
2532       // use equality icmps, thanks to IndVarSimplify.
2533       if (ICmpInst *CI = dyn_cast<ICmpInst>(LU.UserInst))
2534         if (CI->isEquality()) {
2535           // Swap the operands if needed to put the OperandValToReplace on
2536           // the left, for consistency.
2537           Value *NV = CI->getOperand(1);
2538           if (NV == LU.OperandValToReplace) {
2539             CI->setOperand(1, CI->getOperand(0));
2540             CI->setOperand(0, NV);
2541           }
2542
2543           // x == y  -->  x - y == 0
2544           const SCEV *N = SE.getSCEV(NV);
2545           if (N->isLoopInvariant(L)) {
2546             LU.Kind = LSRUse::ICmpZero;
2547             S = SE.getMinusSCEV(N, S);
2548           }
2549
2550           // -1 and the negations of all interesting strides (except the
2551           // negation of -1) are now also interesting.
2552           for (size_t i = 0, e = Factors.size(); i != e; ++i)
2553             if (Factors[i] != -1)
2554               Factors.insert(-(uint64_t)Factors[i]);
2555           Factors.insert(-1);
2556         }
2557
2558       // Ok, now enumerate all the different formulae we can find to compute
2559       // the value for this expression.
2560       LU.InsertInitialFormula(S, L, SE, DT);
2561       CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2562     }
2563   }
2564
2565   // If all uses use the same type, don't bother looking for truncation-based
2566   // reuse.
2567   if (Types.size() == 1)
2568     Types.clear();
2569
2570   // If there are any uses of registers that we're tracking that have escaped
2571   // IVUsers' attention, add trivial uses for them, so that the register
2572   // voting process takes the into consideration.
2573   GenerateLoopInvariantRegisterUses();
2574
2575   // Start by assuming we'll assign each use its own register. This is
2576   // sometimes called "full" strength reduction, or "superhero" mode.
2577   // Sometimes this is the best solution, but if there are opportunities for
2578   // reuse we may find a better solution.
2579   Score CurScore;
2580   CurScore.RateInitial(Uses, L, SE);
2581
2582   MaxNumRegs = CurScore.getNumRegs();
2583
2584   // Now use the reuse data to generate a bunch of interesting ways
2585   // to formulate the values needed for the uses.
2586   GenerateAllReuseFormulae();
2587
2588   // Sort the formulae. TODO: This is redundantly sorted below.
2589   for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(), E = Uses.end();
2590        I != E; ++I) {
2591     LSRUse &LU = *I;
2592     std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2593                      ComplexitySorter());
2594   }
2595
2596   // Ok, we've now collected all the uses and noted their register uses. The
2597   // next step is to start looking at register reuse possibilities.
2598   DEBUG(print(dbgs()); dbgs() << '\n'; IU.dump());
2599
2600   // Create a sorted list of registers with those with the most uses appearing
2601   // earlier in the list. We'll visit them first, as they're the most likely
2602   // to represent profitable reuse opportunities.
2603   SmallVector<RegCount, 8> RegOrder;
2604   for (SmallVectorImpl<const SCEV *>::const_iterator I =
2605        RegSequence.begin(), E = RegSequence.end(); I != E; ++I)
2606     RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second));
2607   std::stable_sort(RegOrder.begin(), RegOrder.end());
2608
2609   // Visit each register. Determine which ones represent profitable reuse
2610   // opportunities and remember them.
2611   // TODO: Extract this code into a function.
2612   for (SmallVectorImpl<RegCount>::const_iterator I = RegOrder.begin(),
2613        E = RegOrder.end(); I != E; ++I) {
2614     const SCEV *Reg = I->Reg;
2615     const SmallBitVector &Bits = I->Sort.Bits;
2616
2617     // Registers with only one use don't represent reuse opportunities, so
2618     // when we get there, we're done.
2619     if (Bits.count() <= 1) break;
2620
2621     DEBUG(dbgs() << "Reg " << *Reg << ": ";
2622           I->Sort.print(dbgs());
2623           dbgs() << '\n');
2624
2625     // Determine the total number of registers will be needed if we make use
2626     // of the reuse opportunity represented by the current register.
2627     Score NewScore;
2628     NewScore.Rate(Reg, Bits, Uses, L, SE);
2629
2630     // Now decide whether this register's reuse opportunity is an overall win.
2631     // Currently the decision is heavily skewed for register pressure.
2632     if (!(NewScore < CurScore)) {
2633       continue;
2634     }
2635
2636     // Ok, use this opportunity.
2637     DEBUG(dbgs() << "This candidate has been accepted.\n");
2638     CurScore = NewScore;
2639
2640     // Now that we've selected a new reuse opportunity, delete formulae that
2641     // do not participate in that opportunity.
2642     for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) {
2643       LSRUse &LU = Uses[j];
2644       for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) {
2645         Formula &F = LU.Formulae[k];
2646         if (!F.referencesReg(Reg)) {
2647           std::swap(LU.Formulae[k], LU.Formulae.back());
2648           LU.Formulae.pop_back();
2649           --k; --h;
2650         }
2651       }
2652       // Also re-sort the list to put the formulae with the fewest registers
2653       // at the front.
2654       // TODO: Do this earlier, we don't need it each time.
2655       std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2656                        ComplexitySorter());
2657     }
2658   }
2659
2660   // Ok, we've now made all our decisions. The first formula for each use
2661   // will be used.
2662   DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs());
2663         dbgs() << ".\n";
2664         debug_winner(Uses));
2665
2666   // Free memory no longer needed.
2667   RegOrder.clear();
2668   Factors.clear();
2669   Types.clear();
2670   RegUses.clear();
2671   RegSequence.clear();
2672
2673   // Keep track of instructions we may have made dead, so that
2674   // we can remove them after we are done working.
2675   SmallVector<WeakVH, 16> DeadInsts;
2676
2677   SCEVExpander Rewriter(SE);
2678   Rewriter.disableCanonicalMode();
2679   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
2680
2681   // Expand the new value definitions and update the users.
2682   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2683        E = Uses.end(); I != E; ++I) {
2684     // Formulae should be legal.
2685     DEBUG(for (SmallVectorImpl<Formula>::const_iterator J = I->Formulae.begin(),
2686                JE = I->Formulae.end(); J != JE; ++J)
2687             assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) &&
2688                    "Illegal formula generated!"));
2689
2690     // Expand the new code and update the user.
2691     I->Rewrite(L, Rewriter, DeadInsts, SE, DT, P);
2692     Changed = true;
2693   }
2694
2695   // Clean up after ourselves. This must be done before deleting any
2696   // instructions.
2697   Rewriter.clear();
2698
2699   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
2700 }
2701
2702 void LSRInstance::print(raw_ostream &OS) const {
2703   if (MaxNumRegs != 0)
2704     OS << "LSR is considering " << MaxNumRegs << " to be the maximum "
2705           "number of registers needed.\n";
2706
2707   OS << "LSR has identified the following interesting factors and types: ";
2708   bool First = true;
2709
2710   for (SmallSetVector<int64_t, 4>::const_iterator
2711        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2712     if (!First) OS << ", ";
2713     First = false;
2714     OS << '*' << *I;
2715   }
2716
2717   for (SmallSetVector<const Type *, 4>::const_iterator
2718        I = Types.begin(), E = Types.end(); I != E; ++I) {
2719     if (!First) OS << ", ";
2720     First = false;
2721     OS << '(' << **I << ')';
2722   }
2723   OS << '\n';
2724
2725   OS << "LSR is examining the following uses, and candidate formulae:\n";
2726   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2727        E = Uses.end(); I != E; ++I) {
2728     const LSRUse &LU = *I;
2729     dbgs() << "  ";
2730     LU.print(OS);
2731     OS << '\n';
2732     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
2733          JE = LU.Formulae.end(); J != JE; ++J) {
2734       OS << "    ";
2735       J->print(OS);
2736       OS << "\n";
2737     }
2738   }
2739 }
2740
2741 void LSRInstance::dump() const {
2742   print(errs()); errs() << '\n';
2743 }
2744
2745 namespace {
2746
2747 class LoopStrengthReduce : public LoopPass {
2748   /// TLI - Keep a pointer of a TargetLowering to consult for determining
2749   /// transformation profitability.
2750   const TargetLowering *const TLI;
2751
2752 public:
2753   static char ID; // Pass ID, replacement for typeid
2754   explicit LoopStrengthReduce(const TargetLowering *tli = NULL);
2755
2756 private:
2757   bool runOnLoop(Loop *L, LPPassManager &LPM);
2758   void getAnalysisUsage(AnalysisUsage &AU) const;
2759 };
2760
2761 }
2762
2763 char LoopStrengthReduce::ID = 0;
2764 static RegisterPass<LoopStrengthReduce>
2765 X("loop-reduce", "Loop Strength Reduction");
2766
2767 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
2768   return new LoopStrengthReduce(TLI);
2769 }
2770
2771 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
2772   : LoopPass(&ID), TLI(tli) {}
2773
2774 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
2775   // We split critical edges, so we change the CFG.  However, we do update
2776   // many analyses if they are around.
2777   AU.addPreservedID(LoopSimplifyID);
2778   AU.addPreserved<LoopInfo>();
2779   AU.addPreserved("domfrontier");
2780
2781   AU.addRequiredID(LoopSimplifyID);
2782   AU.addRequired<DominatorTree>();
2783   AU.addPreserved<DominatorTree>();
2784   AU.addRequired<ScalarEvolution>();
2785   AU.addPreserved<ScalarEvolution>();
2786   AU.addRequired<IVUsers>();
2787   AU.addPreserved<IVUsers>();
2788 }
2789
2790 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
2791   bool Changed = false;
2792
2793   // Run the main LSR transformation.
2794   Changed |= LSRInstance(TLI, L, this).getChanged();
2795
2796   // At this point, it is worth checking to see if any recurrence PHIs are also
2797   // dead, so that we can remove them as well.
2798   Changed |= DeleteDeadPHIs(L->getHeader());
2799
2800   return Changed;
2801 }