1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This transformation analyzes and transforms the induction variables (and
11 // computations derived from them) into forms suitable for efficient execution
14 // This pass performs a strength reduction on array references inside loops that
15 // have as one or more of their components the loop induction variable, it
16 // rewrites expressions to take advantage of scaled-index addressing modes
17 // available on the target, and it performs a variety of other optimizations
18 // related to loop induction variables.
20 // Terminology note: this code has a lot of handling for "post-increment" or
21 // "post-inc" users. This is not talking about post-increment addressing modes;
22 // it is instead talking about code like this:
24 // %i = phi [ 0, %entry ], [ %i.next, %latch ]
26 // %i.next = add %i, 1
27 // %c = icmp eq %i.next, %n
29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30 // it's useful to think about these as the same register, with some uses using
31 // the value of the register before the add and some using it after. In this
32 // example, the icmp is a post-increment user, since it uses %i.next, which is
33 // the value of the induction variable after the increment. The other common
34 // case of post-increment users is users outside the loop.
36 // TODO: More sophistication in the way Formulae are generated.
38 // TODO: Handle multiple loops at a time.
40 // TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
41 // that {0,+,1}<%bb> is getting picked first because all 7 uses can
42 // use it, and while it's a pretty good solution, it means that LSR
43 // doesn't look further to find an even better solution.
45 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
46 // instead of a GlobalValue?
48 // TODO: When truncation is free, truncate ICmp users' operands to make it a
49 // smaller encoding (on x86 at least).
51 // TODO: When a negated register is used by an add (such as in a list of
52 // multiple base registers, or as the increment expression in an addrec),
53 // we may not actually need both reg and (-1 * reg) in registers; the
54 // negation can be implemented by using a sub instead of an add. The
55 // lack of support for taking this into consideration when making
56 // register pressure decisions is partly worked around by the "Special"
59 //===----------------------------------------------------------------------===//
61 #define DEBUG_TYPE "loop-reduce"
62 #include "llvm/Transforms/Scalar.h"
63 #include "llvm/Constants.h"
64 #include "llvm/Instructions.h"
65 #include "llvm/IntrinsicInst.h"
66 #include "llvm/DerivedTypes.h"
67 #include "llvm/Analysis/IVUsers.h"
68 #include "llvm/Analysis/Dominators.h"
69 #include "llvm/Analysis/LoopPass.h"
70 #include "llvm/Analysis/ScalarEvolutionExpander.h"
71 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
72 #include "llvm/Transforms/Utils/Local.h"
73 #include "llvm/ADT/SmallBitVector.h"
74 #include "llvm/ADT/SetVector.h"
75 #include "llvm/Support/Debug.h"
76 #include "llvm/Support/ValueHandle.h"
77 #include "llvm/Support/raw_ostream.h"
78 #include "llvm/Target/TargetLowering.h"
84 // Constant strides come first, which in turn are sorted by their absolute
85 // values. If absolute values are the same, then positive strides come first.
87 // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
// StrideCompare - Comparison functor implementing the stride ordering
// described above, for use with ordered containers of SCEV strides.
88 struct StrideCompare {
89 const ScalarEvolution &SE;
90 explicit StrideCompare(const ScalarEvolution &se) : SE(se) {}
// Returns true if LHS should be ordered before RHS.
92 bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const {
// dyn_cast yields null for non-constant (symbolic) strides.
93 const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
94 const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
// Compare constants in a common bit width so that strides of mixed
// types order consistently.
96 unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()),
97 SE.getTypeSizeInBits(RHS->getType()));
// NOTE(review): LHSC/RHSC come from dyn_cast and may be null; this
// constant-vs-constant path presumably runs only under a both-non-null
// guard on an elided line above -- confirm.
98 APInt LV = LHSC->getValue()->getValue();
99 APInt RV = RHSC->getValue()->getValue();
100 LV.sextOrTrunc(BitWidth);
101 RV.sextOrTrunc(BitWidth);
// Absolute values implement the "sorted by absolute value" rule above.
102 APInt ALV = LV.abs();
103 APInt ARV = RV.abs();
111 // If it's the same value but different type, sort by bit width so
112 // that we emit larger induction variables before smaller
113 // ones, letting the smaller be re-written in terms of larger ones.
114 return SE.getTypeSizeInBits(RHS->getType()) <
115 SE.getTypeSizeInBits(LHS->getType());
// Otherwise a lone constant stride sorts before a symbolic one.
117 return LHSC && !RHSC;
121 /// RegSortData - This class holds data which is used to order reuse
125 /// Bits - This represents the set of LSRUses (by index) which reference a
126 /// particular register.
129 /// MaxComplexity - This represents the greatest complexity (see the comments
130 /// on Formula::getComplexity) seen with a particular register.
131 uint32_t MaxComplexity;
133 /// Index - This holds an arbitrary value used as a last-resort tie breaker
134 /// to ensure deterministic behavior.
139 void print(raw_ostream &OS) const;
// print - Render this RegSortData's fields in a human-readable form.
145 void RegSortData::print(raw_ostream &OS) const {
146 OS << "[NumUses=" << Bits.count()
147 << ", MaxComplexity=";
// MaxComplexity is printed in hex so the bit-field encoding produced by
// Formula::getComplexity is easier to read.
148 OS.write_hex(MaxComplexity);
149 OS << ", Index=" << Index << ']';
// dump - Print to stderr with a trailing newline, for use from a debugger.
152 void RegSortData::dump() const {
153 print(errs()); errs() << '\n';
158 /// RegCount - This is a helper class to sort a given set of registers
159 /// according to associated RegSortData values.
// Construct a (register, sort-data) pair for ordering.
165 RegCount(const SCEV *R, const RegSortData &RSD)
166 : Reg(R), Sort(RSD) {}
168 // Sort by count. Returning true means the register is preferred.
169 bool operator<(const RegCount &Other) const {
170 // Sort by the number of unique uses of this register.
171 unsigned A = Sort.Bits.count();
172 unsigned B = Other.Sort.Bits.count();
173 if (A != B) return A > B;
175 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
176 const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg);
177 // AddRecs have higher priority than other things.
178 if (!BR) return true;
180 // Prefer affine values.
181 if (AR->isAffine() != BR->isAffine())
182 return AR->isAffine();
184 const Loop *AL = AR->getLoop();
185 const Loop *BL = BR->getLoop();
187 unsigned ADepth = AL->getLoopDepth();
188 unsigned BDepth = BL->getLoopDepth();
189 // Prefer a less deeply nested addrec.
190 if (ADepth != BDepth)
191 return ADepth < BDepth;
193 // Different loops at the same depth; do something arbitrary.
194 BasicBlock *AH = AL->getHeader();
195 BasicBlock *BH = BL->getHeader();
// Scan forward through the function's block list from A's header;
// returning true iff A's header appears before B's, giving a
// deterministic (within this function) order.
196 for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I)
197 if (&*I == BH) return true;
201 // Sort addrecs by stride.
// Operand 1 of an addrec is its step (stride) expression.
202 const SCEV *AStep = AR->getOperand(1);
203 const SCEV *BStep = BR->getOperand(1);
204 if (AStep != BStep) {
205 if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) {
206 const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep);
207 if (!BC) return true;
208 // Arbitrarily prefer wider registers.
209 if (AC->getValue()->getValue().getBitWidth() !=
210 BC->getValue()->getValue().getBitWidth())
211 return AC->getValue()->getValue().getBitWidth() >
212 BC->getValue()->getValue().getBitWidth();
213 // Ignore the sign bit, assuming that striding by a negative value
214 // is just as easy as by a positive value.
215 // Prefer the addrec with the lesser absolute value stride, as it
216 // will allow uses to have simpler addressing modes.
217 return AC->getValue()->getValue().abs()
218 .ult(BC->getValue()->getValue().abs());
222 // Then sort by the register which will permit the simplest uses.
223 // This is a heuristic; currently we only track the most complex use as a
225 if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
226 return Sort.MaxComplexity < Other.Sort.MaxComplexity;
228 // Then sort them by their start values.
229 const SCEV *AStart = AR->getStart();
230 const SCEV *BStart = BR->getStart();
231 if (AStart != BStart) {
232 if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) {
233 const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart);
234 if (!BC) return true;
235 // Arbitrarily prefer wider registers.
236 if (AC->getValue()->getValue().getBitWidth() !=
237 BC->getValue()->getValue().getBitWidth())
238 return AC->getValue()->getValue().getBitWidth() >
239 BC->getValue()->getValue().getBitWidth();
240 // Prefer positive over negative if the absolute values are the same.
241 if (AC->getValue()->getValue().abs() ==
242 BC->getValue()->getValue().abs())
243 return AC->getValue()->getValue().isStrictlyPositive();
244 // Prefer the addrec with the lesser absolute value start.
245 return AC->getValue()->getValue().abs()
246 .ult(BC->getValue()->getValue().abs());
// This register is not an addrec; if the other one is, it wins.
250 // AddRecs have higher priority than other things.
251 if (isa<SCEVAddRecExpr>(Other.Reg)) return false;
252 // Sort by the register which will permit the simplest uses.
253 // This is a heuristic; currently we only track the most complex use as a
255 if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
256 return Sort.MaxComplexity < Other.Sort.MaxComplexity;
260 // Tie-breaker: the arbitrary index, to ensure a reliable ordering.
261 return Sort.Index < Other.Sort.Index;
264 void print(raw_ostream &OS) const;
270 void RegCount::print(raw_ostream &OS) const {
// dump - Print to stderr with a trailing newline, for use from a debugger.
275 void RegCount::dump() const {
276 print(errs()); errs() << '\n';
281 /// Formula - This class holds information that describes a formula for
282 /// satisfying a use. It may include broken-out immediates and scaled registers.
284 /// AM - This is used to represent complex addressing, as well as other kinds
285 /// of interesting uses.
286 TargetLowering::AddrMode AM;
288 /// BaseRegs - The list of "base" registers for this use. When this is
289 /// non-empty, AM.HasBaseReg should be set to true.
290 SmallVector<const SCEV *, 2> BaseRegs;
292 /// ScaledReg - The 'scaled' register for this use. This should be non-null
293 /// when AM.Scale is not zero.
294 const SCEV *ScaledReg;
296 Formula() : ScaledReg(0) {}
298 unsigned getNumRegs() const;
299 uint32_t getComplexity() const;
300 const Type *getType() const;
302 void InitialMatch(const SCEV *S, Loop *L,
303 ScalarEvolution &SE, DominatorTree &DT);
305 /// referencesReg - Test if this formula references the given register.
306 bool referencesReg(const SCEV *S) const {
// A register may appear either as the scaled register or among the
// base registers; SCEVs are uniqued, so pointer comparison suffices.
307 return S == ScaledReg ||
308 std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
// operator== - Structural equality on the fields that define a formula.
// NOTE(review): AM.HasBaseReg is not compared here; presumably it is
// implied by BaseRegs being non-empty -- confirm.
311 bool operator==(const Formula &Other) const {
312 return BaseRegs == Other.BaseRegs &&
313 ScaledReg == Other.ScaledReg &&
314 AM.Scale == Other.AM.Scale &&
315 AM.BaseOffs == Other.AM.BaseOffs &&
316 AM.BaseGV == Other.AM.BaseGV;
319 // This sorts at least partially based on host pointer values which are
320 // not deterministic, so it is only usable for uniquification.
// Lexicographic comparison over the same fields operator== examines.
321 bool operator<(const Formula &Other) const {
322 if (BaseRegs != Other.BaseRegs)
323 return BaseRegs < Other.BaseRegs;
324 if (ScaledReg != Other.ScaledReg)
325 return ScaledReg < Other.ScaledReg;
326 if (AM.Scale != Other.AM.Scale)
327 return AM.Scale < Other.AM.Scale;
328 if (AM.BaseOffs != Other.AM.BaseOffs)
329 return AM.BaseOffs < Other.AM.BaseOffs;
330 if (AM.BaseGV != Other.AM.BaseGV)
331 return AM.BaseGV < Other.AM.BaseGV;
335 void print(raw_ostream &OS) const;
341 /// getNumRegs - Return the total number of register operands used by this
342 /// formula. This does not include register uses implied by non-constant
344 unsigned Formula::getNumRegs() const {
// !!ScaledReg contributes 1 when a scaled register is present, else 0.
345 return !!ScaledReg + BaseRegs.size();
348 /// getComplexity - Return an oversimplified value indicating the complexity
349 /// of this formula. This is used as a tie-breaker in choosing register
// The result packs several fields into one uint32_t, most significant
// first, so that a plain integer compare orders formulae sensibly:
// [reg count | multi-base-reg | scale magnitude | scale sign |
//  offset/GV magnitude | offset sign].
351 uint32_t Formula::getComplexity() const {
352 // Encode the information in a uint32_t so that comparing with operator<
353 // will be interesting.
355 // Most significant, the number of registers. This saturates because we
356 // need the bits, and because beyond a few hundred it doesn't really matter.
357 (std::min(getNumRegs(), (1u<<15)-1) << 17) |
358 // Having multiple base regs is worse than having a base reg and a scale.
359 ((BaseRegs.size() > 1) << 16) |
360 // Scale absolute value.
361 ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) |
362 // Scale sign, which is less significant than the absolute value.
363 ((AM.Scale < 0) << 8) |
364 // Offset absolute value.
365 ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) |
366 // If a GV is present, treat it like a maximal offset.
367 ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) |
368 // Offset sign, which is less significant than the absolute offset.
369 ((AM.BaseOffs < 0) << 0);
372 /// getType - Return the type of this formula, if it has one, or null
373 /// otherwise. This type is meaningless except for the bit size.
374 const Type *Formula::getType() const {
// Try each component in turn: first base register, then scaled
// register, then the base GlobalValue.
375 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
376 ScaledReg ? ScaledReg->getType() :
377 AM.BaseGV ? AM.BaseGV->getType() :
383 /// ComplexitySorter - A predicate which orders Formulae by the number of
384 /// registers they contain.
385 struct ComplexitySorter {
386 bool operator()(const Formula &LHS, const Formula &RHS) const {
387 unsigned L = LHS.getNumRegs();
388 unsigned R = RHS.getNumRegs();
389 if (L != R) return L < R;
// Tie-break equal register counts with the encoded complexity value.
391 return LHS.getComplexity() < RHS.getComplexity();
397 /// DoInitialMatch - Recursion helper for InitialMatch.
// Recursively partition S into "Good" parts (which properly dominate the
// loop header, and so can live in a base register) and "Bad" parts.
398 static void DoInitialMatch(const SCEV *S, Loop *L,
399 SmallVectorImpl<const SCEV *> &Good,
400 SmallVectorImpl<const SCEV *> &Bad,
401 ScalarEvolution &SE, DominatorTree &DT) {
402 // Collect expressions which properly dominate the loop header.
403 if (S->properlyDominates(L->getHeader(), &DT)) {
408 // Look at add operands.
409 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
410 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
412 DoInitialMatch(*I, L, Good, Bad, SE, DT);
416 // Look at addrec operands.
417 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
418 if (!AR->getStart()->isZero()) {
// Split {Start,+,Step} into Start plus a zero-start addrec, and
// match each piece separately.
419 DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
420 DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
421 AR->getStepRecurrence(SE),
423 L, Good, Bad, SE, DT);
428 // Handle a multiplication by -1 (negation) if it didn't fold.
429 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
430 if (Mul->getOperand(0)->isAllOnesValue()) {
// Match the un-negated product, then re-apply the negation to each
// resulting piece.
431 SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
432 const SCEV *NewMul = SE.getMulExpr(Ops);
434 SmallVector<const SCEV *, 4> MyGood;
435 SmallVector<const SCEV *, 4> MyBad;
436 DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
437 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
438 SE.getEffectiveSCEVType(NewMul->getType())));
439 for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
440 E = MyGood.end(); I != E; ++I)
441 Good.push_back(SE.getMulExpr(NegOne, *I));
442 for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
443 E = MyBad.end(); I != E; ++I)
444 Bad.push_back(SE.getMulExpr(NegOne, *I));
448 // Ok, we can't do anything interesting. Just stuff the whole thing into a
449 // register and hope for the best.
453 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
454 /// attempting to keep all loop-invariant and loop-computable values in a
455 /// single base register.
456 void Formula::InitialMatch(const SCEV *S, Loop *L,
457 ScalarEvolution &SE, DominatorTree &DT) {
458 SmallVector<const SCEV *, 4> Good;
459 SmallVector<const SCEV *, 4> Bad;
460 DoInitialMatch(S, L, Good, Bad, SE, DT);
// Sum the "good" parts (those which properly dominate the loop header,
// per DoInitialMatch) into one base register.
462 BaseRegs.push_back(SE.getAddExpr(Good));
463 AM.HasBaseReg = true;
// Sum any remaining "bad" parts into another base register.
466 BaseRegs.push_back(SE.getAddExpr(Bad));
467 AM.HasBaseReg = true;
// print - Render the formula as a sum of its parts: base GV, base
// offset, base registers, and the scaled register.
471 void Formula::print(raw_ostream &OS) const {
// "First" tracks whether a " + " separator is needed between terms.
474 if (!First) OS << " + "; else First = false;
475 WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
477 if (AM.BaseOffs != 0) {
478 if (!First) OS << " + "; else First = false;
481 for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
482 E = BaseRegs.end(); I != E; ++I) {
483 if (!First) OS << " + "; else First = false;
489 if (!First) OS << " + "; else First = false;
490 OS << AM.Scale << "*reg(";
// dump - Print to stderr with a trailing newline, for use from a debugger.
499 void Formula::dump() const {
500 print(errs()); errs() << '\n';
503 /// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
504 /// or null otherwise. If IgnoreSignificantBits is true, expressions like
505 /// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may
506 /// overflow, which is useful when the result will be used in a context where
507 /// the most significant bits are ignored.
508 static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS,
510 bool IgnoreSignificantBits = false) {
511 // Handle the trivial case, which works for any SCEV type.
513 return SE.getIntegerSCEV(1, LHS->getType());
515 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
517 if (RHS->isAllOnesValue())
518 return SE.getMulExpr(LHS, RHS);
520 // Check for a division of a constant by a constant.
521 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
522 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
// A non-exact constant division is not representable; give up.
525 if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
527 return SE.getConstant(C->getValue()->getValue()
528 .sdiv(RC->getValue()->getValue()));
531 // Distribute the sdiv over addrec operands.
532 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
// Both the start and the step must divide exactly.
533 const SCEV *Start = getSDiv(AR->getStart(), RHS, SE,
534 IgnoreSignificantBits);
535 if (!Start) return 0;
536 const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE,
537 IgnoreSignificantBits);
539 return SE.getAddRecExpr(Start, Step, AR->getLoop());
542 // Distribute the sdiv over add operands.
543 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
544 SmallVector<const SCEV *, 8> Ops;
545 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
547 const SCEV *Op = getSDiv(*I, RHS, SE,
548 IgnoreSignificantBits);
552 return SE.getAddExpr(Ops);
555 // Check for a multiply operand that we can pull RHS out of.
// Without no-signed-wrap, cancelling a factor is only sound when the
// caller has said the high bits don't matter.
556 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
557 if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) {
558 SmallVector<const SCEV *, 4> Ops;
560 for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
563 if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) {
570 return Found ? SE.getMulExpr(Ops) : 0;
573 // Otherwise we don't know.
579 /// LSRUse - This class holds the state that LSR keeps for each use in
580 /// IVUsers, as well as uses invented by LSR itself. It includes information
581 /// about what kinds of things can be folded into the user, information
582 /// about the user itself, and information about how the use may be satisfied.
583 /// TODO: Represent multiple users of the same expression in common?
585 SmallSet<Formula, 8> FormulaeUniquifier;
588 /// KindType - An enum for a kind of use, indicating what types of
589 /// scaled and immediate operands it might support.
591 Basic, ///< A normal use, with no folding.
592 Special, ///< A special case of basic, allowing -1 scales.
593 Address, ///< An address use; folding according to TargetLowering
594 ICmpZero ///< An equality icmp with both operands folded into one.
595 // TODO: Add a generic icmp too?
599 const Type *AccessTy;
600 Instruction *UserInst;
601 Value *OperandValToReplace;
603 /// PostIncLoop - If this user is to use the post-incremented value of an
604 /// induction variable, this variable is non-null and holds the loop
605 /// associated with the induction variable.
606 const Loop *PostIncLoop;
608 /// Formulae - A list of ways to build a value that can satisfy this user.
609 /// After the list is populated, one of these is selected heuristically and
610 /// used to formulate a replacement for OperandValToReplace in UserInst.
611 SmallVector<Formula, 12> Formulae;
613 LSRUse() : Kind(Basic), AccessTy(0),
614 UserInst(0), OperandValToReplace(0), PostIncLoop(0) {}
616 void InsertInitialFormula(const SCEV *S, Loop *L,
617 ScalarEvolution &SE, DominatorTree &DT);
618 void InsertSupplementalFormula(const SCEV *S);
620 bool InsertFormula(const Formula &F);
622 void Rewrite(Loop *L, SCEVExpander &Rewriter,
623 SmallVectorImpl<WeakVH> &DeadInsts,
624 ScalarEvolution &SE, DominatorTree &DT,
627 void print(raw_ostream &OS) const;
631 Value *Expand(BasicBlock::iterator IP,
632 Loop *L, SCEVExpander &Rewriter,
633 SmallVectorImpl<WeakVH> &DeadInsts,
634 ScalarEvolution &SE, DominatorTree &DT) const;
639 /// ExtractImmediate - If S involves the addition of a constant integer value,
640 /// return that integer value, and mutate S to point to a new SCEV with that
642 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
643 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
644 if (C->getValue()->getValue().getMinSignedBits() <= 64) {
// The constant fits in an int64_t; replace S with zero and return it.
645 S = SE.getIntegerSCEV(0, C->getType());
646 return C->getValue()->getSExtValue();
648 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Recurse on the first operand -- presumably SCEV canonicalizes
// constants to the front of add expressions; confirm.
649 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
650 int64_t Result = ExtractImmediate(NewOps.front(), SE);
651 S = SE.getAddExpr(NewOps);
653 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// For an addrec, only the start operand (operand 0) can contribute an
// immediate.
654 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
655 int64_t Result = ExtractImmediate(NewOps.front(), SE);
656 S = SE.getAddRecExpr(NewOps, AR->getLoop());
662 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
663 /// return that symbol, and mutate S to point to a new SCEV with that
665 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
666 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
667 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
// Replace S with zero and hand back the global.
668 S = SE.getIntegerSCEV(0, GV->getType());
671 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Recurse on the last operand -- presumably SCEV canonicalizes
// SCEVUnknowns (such as globals) to the back of adds; confirm.
672 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
673 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
674 S = SE.getAddExpr(NewOps);
676 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// For an addrec, only the start operand (operand 0) can contribute a
// symbol.
677 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
678 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
679 S = SE.getAddRecExpr(NewOps, AR->getLoop());
685 /// isLegalUse - Test whether the use described by AM is "legal", meaning
686 /// it can be completely folded into the user instruction at isel time.
687 /// This includes address-mode folding and special icmp tricks.
688 static bool isLegalUse(const TargetLowering::AddrMode &AM,
689 LSRUse::KindType Kind, const Type *AccessTy,
690 const TargetLowering *TLI) {
692 case LSRUse::Address:
693 // If we have low-level target information, ask the target if it can
694 // completely fold this address.
695 if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
697 // Otherwise, just guess that reg+reg addressing is legal.
698 return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
700 case LSRUse::ICmpZero:
701 // There's not even a target hook for querying whether it would be legal
702 // to fold a GV into an ICmp.
706 // ICmp only has two operands; don't allow more than two non-trivial parts.
707 if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
710 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale
711 // by putting the scaled register in the other operand of the icmp.
712 if (AM.Scale != 0 && AM.Scale != -1)
715 // If we have low-level target information, ask the target if it can
716 // fold an integer immediate on an icmp.
// The offset is negated because folding moves it to the other side of
// the compare.
717 if (AM.BaseOffs != 0) {
718 if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
725 // Only handle single-register values.
726 return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
728 case LSRUse::Special:
729 // Only handle -1 scales, or no scale.
730 return AM.Scale == 0 || AM.Scale == -1;
// isAlwaysFoldable - Test whether S can always be folded into a user of
// the given Kind, regardless of what register pressure looks like.
736 static bool isAlwaysFoldable(const SCEV *S,
738 LSRUse::KindType Kind, const Type *AccessTy,
739 const TargetLowering *TLI,
740 ScalarEvolution &SE) {
741 // Fast-path: zero is always foldable.
742 if (S->isZero()) return true;
744 // Conservatively, create an address with an immediate and a
746 TargetLowering::AddrMode AM;
// ExtractImmediate/ExtractSymbol mutate S, stripping out the pieces
// they recognize.
747 AM.BaseOffs = ExtractImmediate(S, SE);
748 AM.BaseGV = ExtractSymbol(S, SE);
749 AM.HasBaseReg = HasBaseReg;
750 AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
752 // If there's anything else involved, it's not foldable.
// After the extraction above, a fully consumed S has become zero.
753 if (!S->isZero()) return false;
755 return isLegalUse(AM, Kind, AccessTy, TLI);
758 /// InsertFormula - If the given formula has not yet been inserted, add it
759 /// to the list, and return true. Return false otherwise.
760 bool LSRUse::InsertFormula(const Formula &F) {
763 // Sort the base regs, to avoid adding the same solution twice with
764 // the base regs in different orders. This uses host pointer values, but
765 // it doesn't matter since it's only used for uniquifying.
766 std::sort(Copy.BaseRegs.begin(), Copy.BaseRegs.end());
768 DEBUG(for (SmallVectorImpl<const SCEV *>::const_iterator I =
769 F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
770 assert(!(*I)->isZero() && "Zero allocated in a base register!");
771 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
772 "Zero allocated in a scaled register!"));
// The sorted Copy is used only as the uniquifier key; the original,
// unsorted F is what gets stored in Formulae.
774 if (FormulaeUniquifier.insert(Copy)) {
775 Formulae.push_back(F);
// InsertInitialFormula - Build the canonical initial Formula for S via
// Formula::InitialMatch and add it; it must not already be present.
783 LSRUse::InsertInitialFormula(const SCEV *S, Loop *L,
784 ScalarEvolution &SE, DominatorTree &DT) {
786 F.InitialMatch(S, L, SE, DT);
787 bool Inserted = InsertFormula(F);
788 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
// InsertSupplementalFormula - Add a trivial Formula holding S as a single
// base register; it must not already be present.
792 LSRUse::InsertSupplementalFormula(const SCEV *S) {
794 F.BaseRegs.push_back(S);
795 F.AM.HasBaseReg = true;
796 bool Inserted = InsertFormula(F);
797 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
800 /// getImmediateDominator - A handy utility for the specific DominatorTree
801 /// query that we need here.
// Returns the basic block of BB's immediate dominator node.
803 static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
804 DomTreeNode *Node = DT.getNode(BB);
806 Node = Node->getIDom();
808 return Node->getBlock();
// Expand - Emit IR computing the winning Formula's value at (or above) the
// insertion point IP, and return the resulting Value. For ICmpZero uses,
// also rewrites the icmp's other operand in place.
811 Value *LSRUse::Expand(BasicBlock::iterator IP,
812 Loop *L, SCEVExpander &Rewriter,
813 SmallVectorImpl<WeakVH> &DeadInsts,
814 ScalarEvolution &SE, DominatorTree &DT) const {
815 // Then, collect some instructions which we will remain dominated by when
816 // expanding the replacement. These must be dominated by any operands that
817 // will be required in the expansion.
818 SmallVector<Instruction *, 4> Inputs;
819 if (Instruction *I = dyn_cast<Instruction>(OperandValToReplace))
821 if (Kind == ICmpZero)
823 dyn_cast<Instruction>(cast<ICmpInst>(UserInst)->getOperand(1)))
// A post-inc use outside the loop must stay below the latch's increment.
825 if (PostIncLoop && !L->contains(UserInst))
826 Inputs.push_back(L->getLoopLatch()->getTerminator());
828 // Then, climb up the immediate dominator tree as far as we can go while
829 // still being dominated by the input positions.
831 bool AllDominate = true;
832 Instruction *BetterPos = 0;
833 BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
835 Instruction *Tentative = IDom->getTerminator();
836 for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
837 E = Inputs.end(); I != E; ++I) {
838 Instruction *Inst = *I;
839 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
// If the input lives in the candidate block itself, the position just
// after the input is still acceptable.
843 if (IDom == Inst->getParent() &&
844 (!BetterPos || DT.dominates(BetterPos, Inst)))
845 BetterPos = next(BasicBlock::iterator(Inst));
// Don't insert among a block's leading PHI nodes.
854 while (isa<PHINode>(IP)) ++IP;
856 // The first formula in the list is the winner.
857 const Formula &F = Formulae.front();
859 // Inform the Rewriter if we have a post-increment use, so that it can
860 // perform an advantageous expansion.
861 Rewriter.setPostInc(PostIncLoop);
863 // This is the type that the user actually needs.
864 const Type *OpTy = OperandValToReplace->getType();
865 // This will be the type that we'll initially expand to.
866 const Type *Ty = F.getType();
868 // No type known; just expand directly to the ultimate type.
870 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
871 // Expand directly to the ultimate type if it's the right size.
873 // This is the type to do integer arithmetic in.
874 const Type *IntTy = SE.getEffectiveSCEVType(Ty);
876 // Build up a list of operands to add together to form the full base.
877 SmallVector<const SCEV *, 8> Ops;
879 // Expand the BaseRegs portion.
880 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
881 E = F.BaseRegs.end(); I != E; ++I) {
882 const SCEV *Reg = *I;
883 assert(!Reg->isZero() && "Zero allocated in a base register!");
885 // If we're expanding for a post-inc user for the add-rec's loop, make the
886 // post-inc adjustment.
887 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg))
888 if (AR->getLoop() == PostIncLoop)
889 Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE))
891 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
894 // Expand the ScaledReg portion.
895 Value *ICmpScaledV = 0;
896 if (F.AM.Scale != 0) {
897 const SCEV *ScaledS = F.ScaledReg;
899 // If we're expanding for a post-inc user for the add-rec's loop, make the
900 // post-inc adjustment.
901 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS))
902 if (AR->getLoop() == PostIncLoop)
903 ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE));
905 if (Kind == ICmpZero) {
906 // An interesting way of "folding" with an icmp is to use a negated
907 // scale, which we'll implement by inserting it into the other operand
909 assert(F.AM.Scale == -1 &&
910 "The only scale supported by ICmpZero uses is -1!");
// Remember the expanded value; it becomes the icmp's other operand below.
911 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
913 // Otherwise just expand the scaled register and an explicit scale,
914 // which is expected to be matched as part of the address.
915 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
916 const Type *ScaledTy = SE.getEffectiveSCEVType(ScaledS->getType());
917 ScaledS = SE.getMulExpr(ScaledS,
918 SE.getSCEV(ConstantInt::get(ScaledTy,
920 Ops.push_back(ScaledS);
924 // Expand the immediate portions.
926 Ops.push_back(SE.getSCEV(F.AM.BaseGV));
927 if (F.AM.BaseOffs != 0) {
928 if (Kind == ICmpZero) {
929 // The other interesting way of "folding" with an ICmpZero is to use a
930 // negated immediate.
932 ICmpScaledV = ConstantInt::get(IntTy, -F.AM.BaseOffs);
934 Ops.push_back(SE.getUnknown(ICmpScaledV));
935 ICmpScaledV = ConstantInt::get(IntTy, F.AM.BaseOffs);
938 // Just add the immediate values. These again are expected to be matched
939 // as part of the address.
940 Ops.push_back(SE.getSCEV(ConstantInt::get(IntTy, F.AM.BaseOffs)));
944 // Emit instructions summing all the operands.
945 const SCEV *FullS = Ops.empty() ?
946 SE.getIntegerSCEV(0, IntTy) :
948 Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
950 // We're done expanding now, so reset the rewriter.
951 Rewriter.setPostInc(0);
953 // An ICmpZero Formula represents an ICmp which we're handling as a
954 // comparison against zero. Now that we've expanded an expression for that
955 // form, update the ICmp's other operand.
956 if (Kind == ICmpZero) {
957 ICmpInst *CI = cast<ICmpInst>(UserInst);
// The old second operand is replaced below; queue it for cleanup.
958 DeadInsts.push_back(CI->getOperand(1));
959 assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
960 "a scale at the same time!");
961 if (F.AM.Scale == -1) {
// Cast the expanded scaled value to the user's type if needed.
962 if (ICmpScaledV->getType() != OpTy) {
964 CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
966 ICmpScaledV, OpTy, "tmp", CI);
969 CI->setOperand(1, ICmpScaledV);
971 assert(F.AM.Scale == 0 &&
972 "ICmp does not support folding a global value and "
973 "a scale at the same time!");
// No scaled register: compare against the negated immediate instead.
// The (uint64_t) cast avoids UB when negating INT64_MIN.
974 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
975 -(uint64_t)F.AM.BaseOffs);
976 if (C->getType() != OpTy)
977 C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
981 CI->setOperand(1, C);
988 /// Rewrite - Emit instructions for the leading candidate expression for this
989 /// LSRUse (this is called "expanding"), and update the UserInst to reference
990 /// the newly expanded value.
// NOTE(review): this excerpt is a numbered listing with some original lines
// dropped (the embedded numbering skips); closing braces and a few statements
// are not visible here.
991 void LSRUse::Rewrite(Loop *L, SCEVExpander &Rewriter,
992 SmallVectorImpl<WeakVH> &DeadInsts,
993 ScalarEvolution &SE, DominatorTree &DT,
995 const Type *OpTy = OperandValToReplace->getType();
997 // First, find an insertion point that dominates UserInst. For PHI nodes,
998 // find the nearest block which dominates all the relevant uses.
999 if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
// Cache of expanded values, one per predecessor block, so a value feeding
// several PHI entries from the same block is only expanded once.
1000 DenseMap<BasicBlock *, Value *> Inserted;
1001 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1002 if (PN->getIncomingValue(i) == OperandValToReplace) {
1003 BasicBlock *BB = PN->getIncomingBlock(i);
1005 // If this is a critical edge, split the edge so that we do not insert
1006 // the code on all predecessor/successor paths. We do this unless this
1007 // is the canonical backedge for this loop, which complicates post-inc
// Critical edges out of indirectbr terminators cannot be split, hence the
// isa<IndirectBrInst> guard below.
1009 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
1010 !isa<IndirectBrInst>(BB->getTerminator()) &&
1011 (PN->getParent() != L->getHeader() || !L->contains(BB))) {
1012 // Split the critical edge.
1013 BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
1015 // If PN is outside of the loop and BB is in the loop, we want to
1016 // move the block to be immediately before the PHI block, not
1017 // immediately after BB.
1018 if (L->contains(BB) && !L->contains(PN))
1019 NewBB->moveBefore(PN->getParent());
1021 // Splitting the edge can reduce the number of PHI entries we have.
1022 e = PN->getNumIncomingValues();
1024 i = PN->getBasicBlockIndex(BB);
// Insert a placeholder so we can detect whether this block was already
// handled; if so, reuse the previously expanded value.
1027 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
1028 Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
1030 PN->setIncomingValue(i, Pair.first->second);
// Expand at the predecessor's terminator so the value dominates the edge.
1032 Value *FullV = Expand(BB->getTerminator(),
1033 L, Rewriter, DeadInsts, SE, DT);
1035 // If this is reuse-by-noop-cast, insert the noop cast.
1036 if (FullV->getType() != OpTy)
1038 CastInst::Create(CastInst::getCastOpcode(FullV, false,
1040 FullV, OperandValToReplace->getType(),
1041 "tmp", BB->getTerminator());
// Record the expanded value both in the PHI and in the per-block cache.
1043 PN->setIncomingValue(i, FullV);
1044 Pair.first->second = FullV;
// Non-PHI users: expand directly at the user instruction.
1048 Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT);
1050 // If this is reuse-by-noop-cast, insert the noop cast.
1051 if (FullV->getType() != OpTy) {
1053 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
1054 FullV, OpTy, "tmp", UserInst);
1059 UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
// The old operand may now be dead; queue it for cleanup.
1062 DeadInsts.push_back(OperandValToReplace);
// print - Emit a human-readable description of this use: its kind, the
// user instruction, the operand being replaced, and the post-inc loop if any.
// NOTE(review): the listing drops some lines here (e.g. the switch header
// and the Address case label around original lines 1067/1071).
1065 void LSRUse::print(raw_ostream &OS) const {
1066 OS << "LSR Use: Kind=";
1068 case Basic: OS << "Basic"; break;
1069 case Special: OS << "Special"; break;
1070 case ICmpZero: OS << "ICmpZero"; break;
1072 OS << "Address of ";
1073 if (isa<PointerType>(AccessTy))
1074 OS << "pointer"; // the full pointer type could be really verbose
1079 OS << ", UserInst=";
1080 // Store is common and interesting enough to be worth special-casing.
1081 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1083 WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
// Void-typed instructions have no name to print, so print the opcode.
1084 } else if (UserInst->getType()->isVoidTy())
1085 OS << UserInst->getOpcodeName();
1087 WriteAsOperand(OS, UserInst, /*PrintType=*/false);
1089 OS << ", OperandValToReplace=";
1090 WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
// Presumably guarded by "if (PostIncLoop)" on a dropped line — the header
// dereference below would otherwise crash. TODO confirm against full source.
1093 OS << ", PostIncLoop=";
1094 WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
1098 void LSRUse::dump() const {
1099 print(errs()); errs() << '\n';
1104 /// Score - This class is used to measure and compare candidate formulae.
// NOTE(review): the class header and several member declarations (NumRegs,
// NumPhis, NumIVMuls, NumImms — initialized by the constructor below) are on
// dropped lines of this listing.
1109 unsigned NumBaseAdds;
// Default constructor: all counters start at zero.
1114 : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}
// RateInitial - Score the initial "fully reduced" solution across all uses.
1116 void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1117 ScalarEvolution &SE);
// Rate - Score the solution in which Reg is kept in a register; Bits flags
// which uses reference Reg.
1119 void Rate(const SCEV *Reg, const SmallBitVector &Bits,
1120 const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1121 ScalarEvolution &SE);
// operator< - Lexicographic "is better than" comparison of two scores.
1123 bool operator<(const Score &Other) const;
1125 void print_details(raw_ostream &OS, const SCEV *Reg,
1126 const SmallPtrSet<const SCEV *, 8> &Regs) const;
1128 void print(raw_ostream &OS) const;
// Internal helpers for tallying registers and formulae.
1132 void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
1134 void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
1142 /// RateRegister - Tally up interesting quantities from the given register.
1143 void Score::RateRegister(const SCEV *Reg,
1144 SmallPtrSet<const SCEV *, 8> &Regs,
// Only count each register once; insert() returns true on first insertion.
1146 if (Regs.insert(Reg))
1147 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// An addrec for this loop will be materialized as a PHI in the loop.
1148 NumPhis += AR->getLoop() == L;
1150 // Add the step value register, if it needs one.
// A constant step of an affine addrec folds into the increment; anything
// else requires its own register, so recurse to count it.
1151 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
1152 RateRegister(AR->getOperand(1), Regs, L);
// RateFormula - Tally the registers and immediates a formula would require.
// NOTE(review): the listing drops a few lines here (e.g. the guard before
// rating F.ScaledReg, and the immediate increment at the end).
1156 void Score::RateFormula(const Formula &F,
1157 SmallPtrSet<const SCEV *, 8> &Regs,
1159 // Tally up the registers.
1161 RateRegister(F.ScaledReg, Regs, L);
1162 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1163 E = F.BaseRegs.end(); I != E; ++I) {
1164 const SCEV *BaseReg = *I;
1165 RateRegister(BaseReg, Regs, L);
// A multiply that varies with this loop implies an in-loop multiply.
1167 NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
1168 BaseReg->hasComputableLoopEvolution(L);
// N base registers require N-1 add instructions to combine.
1171 if (F.BaseRegs.size() > 1)
1172 NumBaseAdds += F.BaseRegs.size() - 1;
1174 // Tally up the non-zero immediates.
1175 if (F.AM.BaseGV || F.AM.BaseOffs != 0)
1179 /// Loose - Set this score to a losing value.
// NOTE(review): the body (presumably setting each counter to a worst-case
// value) is on dropped lines of this listing.
1180 void Score::Loose() {
1188 /// RateInitial - Compute a score for the initial "fully reduced" solution.
1189 void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
1190 ScalarEvolution &SE) {
// Rate the leading (front) formula of every use, sharing one register set
// so registers common to several uses are only counted once.
1191 SmallPtrSet<const SCEV *, 8> Regs;
1192 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
1193 E = Uses.end(); I != E; ++I)
1194 RateFormula(I->Formulae.front(), Regs, L);
1195 NumRegs += Regs.size();
1197 DEBUG(print_details(dbgs(), 0, Regs));
1200 /// Rate - Compute a score for the solution where the reuse associated with
1201 /// putting Reg in a register is selected.
// NOTE(review): some lines are dropped from this listing (e.g. the else
// branch header, the BestFormula assignment inside the loop, and the
// continue/return statements).
1202 void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
1203 const SmallVector<LSRUse, 16> &Uses, const Loop *L,
1204 ScalarEvolution &SE) {
1205 SmallPtrSet<const SCEV *, 8> Regs;
1206 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
1207 const LSRUse &LU = Uses[i];
1209 const Formula *BestFormula = 0;
// Bits indexes uses that reference Reg; out-of-range or clear means this
// use is unaffected by the proposed reuse.
1210 if (i >= Bits.size() || !Bits.test(i))
1211 // This use doesn't use the current register. Just go with the current
1212 // leading candidate formula.
1213 BestFormula = &LU.Formulae.front();
1215 // Find the best formula for this use that uses the current register.
1216 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
1217 E = LU.Formulae.end(); I != E; ++I) {
1218 const Formula &F = *I;
1219 if (F.referencesReg(Reg) &&
1220 (!BestFormula || ComplexitySorter()(F, *BestFormula)))
1224 // If we didn't find *any* formulae, because earlier we eliminated some
1225 // in greedy fashion, skip the current register's reuse opportunity.
1227 DEBUG(dbgs() << "Reuse with reg " << *Reg
1228 << " wouldn't help any users.\n");
1233 // For an in-loop post-inc user, don't allow multiple base registers,
1234 // because that would require an awkward in-loop add after the increment.
1235 if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
1236 BestFormula->BaseRegs.size() > 1) {
1237 DEBUG(dbgs() << "Reuse with reg " << *Reg
1238 << " would require an in-loop post-inc add: ";
1239 BestFormula->dump());
1244 RateFormula(*BestFormula, Regs, L);
1246 NumRegs += Regs.size();
1248 DEBUG(print_details(dbgs(), Reg, Regs));
1251 /// operator< - Choose the better score.
1252 bool Score::operator<(const Score &Other) const {
1253 if (NumRegs != Other.NumRegs)
1254 return NumRegs < Other.NumRegs;
1255 if (NumPhis != Other.NumPhis)
1256 return NumPhis < Other.NumPhis;
1257 if (NumIVMuls != Other.NumIVMuls)
1258 return NumIVMuls < Other.NumIVMuls;
1259 if (NumBaseAdds != Other.NumBaseAdds)
1260 return NumBaseAdds < Other.NumBaseAdds;
1261 return NumImms < Other.NumImms;
// print_details - Debug helper: describe what selecting Reg (or the initial
// solution, when Reg is null) would cost, then list the registers involved.
// NOTE(review): lines between the header text and the register loop are
// dropped in this listing.
1264 void Score::print_details(raw_ostream &OS,
1266 const SmallPtrSet<const SCEV *, 8> &Regs) const {
1267 if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
1268 else OS << "The initial solution would require ";
1271 for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
1272 E = Regs.end(); I != E; ++I)
// print - Emit the score counters in a compact human-readable form,
// pluralizing each noun as needed.
// NOTE(review): the guard "if (... != 0)" lines preceding the NumPhis,
// NumIVMuls, and NumImms output appear to be dropped in this listing.
1277 void Score::print(raw_ostream &OS) const {
1278 OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
1280 OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s");
1282 OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
1283 if (NumBaseAdds != 0)
1284 OS << ", plus " << NumBaseAdds << " base add"
1285 << (NumBaseAdds == 1 ? "" : "s");
1287 OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s");
1290 void Score::dump() const {
1291 print(errs()); errs() << '\n';
1294 /// isAddressUse - Returns true if the specified instruction is using the
1295 /// specified value as an address.
// NOTE(review): this listing drops the "isAddress = true;" assignments and
// the final return, so only the structure of the checks is visible.
1296 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
// Loads always use their (sole) pointer operand as an address.
1297 bool isAddress = isa<LoadInst>(Inst);
// For stores, only the pointer operand (operand 1) is an address use;
// storing the value itself is not.
1298 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1299 if (SI->getOperand(1) == OperandVal)
1301 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1302 // Addressing modes can also be folded into prefetches and a variety
// of x86-specific unaligned load/store intrinsics, all of which take
// their pointer as operand 1.
1304 switch (II->getIntrinsicID()) {
1306 case Intrinsic::prefetch:
1307 case Intrinsic::x86_sse2_loadu_dq:
1308 case Intrinsic::x86_sse2_loadu_pd:
1309 case Intrinsic::x86_sse_loadu_ps:
1310 case Intrinsic::x86_sse_storeu_ps:
1311 case Intrinsic::x86_sse2_storeu_pd:
1312 case Intrinsic::x86_sse2_storeu_dq:
1313 case Intrinsic::x86_sse2_storel_dq:
1314 if (II->getOperand(1) == OperandVal)
1322 /// getAccessType - Return the type of the memory being accessed.
// NOTE(review): the default switch case, closing braces, and the final
// return are on dropped lines of this listing.
1323 static const Type *getAccessType(const Instruction *Inst) {
// For loads this is simply the instruction's own result type.
1324 const Type *AccessTy = Inst->getType();
// For stores, it's the type of the value being stored (operand 0).
1325 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
1326 AccessTy = SI->getOperand(0)->getType();
1327 else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1328 // Addressing modes can also be folded into prefetches and a variety
// of x86 store intrinsics; use the pointer operand's type for those.
1330 switch (II->getIntrinsicID()) {
1332 case Intrinsic::x86_sse_storeu_ps:
1333 case Intrinsic::x86_sse2_storeu_pd:
1334 case Intrinsic::x86_sse2_storeu_dq:
1335 case Intrinsic::x86_sse2_storel_dq:
1336 AccessTy = II->getOperand(1)->getType();
1343 /// DeleteTriviallyDeadInstructions - If any of the instructions in the
1344 /// specified set are trivially dead, delete them and see if this makes any of
1345 /// their operands subsequently dead.
// NOTE(review): the return type line, the "continue;" after the liveness
// check, the Changed update, and the final return are dropped from this
// listing.
1347 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
1348 bool Changed = false;
1350 while (!DeadInsts.empty()) {
// WeakVH entries may have been nulled out if the instruction was already
// deleted, hence dyn_cast_or_null.
1351 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
1353 if (I == 0 || !isInstructionTriviallyDead(I))
// Deleting I may make its instruction operands dead in turn; queue them
// for another pass through the worklist.
1356 for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
1357 if (Instruction *U = dyn_cast<Instruction>(*OI)) {
1360 DeadInsts.push_back(U);
1363 I->eraseFromParent();
1372 /// LSRInstance - This class holds state for the main loop strength
1373 /// reduction logic.
// NOTE(review): the class header line and several member declarations (IU,
// DT, L, Changed, RegUses, P, ...) fall on dropped lines of this listing.
1376 ScalarEvolution &SE;
1378 const TargetLowering *const TLI;
1382 /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
1383 /// arbitrary index value to each register as a sort tie breaker.
1384 unsigned CurrentArbitraryRegIndex;
1386 /// Factors - Interesting factors between use strides.
1387 SmallSetVector<int64_t, 4> Factors;
1389 /// Types - Interesting use types, to facilitate truncation reuse.
1390 SmallSetVector<const Type *, 4> Types;
1392 /// Uses - The list of interesting uses.
1393 SmallVector<LSRUse, 16> Uses;
1395 // TODO: Reorganize these data structures.
1396 typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
// RegSequence records registers in first-seen order, for deterministic
// iteration alongside the RegUses map.
1398 SmallVector<const SCEV *, 16> RegSequence;
// Early IV optimizations performed before formula generation.
1400 void OptimizeShadowIV();
1401 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
1402 const SCEV* &CondStride);
1403 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1404 bool StrideMightBeShared(const SCEV* Stride);
1405 bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos);
// getNewUse - Append a fresh LSRUse to the Uses list.
1407 LSRUse &getNewUse() {
1408 Uses.push_back(LSRUse());
// Register bookkeeping helpers.
1412 void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
1413 void CountRegisters(const Formula &F, size_t LUIdx);
// Formula-generation passes; each explores one kind of reuse.
1415 void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1417 void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1419 void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
1421 const Formula &Base, unsigned i,
1422 const SmallVectorImpl<const SCEV *>
1424 void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
1426 void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
1428 void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
1430 void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
1433 void GenerateConstantOffsetReuse();
1435 void GenerateAllReuseFormulae();
1437 void GenerateLoopInvariantRegisterUses();
1440 LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1442 bool getChanged() const { return Changed; }
1444 void print(raw_ostream &OS) const;
1450 /// OptimizeShadowIV - If IV is used in an int-to-float cast
1451 /// inside the loop then try to eliminate the cast operation.
// NOTE(review): this excerpt drops a number of lines (the "continue;"
// statements, the non-preheader entry/latch case, and closing braces).
1452 void LSRInstance::OptimizeShadowIV() {
// Bail out if the trip count is unknown; the transformation below relies
// on the loop being analyzable.
1453 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1454 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1457 for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
1458 StrideIdx != e; ++StrideIdx) {
1459 std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1460 IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1461 assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
// Only constant strides can be mirrored by a float IV below.
1462 if (!isa<SCEVConstant>(SI->first))
1465 for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1466 E = SI->second->Users.end(); UI != E; /* empty */) {
1467 ilist<IVStrideUse>::iterator CandidateUI = UI;
1469 Instruction *ShadowUse = CandidateUI->getUser();
1470 const Type *DestTy = NULL;
1472 /* If shadow use is a int->float cast then insert a second IV
1473 to eliminate this cast.
1475 for (unsigned i = 0; i < n; ++i)
1481 for (unsigned i = 0; i < n; ++i, ++d)
// Recognize both unsigned and signed int->fp casts.
1484 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
1485 DestTy = UCast->getDestTy();
1486 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
1487 DestTy = SCast->getDestTy();
1488 if (!DestTy) continue;
1491 // If target does not support DestTy natively then do not apply
1492 // this transformation.
1493 EVT DVT = TLI->getValueType(DestTy);
1494 if (!TLI->isTypeLegal(DVT)) continue;
// The cast's operand must be a simple two-entry PHI (entry + latch).
1497 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1499 if (PH->getNumIncomingValues() != 2) continue;
// The FP type must have enough mantissa bits to represent every integer
// value the IV can take, or the shadow IV would lose precision.
1501 const Type *SrcTy = PH->getType();
1502 int Mantissa = DestTy->getFPMantissaWidth();
1503 if (Mantissa == -1) continue;
1504 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
// Identify which incoming edge is the loop entry and which is the latch.
1507 unsigned Entry, Latch;
1508 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1516 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1517 if (!Init) continue;
1518 Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
// The latch value must be a simple add/sub increment of the PHI.
1520 BinaryOperator *Incr =
1521 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1522 if (!Incr) continue;
1523 if (Incr->getOpcode() != Instruction::Add
1524 && Incr->getOpcode() != Instruction::Sub)
1527 /* Initialize new IV, double d = 0.0 in above example. */
1528 ConstantInt *C = NULL;
1529 if (Incr->getOperand(0) == PH)
1530 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1531 else if (Incr->getOperand(1) == PH)
1532 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1538 // Ignore negative constants, as the code below doesn't handle them
1539 // correctly. TODO: Remove this restriction.
1540 if (!C->getValue().isStrictlyPositive()) continue;
1542 /* Add new PHINode. */
1543 PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
1545 /* create new increment. '++d' in above example. */
1546 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1547 BinaryOperator *NewIncr =
1548 BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1549 Instruction::FAdd : Instruction::FSub,
1550 NewPH, CFP, "IV.S.next.", Incr);
1552 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1553 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1555 /* Remove cast operation */
1556 ShadowUse->replaceAllUsesWith(NewPH);
1557 ShadowUse->eraseFromParent();
1563 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1564 /// set the IV user and stride information and return true, otherwise return
/// false.
1566 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
1567 IVStrideUse *&CondUse,
1568 const SCEV* &CondStride) {
// Scan every recorded stride's users looking for Cond; stop as soon as
// CondUse has been set.
1569 for (unsigned StrideIdx = 0, e = IU.StrideOrder.size();
1570 StrideIdx != e && !CondUse; ++StrideIdx) {
1571 std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1572 IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
1573 assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
1575 for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
1576 E = SI->second->Users.end(); UI != E; ++UI)
1577 if (UI->getUser() == Cond) {
1578 // NOTE: we could handle setcc instructions with multiple uses here, but
1579 // InstCombine does it as well for simple uses, it's not clear that it
1580 // occurs enough in real life to handle.
// (The CondUse assignment falls on a dropped line of this listing.)
1582 CondStride = SI->first;
1589 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1590 /// a max computation.
1592 /// This is a narrow solution to a specific, but acute, problem. For loops
1598 ///   } while (++i < n);
1600 /// the trip count isn't just 'n', because 'n' might not be positive. And
1601 /// unfortunately this can come up even for loops where the user didn't use
1602 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1603 /// will commonly be lowered like this:
1609 ///   } while (++i < n);
1612 /// and then it's possible for subsequent optimization to obscure the if
1613 /// test in such a way that indvars can't find it.
1615 /// When indvars can't find the if test in loops like this, it creates a
1616 /// max expression, which allows it to give the loop a canonical
1617 /// induction variable:
1620 ///   max = n < 1 ? 1 : n;
1623 ///   } while (++i != max);
1625 /// Canonical induction variables are necessary because the loop passes
1626 /// are designed around them. The most obvious example of this is the
1627 /// LoopInfo analysis, which doesn't remember trip count values. It
1628 /// expects to be able to rediscover the trip count each time it is
1629 /// needed, and it does this using a simple analysis that only succeeds if
1630 /// the loop has a canonical induction variable.
1632 /// However, when it comes time to generate code, the maximum operation
1633 /// can be quite costly, especially if it's inside of an outer loop.
1635 /// This function solves this problem by detecting this type of loop and
1636 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1637 /// the instructions for the maximum computation.
// NOTE(review): several "return Cond;" early-exit lines and the final return
// are dropped from this listing.
1639 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1640 // Check that the loop matches the pattern we're looking for.
1641 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1642 Cond->getPredicate() != CmpInst::ICMP_NE)
// The RHS must be a single-use select (the materialized max).
1645 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1646 if (!Sel || !Sel->hasOneUse()) return Cond;
1648 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1649 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1651 const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());
1653 // Add one to the backedge-taken count to get the trip count.
1654 const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
1656 // Check for a max calculation that matches the pattern.
1657 if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
1659 const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
// The select must compute exactly this max expression.
1660 if (Max != SE.getSCEV(Sel)) return Cond;
1662 // To handle a max with more than two operands, this optimization would
1663 // require additional checking and setup.
1664 if (Max->getNumOperands() != 2)
1667 const SCEV *MaxLHS = Max->getOperand(0);
1668 const SCEV *MaxRHS = Max->getOperand(1);
1669 if (!MaxLHS || MaxLHS != One) return Cond;
1670 // Check the relevant induction variable for conformance to
// the pattern: an affine {1,+,1} addrec for this loop.
1672 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1673 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1674 if (!AR || !AR->isAffine() ||
1675 AR->getStart() != One ||
1676 AR->getStepRecurrence(SE) != One)
1679 assert(AR->getLoop() == L &&
1680 "Loop condition operand is an addrec in a different loop!");
1682 // Check the right operand of the select, and remember it, as it will
1683 // be used in the new comparison instruction.
1685 if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1686 NewRHS = Sel->getOperand(1);
1687 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1688 NewRHS = Sel->getOperand(2);
1689 if (!NewRHS) return Cond;
1691 // Determine the new comparison opcode. It may be signed or unsigned,
1692 // and the original comparison may be either equality or inequality.
1693 CmpInst::Predicate Pred =
1694 isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
1695 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1696 Pred = CmpInst::getInversePredicate(Pred);
1698 // Ok, everything looks ok to change the condition into an SLT or SGE and
1699 // delete the max calculation.
1701 new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1703 // Delete the max calculation instructions.
1704 Cond->replaceAllUsesWith(NewCond);
1705 CondUse->setUser(NewCond);
1706 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1707 Cond->eraseFromParent();
1708 Sel->eraseFromParent();
// The compare feeding the select may now be dead too.
1709 if (Cmp->use_empty())
1710 Cmp->eraseFromParent();
// StrideMightBeShared - Return true if some other stride's pre-increment
// users could legally share an addressing mode based on Stride, making it
// unprofitable to convert Stride's cond use to post-increment form.
// NOTE(review): some lines are dropped in this listing (e.g. the condition
// guarding the "can definitely be reused" return, AM.Scale setup, and the
// final return).
1714 bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
1715 int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
1716 for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
1717 std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
1718 IU.IVUsesByStride.find(IU.StrideOrder[i]);
1719 const SCEV *Share = SI->first;
// Only compare against other constant strides.
1720 if (!isa<SCEVConstant>(SI->first) || Share == Stride)
1722 int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
1724 return true; // This can definitely be reused.
// The other stride must be an exact (and not smaller) multiple of ours.
1725 if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
1727 int64_t Scale = SSInt / SInt;
1729 // This AM will be used for conservative queries. At this point in the
1730 // process we don't know which users will have a base reg, immediate,
1731 // etc., so we conservatively assume that it may not, making more
1732 // strides valid, thus erring on the side of assuming that there
1733 // might be sharing.
1734 TargetLowering::AddrMode AM;
1737 // Any pre-inc iv use?
1738 IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
1739 for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1740 E = StrideUses.Users.end(); I != E; ++I) {
1741 bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
1742 if (!I->isUseOfPostIncrementedValue() &&
1743 isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
1744 isAddress ? getAccessType(I->getUser()) : 0,
1752 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1753 /// postinc iv when possible.
// NOTE(review): this excerpt drops a number of lines (the return type, some
// "continue;" statements, and closing braces).
1755 LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) {
// Record each compare converted to post-inc form, so the IV increment's
// insertion point can be chosen to dominate them all.
1756 SmallPtrSet<Instruction *, 4> PostIncs;
1758 BasicBlock *LatchBlock = L->getLoopLatch();
1759 SmallVector<BasicBlock*, 8> ExitingBlocks;
1760 L->getExitingBlocks(ExitingBlocks);
1762 for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1763 BasicBlock *ExitingBlock = ExitingBlocks[i];
1765 // Get the terminating condition for the loop if possible. If we
1766 // can, we want to change it to use a post-incremented version of its
1767 // induction variable, to allow coalescing the live ranges for the IV into
1768 // one register value.
1770 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1773 // FIXME: Overly conservative, termination condition could be an 'or' etc..
1774 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1777 // Search IVUsesByStride to find Cond's IVUse if there is one.
1778 IVStrideUse *CondUse = 0;
1779 const SCEV *CondStride = 0;
1780 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1781 if (!FindIVUserForCond(Cond, CondUse, CondStride))
1784 // If the trip count is computed in terms of a max (due to ScalarEvolution
1785 // being unable to find a sufficient guard, for example), change the loop
1786 // comparison to use SLT or ULT instead of NE.
1787 // One consequence of doing this now is that it disrupts the count-down
1788 // optimization. That's not always a bad thing though, because in such
1789 // cases it may still be worthwhile to avoid a max.
1790 Cond = OptimizeMax(Cond, CondUse);
1792 // If this exiting block is the latch block, and the condition has only
1793 // one use inside the loop (the branch), use the post-incremented value
1794 // of the induction variable
1795 if (ExitingBlock != LatchBlock) {
1796 // If this exiting block dominates the latch block, it may also use
1797 // the post-inc value if it won't be shared with other uses.
1798 // Check for dominance.
1799 if (!DT.dominates(ExitingBlock, LatchBlock))
1801 // Check for sharing within the same stride.
1802 bool SameStrideSharing = false;
1803 IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
1804 for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
1805 E = StrideUses.Users.end(); I != E; ++I) {
1806 if (I->getUser() == Cond)
// Any other pre-inc user of this stride means sharing; bail out below.
1808 if (!I->isUseOfPostIncrementedValue()) {
1809 SameStrideSharing = true;
1813 if (SameStrideSharing)
1815 // Check for sharing from a different stride.
1816 if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
// The compare may only be used by the branch inside the loop; other
// in-loop users would still need the pre-inc value.
1819 if (!Cond->hasOneUse()) {
1820 bool HasOneUseInLoop = true;
1821 for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
1823 Instruction *U = cast<Instruction>(*UI);
1826 if (L->contains(U)) {
1827 HasOneUseInLoop = false;
1831 if (!HasOneUseInLoop)
1835 DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1838 // It's possible for the setcc instruction to be anywhere in the loop, and
1839 // possible for it to have multiple users. If it is not immediately before
1840 // the exiting block branch, move it.
1841 if (&*++BasicBlock::iterator(Cond) != TermBr) {
1842 if (Cond->hasOneUse()) { // Condition has a single use, just move it.
1843 Cond->moveBefore(TermBr);
1845 // Otherwise, clone the terminating condition and insert into the
// exiting block, leaving the original for the remaining users.
1847 ICmpInst *OldCond = Cond;
1848 Cond = cast<ICmpInst>(Cond->clone());
1849 Cond->setName(L->getHeader()->getName() + ".termcond");
1850 ExitingBlock->getInstList().insert(TermBr, Cond);
1852 // Clone the IVUse, as the old use still exists!
1853 IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
1854 CondUse->getOperandValToReplace());
1855 CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
1856 TermBr->replaceUsesOfWith(OldCond, Cond);
1860 // If we get to here, we know that we can transform the setcc instruction to
1861 // use the post-incremented version of the IV, allowing us to coalesce the
1862 // live ranges for the IV correctly.
1863 CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
1864 CondUse->setIsUseOfPostIncrementedValue(true);
1867 PostIncs.insert(Cond);
1870 // Determine an insertion point for the loop induction variable increment. It
1871 // must dominate all the post-inc comparisons we just set up, and it must
1872 // dominate the loop latch edge.
1873 IVIncInsertPos = L->getLoopLatch()->getTerminator();
1874 for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
1875 E = PostIncs.end(); I != E; ++I) {
1877 DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
// If the common dominator is the compare's own block, insert right at the
// compare; otherwise insert at the dominator block's terminator.
1879 if (BB == (*I)->getParent())
1880 IVIncInsertPos = *I;
1881 else if (BB != IVIncInsertPos->getParent())
1882 IVIncInsertPos = BB->getTerminator();
1888 /// CountRegister - Note the given register.
// NOTE(review): the "if (Pair.second)"/"else" lines branching on whether
// the register was newly inserted are dropped from this listing.
1889 void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
1891 std::pair<RegUsesTy::iterator, bool> Pair =
1892 RegUses.insert(std::make_pair(Reg, RegSortData()));
1893 RegSortData &BV = Pair.first->second;
// First time we've seen this register: give it the next arbitrary index
// (deterministic sort tie-breaker) and append it to RegSequence.
1895 BV.Index = CurrentArbitraryRegIndex++;
1896 BV.MaxComplexity = Complexity;
1897 RegSequence.push_back(Reg);
// Already known: just keep the maximum complexity seen so far.
1899 BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
// Record that the use at LUIdx references this register.
1901 BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
1905 /// CountRegisters - Note which registers are used by the given formula,
1906 /// updating RegUses.
1907 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
1908 uint32_t Complexity = F.getComplexity();
// Count the scaled register (a guard line appears to be dropped here in
// this listing) and each base register.
1910 CountRegister(F.ScaledReg, Complexity, LUIdx);
1911 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
1912 E = F.BaseRegs.end(); I != E; ++I)
1913 CountRegister(*I, Complexity, LUIdx);
1916 /// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
/// (GlobalValue) offsets extracted from the base registers.
1918 void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
1920 // We can't add a symbolic offset if the address already contains one.
1921 if (Base.AM.BaseGV) return;
1923 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
1924 const SCEV *G = Base.BaseRegs[i];
// Try to peel a GlobalValue out of this base register expression.
1925 GlobalValue *GV = ExtractSymbol(G, SE);
// (The formula construction lines fall on dropped lines of this listing.)
1930 if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
// InsertFormula returns true when the formula was actually new.
1933 if (LU.InsertFormula(F))
1934 CountRegisters(LU.Formulae.back(), LUIdx);
1938 /// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
1939 /// the comparison. For example, x == y -> x*c == y*c.
1940 void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
1942 if (LU.Kind != LSRUse::ICmpZero) return;
1944 // Determine the integer type for the base formula.
1945 const Type *IntTy = Base.getType();
// Overflow checks below are done in 64-bit arithmetic; wider types can't
// be validated that way.
1947 if (SE.getTypeSizeInBits(IntTy) > 64) return;
1948 IntTy = SE.getEffectiveSCEVType(IntTy);
1950 assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
1952 // Check each interesting stride.
1953 for (SmallSetVector<int64_t, 4>::const_iterator
1954 I = Factors.begin(), E = Factors.end(); I != E; ++I) {
1955 int64_t Factor = *I;
1958 // Check that the multiplication doesn't overflow.
// Multiply then divide back: if we don't recover the original value,
// the product overflowed and this factor is skipped.
1959 F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
1960 if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)
1963 // Check that this scale is legal.
1964 if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
1967 const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
1969 // Check that multiplying with each base register doesn't overflow.
1970 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
1971 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
1972 if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
1976 // Check that multiplying with the scaled register doesn't overflow.
1978 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
1979 if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
1983 // If we make it here and it's legal, add it.
1984 if (LU.InsertFormula(F))
1985 CountRegisters(LU.Formulae.back(), LUIdx)
1990 /// GenerateFormulaeFromReplacedBaseReg - If removing base register with
1991 /// index i from the BaseRegs list and adding the registers in AddOps
1992 /// to the address forms an interesting formula, pursue it.
1994 LSRInstance::GenerateFormulaeFromReplacedBaseReg(
1997 const Formula &Base, unsigned i,
1998 const SmallVectorImpl<const SCEV *>
2000 if (AddOps.empty()) return;
// Remove base register i via swap-with-back, then append the replacements.
2003 std::swap(F.BaseRegs[i], F.BaseRegs.back());
2004 F.BaseRegs.pop_back();
2005 for (SmallVectorImpl<const SCEV *>::const_iterator I = AddOps.begin(),
2006 E = AddOps.end(); I != E; ++I)
2007 F.BaseRegs.push_back(*I);
2008 F.AM.HasBaseReg = !F.BaseRegs.empty();
2009 if (LU.InsertFormula(F)) {
2010 CountRegisters(LU.Formulae.back(), LUIdx);
// Recurse: the new formula may itself be further reassociable.
2012 GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back());
2016 /// GenerateReassociationReuse - Split out subexpressions from adds and
2017 /// the bases of addrecs.
// NOTE(review): this extraction elides some original lines (gaps in the
// embedded numbering); code kept byte-identical to what is visible.
2018 void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
2020 SmallVector<const SCEV *, 8> AddOps;
2021 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2022 const SCEV *BaseReg = Base.BaseRegs[i];
// Case 1: the base register is itself an add; try pulling each operand out.
2023 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BaseReg)) {
2024 for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2026 // Don't pull a constant into a register if the constant could be
2027 // folded into an immediate field.
2028 if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue;
2029 SmallVector<const SCEV *, 8> InnerAddOps;
2030 for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2032 InnerAddOps.push_back(*K);
2033 // Splitting a 2-operand add both ways is redundant. Pruning this
2034 // now saves compile time.
2035 if (InnerAddOps.size() < 2 && next(J) == JE)
2037 AddOps.push_back(*J);
2038 const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2039 AddOps.push_back(InnerAdd);
2040 GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
// Case 2: the base register is an addrec whose start is an add; split the
// start the same way, keeping the recurrence on one of the pieces.
2043 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BaseReg)) {
2044 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(AR->getStart())) {
2045 for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
2047 // Don't pull a constant into a register if the constant could be
2048 // folded into an immediate field.
2049 if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE))
2051 SmallVector<const SCEV *, 8> InnerAddOps;
2052 for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
2054 InnerAddOps.push_back(*K);
2055 AddOps.push_back(*J);
2056 const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
2057 AddOps.push_back(SE.getAddRecExpr(InnerAdd,
2058 AR->getStepRecurrence(SE),
2060 GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
// Case 3: a non-add addrec start; split start from the zero-start recurrence.
2063 } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1,
2064 LU.Kind, LU.AccessTy,
2066 AddOps.push_back(AR->getStart());
2067 AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0,
2068 BaseReg->getType()),
2069 AR->getStepRecurrence(SE),
2071 GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
2078 /// GenerateCombinationReuse - Generate a formula consisting of all of the
2079 /// loop-dominating registers added into a single register.
// NOTE(review): some original lines are elided in this extraction
// (e.g. 2081, 2083-2085); code kept byte-identical to what is visible.
2080 void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
// Nothing to combine with fewer than two base registers.
2082 if (Base.BaseRegs.size() <= 1) return;
2086 SmallVector<const SCEV *, 4> Ops;
2087 for (SmallVectorImpl<const SCEV *>::const_iterator
2088 I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2089 const SCEV *BaseReg = *I;
// Only loop-invariant registers that dominate the header are candidates
// for folding together; loop-varying registers are kept as-is.
2090 if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
2091 !BaseReg->hasComputableLoopEvolution(L))
2092 Ops.push_back(BaseReg);
2094 F.BaseRegs.push_back(BaseReg);
2096 if (Ops.size() > 1) {
2097 F.BaseRegs.push_back(SE.getAddExpr(Ops));
2098 if (LU.InsertFormula(F))
2099 CountRegisters(LU.Formulae.back(), LUIdx);
2103 /// GenerateScaledReuse - Generate stride factor reuse formulae by making
2104 /// use of scaled-offset address modes, for example.
// NOTE(review): some original lines are elided in this extraction
// (gaps in the embedded numbering); code kept byte-identical.
2105 void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
2107 // Determine the integer type for the base formula.
2108 const Type *IntTy = Base.getType();
2110 IntTy = SE.getEffectiveSCEVType(IntTy);
2112 // Check each interesting stride.
2113 for (SmallSetVector<int64_t, 4>::const_iterator
2114 I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2115 int64_t Factor = *I;
2117 // If this Formula already has a scaled register, we can't add another one.
2118 if (Base.AM.Scale != 0)
2121 F.AM.Scale = Factor;
2122 // Check whether this scale is going to be legal.
2123 if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) {
2124 // As a special-case, handle special out-of-loop Basic users specially.
2125 // TODO: Reconsider this special case.
2126 if (LU.Kind == LSRUse::Basic &&
2127 isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) &&
2128 !L->contains(LU.UserInst))
// Downgrade the use's kind so the scale becomes acceptable.
2129 LU.Kind = LSRUse::Special;
2133 // For each addrec base reg, apply the scale, if possible.
2134 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2135 if (const SCEVAddRecExpr *AR =
2136 dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2137 const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
2138 // Divide out the factor, ignoring high bits, since we'll be
2139 // scaling the value back up in the end.
2140 if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
2141 // TODO: This could be optimized to avoid all the copying.
2143 NewF.ScaledReg = Quotient;
// Remove the now-scaled base register via swap-and-pop.
2144 std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back());
2145 NewF.BaseRegs.pop_back();
2146 NewF.AM.HasBaseReg = !NewF.BaseRegs.empty();
2147 if (LU.InsertFormula(NewF))
2148 CountRegisters(LU.Formulae.back(), LUIdx);
2154 /// GenerateTruncateReuse - Generate reuse formulae from different IV types.
// NOTE(review): some original lines are elided in this extraction
// (e.g. 2156, 2158-2159); code kept byte-identical to what is visible.
2155 void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
2157 // This requires TargetLowering to tell us which truncates are free.
2160 // Don't attempt to truncate symbolic values.
2161 if (Base.AM.BaseGV) return;
2163 // Determine the integer type for the base formula.
2164 const Type *DstTy = Base.getType();
2166 DstTy = SE.getEffectiveSCEVType(DstTy);
// Try each interesting type; a formula computed in SrcTy is usable when
// the target can truncate SrcTy to DstTy for free.
2168 for (SmallSetVector<const Type *, 4>::const_iterator
2169 I = Types.begin(), E = Types.end(); I != E; ++I) {
2170 const Type *SrcTy = *I;
2171 if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2173 if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2174 for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2175 JE = F.BaseRegs.end(); J != JE; ++J)
2176 *J = SE.getAnyExtendExpr(*J, SrcTy);
2177 if (LU.InsertFormula(F))
2178 CountRegisters(LU.Formulae.back(), LUIdx);
2185 /// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
2186 /// defer modifications so that the search phase doesn't have to worry about
2187 /// the data structures moving underneath it.
// NOTE(review): the struct header and the LU/LUIdx/Imm member declarations
// (original lines 2188-2191) are elided in this extraction; only the lines
// below are visible, kept byte-identical.
// The register whose other-offset siblings motivated this work item.
2192 const SCEV *OrigReg;
2194 WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R)
2195 : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {}
2197 void print(raw_ostream &OS) const;
// print - Describe this deferred modification: the target use (by index),
// the immediate offset to fold in, and the register whose references are
// adjusted to compensate. Fixes the typo "refences" in the emitted text.
// NOTE(review): original lines 2202-2203 are elided in this extraction.
2201 void WorkItem::print(raw_ostream &OS) const {
2204 OS << " (at index " << LUIdx << "), add offset " << Imm
2205 << " and compensate by adjusting references to " << *OrigReg << "\n";
// dump - Debugging aid: print this work item to standard error.
2208 void WorkItem::dump() const {
2209 print(errs()); errs() << '\n';
2214 /// GenerateConstantOffsetReuse - Look for registers which are a constant
2215 /// distance apart and try to form reuse opportunities between them.
// NOTE(review): some original lines are elided in this extraction (gaps in
// the embedded numbering); code kept byte-identical to what is visible.
2216 void LSRInstance::GenerateConstantOffsetReuse() {
2217 // Group the registers by their value without any added constant offset.
2218 typedef std::map<int64_t, const SCEV *> ImmMapTy;
2219 typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2221 SmallVector<const SCEV *, 8> Sequence;
2222 for (SmallVectorImpl<const SCEV *>::iterator I = RegSequence.begin(),
2223 E = RegSequence.end(); I != E; ++I) {
2224 const SCEV *Reg = *I;
// ExtractImmediate strips the constant offset out of Reg and returns it.
2225 int64_t Imm = ExtractImmediate(Reg, SE);
2226 std::pair<RegMapTy::iterator, bool> Pair =
2227 Map.insert(std::make_pair(Reg, ImmMapTy()));
2229 Sequence.push_back(Reg);
2230 Pair.first->second.insert(std::make_pair(Imm, *I));
2233 // Insert an artificial expression at offset 0 (if there isn't one already),
2234 // as this may lead to more reuse opportunities.
2235 for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2236 E = Sequence.end(); I != E; ++I)
2237 Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0));
2239 // Now examine each set of registers with the same base value. Build up
2240 // a list of work to do and do the work in a separate step so that we're
2241 // not adding formulae and register counts while we're searching.
2242 SmallVector<WorkItem, 32> WorkItems;
2243 for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2244 E = Sequence.end(); I != E; ++I) {
2245 const SCEV *Reg = *I;
2246 const ImmMapTy &Imms = Map.find(Reg)->second;
2247 // Examine each offset.
2248 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2250 const SCEV *OrigReg = J->second;
2251 // Skip the artificial register at offset 0 (marked by a null SCEV).
2252 if (!OrigReg) continue;
2254 int64_t JImm = J->first;
2255 const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits;
2257 // Examine each other offset associated with the same register. This is
2258 // quadratic in the number of registers with the same base, but it's
2259 // uncommon for this to be a large number.
2260 for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) {
2261 if (M == J) continue;
2263 // Compute the difference between the two.
2264 int64_t Imm = (uint64_t)JImm - M->first;
2265 for (int LUIdx = Bits.find_first(); LUIdx != -1;
2266 LUIdx = Bits.find_next(LUIdx))
2267 // Make a memo of this use, offset, and register tuple.
2268 WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg));
2273 // Now iterate through the worklist and add new formulae.
2274 for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2275 E = WorkItems.end(); I != E; ++I) {
2276 const WorkItem &WI = *I;
2277 LSRUse &LU = *WI.LU;
2278 size_t LUIdx = WI.LUIdx;
2279 int64_t Imm = WI.Imm;
2280 const SCEV *OrigReg = WI.OrigReg;
2282 const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2283 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy,
2286 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2287 Formula F = LU.Formulae[L];
2288 // Use the immediate in the scaled register.
2289 if (F.ScaledReg == OrigReg) {
// The scaled register contributes Imm*Scale to the offset.
2290 int64_t Offs = (uint64_t)F.AM.BaseOffs +
2291 Imm * (uint64_t)F.AM.Scale;
2292 // Don't create 50 + reg(-50).
2293 if (F.referencesReg(SE.getSCEV(
2294 ConstantInt::get(IntTy, -(uint64_t)Offs))))
2297 NewF.AM.BaseOffs = Offs;
2298 if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2300 const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2301 if (Diff->isZero()) continue;
2302 NewF.ScaledReg = Diff;
2303 if (LU.InsertFormula(NewF))
2304 CountRegisters(LU.Formulae.back(), LUIdx);
2306 // Use the immediate in a base register.
2307 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2308 const SCEV *BaseReg = F.BaseRegs[N];
2309 if (BaseReg != OrigReg)
2312 NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2313 if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
2315 const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg);
2316 if (Diff->isZero()) continue;
2317 // Don't create 50 + reg(-50).
2319 SE.getSCEV(ConstantInt::get(IntTy,
2320 -(uint64_t)NewF.AM.BaseOffs)))
2322 NewF.BaseRegs[N] = Diff;
2323 if (LU.InsertFormula(NewF))
2324 CountRegisters(LU.Formulae.back(), LUIdx);
2330 /// GenerateAllReuseFormulae - Generate formulae for each use.
// Each generator below is run over a snapshot of the formula list: the loop
// bound f is captured before the generator appends new formulae, so newly
// inserted formulae are not re-fed into the same generator here.
2332 LSRInstance::GenerateAllReuseFormulae() {
2333 SmallVector<Formula, 12> Save;
2334 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2335 LSRUse &LU = Uses[LUIdx];
2337 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2338 GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]);
2339 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2340 GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]);
2341 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2342 GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]);
2343 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2344 GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]);
2345 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2346 GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]);
2347 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2348 GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]);
// Constant-offset reuse works across all uses, so it runs once at the end.
2351 GenerateConstantOffsetReuse();
2354 /// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant
2355 /// values which we're tracking. These other uses will pin these values in
2356 /// registers, making them less profitable for elimination.
2357 /// TODO: This currently misses non-constant addrec step registers.
2358 /// TODO: Should this give more weight to users inside the loop?
// NOTE(review): some original lines are elided in this extraction
// (gaps in the embedded numbering); code kept byte-identical.
2360 LSRInstance::GenerateLoopInvariantRegisterUses() {
2361 for (size_t i = 0, e = RegSequence.size(); i != e; ++i)
2362 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(RegSequence[i])) {
2363 const Value *V = U->getValue();
2364 if (const Instruction *Inst = dyn_cast<Instruction>(V))
2365 if (L->contains(Inst)) continue;
2366 for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
2368 const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2369 // Ignore non-instructions.
2372 // Ignore instructions in other functions (as can happen with
2374 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2376 // Ignore instructions not dominated by the loop.
// For PHIs, the effective use point is the incoming block, not the PHI.
2377 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2378 UserInst->getParent() :
2379 cast<PHINode>(UserInst)->getIncomingBlock(
2380 PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2381 if (!DT.dominates(L->getHeader(), UseBB))
2383 // Ignore uses which are part of other SCEV expressions, to avoid
2384 // analyzing them multiple times.
2385 if (SE.isSCEVable(UserInst->getType()) &&
2386 !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
2388 // Ignore icmp instructions which are already being analyzed.
2389 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
// getOperandNo() is 0 or 1; !x picks the other icmp operand.
2390 unsigned OtherIdx = !UI.getOperandNo();
2391 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2392 if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
// Record a trivial use so the register voting sees this pinned value.
2396 LSRUse &LU = getNewUse();
2397 LU.UserInst = const_cast<Instruction *>(UserInst);
2398 LU.OperandValToReplace = UI.getUse();
2399 LU.InsertSupplementalFormula(U);
2400 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
// debug_winner - Debug helper: print the first (winning) formula chosen for
// each use. NOTE(review): some original lines are elided in this extraction.
2407 static void debug_winner(SmallVector<LSRUse, 16> const &Uses) {
2408 dbgs() << "LSR has selected formulae for each use:\n";
2409 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2410 E = Uses.end(); I != E; ++I) {
2411 const LSRUse &LU = *I;
// The winning formula is the first one after sorting.
2416 LU.Formulae.front().print(dbgs());
// LSRInstance constructor - Runs the entire LSR pipeline for one loop:
// collect uses and strides, generate candidate formulae, score register
// reuse opportunities, prune losing formulae, and rewrite the users.
// NOTE(review): many original lines are elided in this extraction (gaps in
// the embedded numbering); visible code is kept byte-identical.
2423 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
2424 : IU(P->getAnalysis<IVUsers>()),
2425 SE(P->getAnalysis<ScalarEvolution>()),
2426 DT(P->getAnalysis<DominatorTree>()),
2427 TLI(tli), L(l), Changed(false), CurrentArbitraryRegIndex(0) {
2429 // If LoopSimplify form is not available, stay out of trouble.
2430 if (!L->isLoopSimplifyForm()) return;
2432 // If there's no interesting work to be done, bail early.
2433 if (IU.IVUsesByStride.empty()) return;
2435 DEBUG(dbgs() << "\nLSR on loop ";
2436 WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
2439 // Sort the StrideOrder so we process larger strides first.
2440 std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(),
2443 /// OptimizeShadowIV - If IV is used in an int-to-float cast
2444 /// inside the loop then try to eliminate the cast operation.
2447 // Change loop terminating condition to use the postinc iv when possible.
2448 Instruction *IVIncInsertPos;
2449 Changed |= OptimizeLoopTermCond(IVIncInsertPos);
2451 for (SmallVectorImpl<const SCEV *>::const_iterator SIter =
2452 IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end();
2453 SIter != SEnd; ++SIter) {
2454 const SCEV *Stride = *SIter;
2456 // Collect interesting types.
2457 Types.insert(SE.getEffectiveSCEVType(Stride->getType()));
2459 // Collect interesting factors.
// A factor is the constant ratio between two strides; the strides are
// sign-extended to a common width before dividing.
2460 for (SmallVectorImpl<const SCEV *>::const_iterator NewStrideIter =
2461 SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) {
2462 const SCEV *OldStride = Stride;
2463 const SCEV *NewStride = *NewStrideIter;
2465 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2466 SE.getTypeSizeInBits(NewStride->getType())) {
2467 if (SE.getTypeSizeInBits(OldStride->getType()) >
2468 SE.getTypeSizeInBits(NewStride->getType()))
2469 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2471 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2473 if (const SCEVConstant *Factor =
2474 dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
2476 if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2477 Factors.insert(Factor->getValue()->getValue().getSExtValue());
2480 std::map<const SCEV *, IVUsersOfOneStride *>::const_iterator SI =
2481 IU.IVUsesByStride.find(Stride);
2482 assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
2483 for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
2484 E = SI->second->Users.end(); UI != E; ++UI) {
2486 LSRUse &LU = getNewUse();
2487 LU.UserInst = UI->getUser();
2488 LU.OperandValToReplace = UI->getOperandValToReplace();
2489 if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) {
2490 LU.Kind = LSRUse::Address;
2491 LU.AccessTy = getAccessType(LU.UserInst);
2493 if (UI->isUseOfPostIncrementedValue())
2496 const SCEV *S = IU.getCanonicalExpr(*UI);
2498 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2499 // (N - i == 0), and this allows (N - i) to be the expression that we
2500 // work with rather than just N or i, so we can consider the register
2501 // requirements for both N and i at the same time. Limiting this code
2502 // to equality icmps is not a problem because all interesting loops
2503 // use equality icmps, thanks to IndVarSimplify.
2504 if (ICmpInst *CI = dyn_cast<ICmpInst>(LU.UserInst))
2505 if (CI->isEquality()) {
2506 // Swap the operands if needed to put the OperandValToReplace on
2507 // the left, for consistency.
2508 Value *NV = CI->getOperand(1);
2509 if (NV == LU.OperandValToReplace) {
2510 CI->setOperand(1, CI->getOperand(0));
2511 CI->setOperand(0, NV);
2514 // x == y --> x - y == 0
2515 const SCEV *N = SE.getSCEV(NV);
2516 if (N->isLoopInvariant(L)) {
2517 LU.Kind = LSRUse::ICmpZero;
2518 S = SE.getMinusSCEV(N, S);
2521 // -1 and the negations of all interesting strides (except the
2522 // negation of -1) are now also interesting.
2523 for (size_t i = 0, e = Factors.size(); i != e; ++i)
2524 if (Factors[i] != -1)
2525 Factors.insert(-(uint64_t)Factors[i]);
2529 // Ok, now enumerate all the different formulae we can find to compute
2530 // the value for this expression.
2531 LU.InsertInitialFormula(S, L, SE, DT);
2532 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2536 // If all uses use the same type, don't bother looking for truncation-based
2538 if (Types.size() == 1)
2541 // Now use the reuse data to generate a bunch of interesting ways
2542 // to formulate the values needed for the uses.
2543 GenerateAllReuseFormulae();
2545 // If there are any uses of registers that we're tracking that have escaped
2546 // IVUsers' attention, add trivial uses for them, so that the register
2547 // voting process takes them into consideration.
2548 GenerateLoopInvariantRegisterUses();
2550 // Sort the formulae. TODO: This is redundantly sorted below.
2551 for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(), E = Uses.end();
2554 std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2555 ComplexitySorter());
2558 // Ok, we've now collected all the uses and noted their register uses. The
2559 // next step is to start looking at register reuse possibilities.
2560 DEBUG(print(dbgs()); dbgs() << '\n');
2562 // Start by assuming we'll assign each use its own register. This is
2563 // sometimes called "full" strength reduction, or "superhero" mode.
2564 // Sometimes this is the best solution, but if there are opportunities for
2565 // reuse we may find a better solution.
2567 CurScore.RateInitial(Uses, L, SE);
2569 // Create a sorted list of registers with those with the most uses appearing
2570 // earlier in the list. We'll visit them first, as they're the most likely
2571 // to represent profitable reuse opportunities.
2572 SmallVector<RegCount, 8> RegOrder;
2573 for (SmallVectorImpl<const SCEV *>::const_iterator I =
2574 RegSequence.begin(), E = RegSequence.end(); I != E; ++I)
2575 RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second));
2576 std::stable_sort(RegOrder.begin(), RegOrder.end());
2578 // Visit each register. Determine which ones represent profitable reuse
2579 // opportunities and remember them.
2580 // TODO: Extract this code into a function.
2581 for (SmallVectorImpl<RegCount>::const_iterator I = RegOrder.begin(),
2582 E = RegOrder.end(); I != E; ++I) {
2583 const SCEV *Reg = I->Reg;
2584 const SmallBitVector &Bits = I->Sort.Bits;
2586 // Registers with only one use don't represent reuse opportunities, so
2587 // when we get there, we're done.
2588 if (Bits.count() <= 1) break;
2590 DEBUG(dbgs() << "Reg " << *Reg << ": ";
2591 I->Sort.print(dbgs());
2594 // Determine the total number of registers that will be needed if we make
2595 // use of the reuse opportunity represented by the current register.
2597 NewScore.Rate(Reg, Bits, Uses, L, SE);
2599 // Now decide whether this register's reuse opportunity is an overall win.
2600 // Currently the decision is heavily skewed for register pressure.
2601 if (!(NewScore < CurScore)) {
2605 // Ok, use this opportunity.
2606 DEBUG(dbgs() << "This candidate has been accepted.\n");
2607 CurScore = NewScore;
2609 // Now that we've selected a new reuse opportunity, delete formulae that
2610 // do not participate in that opportunity.
2611 for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) {
2612 LSRUse &LU = Uses[j];
2613 for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) {
2614 Formula &F = LU.Formulae[k];
2615 if (!F.referencesReg(Reg)) {
// Swap-and-pop removal of losing formulae.
2616 std::swap(LU.Formulae[k], LU.Formulae.back());
2617 LU.Formulae.pop_back();
2621 // Also re-sort the list to put the formulae with the fewest registers
2623 // TODO: Do this earlier, we don't need it each time.
2624 std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
2625 ComplexitySorter());
2629 // Ok, we've now made all our decisions. The first formula for each use
2631 DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs());
2633 debug_winner(Uses));
2635 // Free memory no longer needed.
2640 RegSequence.clear();
2642 // Keep track of instructions we may have made dead, so that
2643 // we can remove them after we are done working.
2644 SmallVector<WeakVH, 16> DeadInsts;
2646 SCEVExpander Rewriter(SE);
2647 Rewriter.disableCanonicalMode();
2648 Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
2650 // Expand the new value definitions and update the users.
2651 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2652 E = Uses.end(); I != E; ++I) {
2653 // Formulae should be legal.
2654 DEBUG(for (SmallVectorImpl<Formula>::const_iterator J = I->Formulae.begin(),
2655 JE = I->Formulae.end(); J != JE; ++J)
2656 assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) &&
2657 "Illegal formula generated!"));
2659 // Expand the new code and update the user.
2660 I->Rewrite(L, Rewriter, DeadInsts, SE, DT, P);
2664 // Clean up after ourselves. This must be done before deleting any
2668 Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
// print - Dump the collected factors, types, uses, and candidate formulae.
// NOTE(review): some original lines are elided in this extraction.
2671 void LSRInstance::print(raw_ostream &OS) const {
2672 OS << "LSR has identified the following interesting factors and types: ";
2675 for (SmallSetVector<int64_t, 4>::const_iterator
2676 I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2677 if (!First) OS << ", ";
2682 for (SmallSetVector<const Type *, 4>::const_iterator
2683 I = Types.begin(), E = Types.end(); I != E; ++I) {
2684 if (!First) OS << ", ";
2686 OS << '(' << **I << ')';
2690 OS << "LSR is examining the following uses, and candidate formulae:\n";
2691 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
2692 E = Uses.end(); I != E; ++I) {
2693 const LSRUse &LU = *I;
2697 for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
2698 JE = LU.Formulae.end(); J != JE; ++J) {
// dump - Debugging aid: print this LSR instance's state to standard error.
2706 void LSRInstance::dump() const {
2707 print(errs()); errs() << '\n';
// LoopStrengthReduce - The pass wrapper; the real work is in LSRInstance.
// NOTE(review): access-specifier lines and the closing brace (original
// lines 2716-2717, 2720-2721, 2724+) are elided in this extraction.
2712 class LoopStrengthReduce : public LoopPass {
2713 /// TLI - Keep a pointer to a TargetLowering to consult for determining
2714 /// transformation profitability.
2715 const TargetLowering *const TLI;
2718 static char ID; // Pass ID, replacement for typeid
2719 explicit LoopStrengthReduce(const TargetLowering *tli = NULL);
2722 bool runOnLoop(Loop *L, LPPassManager &LPM);
2723 void getAnalysisUsage(AnalysisUsage &AU) const;
// Pass ID definition and static registration with the pass registry.
2728 char LoopStrengthReduce::ID = 0;
2729 static RegisterPass<LoopStrengthReduce>
2730 X("loop-reduce", "Loop Strength Reduction");
// createLoopStrengthReducePass - Public factory for this pass; TLI may be
// null when no target lowering information is available.
2732 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
2733 return new LoopStrengthReduce(TLI);
// Constructor - Stores the (possibly null) TargetLowering for later
// profitability queries; all other state lives in LSRInstance.
2736 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
2737 : LoopPass(&ID), TLI(tli) {}
2739 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
2740 // We split critical edges, so we change the CFG. However, we do update
2741 // many analyses if they are around.
2742 AU.addPreservedID(LoopSimplifyID);
2743 AU.addPreserved<LoopInfo>();
2744 AU.addPreserved("domfrontier");
// Required analyses are also preserved so the pass can be pipelined.
2746 AU.addRequiredID(LoopSimplifyID);
2747 AU.addRequired<DominatorTree>();
2748 AU.addPreserved<DominatorTree>();
2749 AU.addRequired<ScalarEvolution>();
2750 AU.addPreserved<ScalarEvolution>();
2751 AU.addRequired<IVUsers>();
2752 AU.addPreserved<IVUsers>();
2755 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
2756 bool Changed = false;
2758 // Run the main LSR transformation.
2759 Changed |= LSRInstance(TLI, L, this).getChanged();
2761 // At this point, it is worth checking to see if any recurrence PHIs are also
2762 // dead, so that we can remove them as well.
2763 Changed |= DeleteDeadPHIs(L->getHeader());