diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index fec15737dc7..a8e1f4579c7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8,11 +8,11 @@
 //===----------------------------------------------------------------------===//
 //
 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
 //
-// The loop vectorizer combines consecutive loop iteration into a single
+// The loop vectorizer combines consecutive loop iterations into a single
 // 'wide' iteration. After this transformation the index is incremented
 // by the SIMD vector width, and not by one.
 //
@@ -32,7 +32,7 @@
 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
 //
 // Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
 //
 // Other ideas/concepts are from:
 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
@@ -78,7 +78,9 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -86,6 +88,7 @@ #include <map>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc
@@ -101,17 +104,25 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 /// We don't vectorize loops with a known constant trip count below this number.
-static const unsigned TinyTripCountVectorThreshold = 16;
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+                             cl::Hidden,
+                             cl::desc("Don't vectorize loops with a constant "
+                                      "trip count that is smaller than this "
+                                      "value."));
 
 /// We don't unroll loops with a known constant trip count below this number.
 static const unsigned TinyTripCountUnrollThreshold = 128;
 
-/// We don't unroll loops that are larget than this threshold.
-static const unsigned MaxLoopSizeThreshold = 32;
+/// When performing memory disambiguation checks at runtime do not make more
+/// than this number of comparisons.
+static const unsigned RuntimeMemoryCheckThreshold = 8;
 
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
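
A source-level illustration of the "wide iteration" idea described in the header comment above: for a vectorization factor of 4, the rewrite below is what the transformation computes. This is a hypothetical C++ sketch, not output of the pass (which performs the equivalent rewrite on LLVM-IR), but it shows why a scalar epilogue is always needed for the n % VF leftover iterations.

#include <cstddef>

void saxpy_widened(float *a, const float *b, float k, size_t n) {
  size_t i = 0;
  // The 'wide' loop: one iteration does the work of four consecutive
  // scalar iterations, and the index advances by the SIMD width.
  for (; i + 4 <= n; i += 4) {
    a[i + 0] += k * b[i + 0];
    a[i + 1] += k * b[i + 1];
    a[i + 2] += k * b[i + 2];
    a[i + 3] += k * b[i + 3];
  }
  // Scalar epilogue: the remaining n % 4 iterations.
  for (; i < n; ++i)
    a[i] += k * b[i];
}
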
-static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// We use a metadata with this name to indicate that a scalar loop was
+/// vectorized and that we don't need to re-vectorize it if we run into it
+/// again.
+static const char*
+AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
 
 namespace {
 
@@ -136,10 +147,11 @@ class LoopVectorizationCostModel;
 class InnerLoopVectorizer {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+                      DominatorTree *DT, DataLayout *DL,
+                      const TargetLibraryInfo *TLI, unsigned VecWidth,
                       unsigned UnrollFactor)
-    : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
-      UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+    : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+      VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
       OldInduction(0), WidenMap(UnrollFactor) {}
 
   // Perform the actual loop widening (vectorization).
@@ -190,6 +202,10 @@ private:
   /// of scalars.
   void scalarizeInstruction(Instruction *Instr);
 
+  /// Vectorize Load and Store instructions.
+  void vectorizeMemoryInstruction(Instruction *Instr,
+                                  LoopVectorizationLegality *Legal);
+
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -200,7 +216,7 @@
   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
   /// The sequence starts at StartIndex.
-  Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -222,31 +238,34 @@
     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
 
     /// \return True if 'Key' is saved in the Value Map.
-    bool has(Value *Key) { return MapStoreage.count(Key); }
+    bool has(Value *Key) const { return MapStorage.count(Key); }
 
     /// Initializes a new entry in the map. Sets all of the vector parts to the
     /// same value in 'Val'.
     /// \return A reference to a vector with splat values.
     VectorParts &splat(Value *Key, Value *Val) {
-      MapStoreage[Key].clear();
-      MapStoreage[Key].append(UF, Val);
-      return MapStoreage[Key];
+      VectorParts &Entry = MapStorage[Key];
+      Entry.assign(UF, Val);
+      return Entry;
     }
 
     ///\return A reference to the value that is stored at 'Key'.
     VectorParts &get(Value *Key) {
-      if (!has(Key))
-        MapStoreage[Key].resize(UF);
-      return MapStoreage[Key];
+      VectorParts &Entry = MapStorage[Key];
+      if (Entry.empty())
+        Entry.resize(UF);
+      assert(Entry.size() == UF);
+      return Entry;
     }
 
+  private:
     /// The unroll factor. Each entry in the map stores this number of vector
     /// elements.
     unsigned UF;
 
     /// Map storage. We use std::map and not DenseMap because insertions to a
     /// dense map invalidate its iterators.
-    std::map<Value*, VectorParts> MapStoreage;
+    std::map<Value*, VectorParts> MapStorage;
   };
 
   /// The original loop.
@@ -259,6 +278,9 @@
   DominatorTree *DT;
   /// Data Layout.
   DataLayout *DL;
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
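
The ValueMap above keeps, for every scalar value, one widened value per unroll part. A self-contained analogue of its splat/get behavior (hypothetical sketch; int stands in for llvm::Value* so it compiles on its own):

#include <cassert>
#include <map>
#include <vector>

struct WidenMapSketch {
  unsigned UF; // unroll factor: number of entries stored per key
  std::map<const void *, std::vector<int>> Storage;

  explicit WidenMapSketch(unsigned UnrollFactor) : UF(UnrollFactor) {}

  // splat: all unroll parts share a single value.
  std::vector<int> &splat(const void *Key, int Val) {
    std::vector<int> &Entry = Storage[Key];
    Entry.assign(UF, Val);
    return Entry;
  }

  // get: lazily create an entry with UF slots on first use.
  std::vector<int> &get(const void *Key) {
    std::vector<int> &Entry = Storage[Key];
    if (Entry.empty())
      Entry.resize(UF);
    assert(Entry.size() == UF);
    return Entry;
  }
};
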
unsigned VF; @@ -290,10 +312,99 @@ private: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; + /// Holds the extended (to the widest induction type) start index. + Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; }; +/// \brief Check if conditionally executed loads are hoistable. +/// +/// This class has two functions. isHoistableLoad and canHoistAllLoads. +/// isHoistableLoad should be called on all load instructions that are executed +/// conditionally. After all conditional loads are processed, the client should +/// call canHoistAllLoads to determine if all of the conditional execute loads +/// have an unconditional memory access in the loop. +class LoadHoisting { + typedef SmallPtrSet MemorySet; + + Loop *TheLoop; + DominatorTree *DT; + MemorySet CondLoadAddrSet; + +public: + LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {} + + /// \brief Check if the instruction is a load with a identifiable address. + bool isHoistableLoad(Instruction *L); + + /// \brief Check if all of the conditional loads are hoistable because there + /// exists an unconditional memory access to the same address in the loop. + bool canHoistAllLoads(); +}; + +bool LoadHoisting::isHoistableLoad(Instruction *L) { + LoadInst *LI = dyn_cast(L); + if (!LI) + return false; + + CondLoadAddrSet.insert(LI->getPointerOperand()); + return true; +} + +static void addMemAccesses(BasicBlock *BB, SmallPtrSet &Set) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + Instruction *I = &*BI; + Value *Addr = 0; + + // Try a load. + LoadInst *LI = dyn_cast(I); + if (LI) { + Addr = LI->getPointerOperand(); + Set.insert(Addr); + continue; + } + + // Try a store. + StoreInst *SI = dyn_cast(I); + if (!SI) + continue; + + Addr = SI->getPointerOperand(); + Set.insert(Addr); + } +} + +bool LoadHoisting::canHoistAllLoads() { + // No conditional loads. + if (CondLoadAddrSet.empty()) + return true; + + MemorySet UncondMemAccesses; + std::vector &LoopBlocks = TheLoop->getBlocksVector(); + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + + // Iterate over the unconditional blocks and collect memory access addresses. + for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // Ignore conditional blocks. + if (BB != LoopLatch && !DT->dominates(BB, LoopLatch)) + continue; + + addMemAccesses(BB, UncondMemAccesses); + } + + // And make sure there is a matching unconditional access for every + // conditional load. + for (MemorySet::iterator MI = CondLoadAddrSet.begin(), + ME = CondLoadAddrSet.end(); MI != ME; ++MI) + if (!UncondMemAccesses.count(*MI)) + return false; + + return true; +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -310,8 +421,11 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT) - : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + DominatorTree *DT, TargetTransformInfo* TTI, + AliasAnalysis *AA, TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), + Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), + LoadSpeculation(L, DT) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -321,8 +435,10 @@ public: RK_IntegerOr, ///< Bitwise or logical OR of numbers. 
RK_IntegerAnd, ///< Bitwise or logical AND of numbers. RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. - RK_FloatMult ///< Product of floats. + RK_FloatMult, ///< Product of floats. + RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. @@ -330,16 +446,29 @@ public: IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = 1. IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). + IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). + }; + + // This enum represents the kind of minmax reduction. + enum MinMaxReductionKind { + MRK_Invalid, + MRK_UIntMin, + MRK_UIntMax, + MRK_SIntMin, + MRK_SIntMax, + MRK_FloatMin, + MRK_FloatMax }; /// This POD struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), - Kind(RK_NoReduction) {} + Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) - : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, + MinMaxReductionKind MK) + : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} // The starting value of the reduction. // It does not have to be zero! @@ -348,6 +477,25 @@ public: Instruction *LoopExitInstr; // The kind of the reduction. ReductionKind Kind; + // If this a min/max reduction the kind of reduction. + MinMaxReductionKind MinMaxKind; + }; + + /// This POD struct holds information about a potential reduction operation. + struct ReductionInstDesc { + ReductionInstDesc(bool IsRedux, Instruction *I) : + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + + ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + + // Is this instruction a reduction candidate. + bool IsReduction; + // The last instruction in a min/max pattern (select of the select(icmp()) + // pattern), or the current reduction instruction otherwise. + Instruction *PatternLastInst; + // If this is a min/max pattern the comparison predicate. + MinMaxReductionKind MinMaxKind; }; // This POD struct holds information about the memory runtime legality @@ -364,7 +512,7 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); /// This flag indicates if we need to add the runtime check. bool Need; @@ -374,6 +522,8 @@ public: SmallVector Starts; /// Holds the pointer value at the end of the loop. SmallVector Ends; + /// Holds the information if this pointer is used for writing to memory. + SmallVector IsWritePtr; }; /// A POD for saving information about induction variables. @@ -394,6 +544,11 @@ public: /// induction descriptor. typedef MapVector InductionList; + /// Alias(Multi)Map stores the values (GEPs or underlying objects and their + /// respective Store/Load instruction(s) to calculate aliasing. + typedef MapVector AliasMap; + typedef DenseMap > AliasMultiMap; + /// Returns true if it is legal to vectorize this loop. 
/// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -408,6 +563,9 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the widest induction type. + Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); @@ -433,6 +591,10 @@ public: /// Returns the information that we collected about runtime memory check. RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + + /// This function returns the identity element (or neutral element) for + /// the operation K. + static Constant *getReductionIdentity(ReductionKind K, Type *Tp); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -459,14 +621,30 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns a struct describing if the instruction 'I' can be a reduction + /// variable of type 'Kind'. If the reduction is a min/max pattern of + /// select(icmp()) this function advances the instruction pointer 'I' from the + /// compare instruction to the select instruction and stores this pointer in + /// 'PatternLastInst' member of the returned struct. + ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, + ReductionInstDesc &Desc); + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction + /// pattern corresponding to a min(X, Y) or max(X, Y). + static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev); /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); /// Return true if can compute the address bounds of Ptr within the loop. bool hasComputableBounds(Value *Ptr); + /// Return true if there is the chance of write reorder. + bool hasPossibleGlobalWriteReorder(Value *Object, + Instruction *Inst, + AliasMultiMap &WriteObjects, + unsigned MaxByteWidth); + /// Return the AA location for a load or a store. + AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); + /// The loop that we evaluate. Loop *TheLoop; @@ -474,8 +652,14 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; - // Dominators. + /// Dominators. DominatorTree *DT; + /// Target Info. + TargetTransformInfo *TTI; + /// Alias Analysis. + AliasAnalysis *AA; + /// Target Library Info. + TargetLibraryInfo *TLI; // --- vectorization state --- // @@ -488,6 +672,8 @@ private: /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the widest induction type encountered. + Type *WidestIndTy; /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. @@ -498,6 +684,11 @@ private: /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. 
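
The NoNaN question above matters mostly for the new RK_FloatMinMax kind: an ordered fcmp/select minimum only matches the scalar semantics when NaNs can be excluded. In source form, a reduction recognized by isMinMaxSelectCmpPattern is a compare feeding a select rather than a binary operator; a hypothetical integer (MRK_SIntMin) example:

// n must be >= 1. The loop-carried update is select(icmp(a[i], m), ...),
// which is why min/max needs its own reduction descriptor rather than a
// single binary opcode.
int smin_reduction(const int *a, int n) {
  int m = a[0];
  for (int i = 1; i < n; ++i)
    m = (a[i] < m) ? a[i] : m;
  return m;
}
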
+ bool HasFunNoNaNAttr; + + /// Utility to determine whether loads can be speculated. + LoadHoisting LoadSpeculation; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -511,16 +702,23 @@ class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, - const TargetTransformInfo &TTI) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} - - /// \return The most profitable vectorization factor. + const TargetTransformInfo &TTI, + DataLayout *DL, const TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {} + + /// Information about vectorization costs + struct VectorizationFactor { + unsigned Width; // Vector width with best cost + unsigned Cost; // Cost of the loop with that width + }; + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + VectorizationFactor selectVectorizationFactor(bool OptForSize, + unsigned UserVF); - /// \returns The size (in bits) of the widest type in the code that + /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. unsigned getWidestType(); @@ -528,7 +726,10 @@ public: /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. - unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + /// VF and LoopCost are the selected vectorization factor and the cost of the + /// selected VF. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, + unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -560,6 +761,10 @@ private: /// the scalar type. static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted + /// as a vector operation. + bool isConsecutiveLoadOrStore(Instruction *I); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -570,6 +775,10 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; + /// Target data layout information. + DataLayout *DL; + /// Target Library Info. + const TargetLibraryInfo *TLI; }; /// The LoopVectorize Pass. @@ -586,6 +795,8 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; + AliasAnalysis *AA; + TargetLibraryInfo *TLI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -597,21 +808,28 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis(); TTI = &getAnalysis(); DT = &getAnalysis(); + AA = getAnalysisIfAvailable(); + TLI = getAnalysisIfAvailable(); + + if (DL == NULL) { + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + return false; + } DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. 
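
Part of the legality check below is the LoadSpeculation utility declared above: a load executed under a branch is only accepted if some unconditional access in the loop touches the same address. A hypothetical loop with that property:

// Both loads of a[i] use the same address. The first one executes on
// every iteration, so speculating the conditional one cannot introduce
// a fault the original program did not already have.
int sum_flagged(const int *a, const bool *flag, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];   // unconditional access to a[i]
    if (flag[i])
      sum += a[i]; // conditional load: hoistable
  }
  return sum;
}
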
- LoopVectorizationLegality LVL(L, SE, DL, DT); + LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI); - // Check the function attribues to find out if this function should be + // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; @@ -626,20 +844,24 @@ struct LoopVectorize : public LoopPass { return false; } - unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); + // Select the optimal vectorization factor. + LoopVectorizationCostModel::VectorizationFactor VF; + VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + // Select the unroll factor. + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, + VF.Width, VF.Cost); - if (VF == 1) { + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -669,7 +891,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, - Loop *Lp, Value *Ptr) { + Loop *Lp, Value *Ptr, + bool WritePtr) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast(Sc); assert(AR && "Invalid addrec expression"); @@ -678,6 +901,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); + IsWritePtr.push_back(WritePtr); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -703,7 +927,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && @@ -716,8 +940,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { - int Idx = Negate ? (-i): i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + int64_t Idx = Negate ? (-i) : i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. @@ -728,6 +952,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + // Make sure that the pointer does not point to structs. 
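
On getConsecutiveVector above (now taking a signed StartIdx): it broadcasts a scalar and then offsets each lane so that reverse sequences work as well. A scalar model for four lanes, written as a hypothetical helper:

#include <array>
#include <cstdint>

std::array<int64_t, 4> consecutiveVector(int64_t Start, bool Negate) {
  std::array<int64_t, 4> V;
  for (int i = 0; i < 4; ++i) {
    int64_t Idx = Negate ? -i : i; // mirrors the patched loop body
    V[i] = Start + Idx;            // Start+0, Start+1, ... or Start-0, ...
  }
  return V;
}
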
+ if (cast(Ptr->getType())->getElementType()->isAggregateType()) + return 0; // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null(Ptr); @@ -735,6 +962,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; } GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); @@ -744,6 +973,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = Gep->getOperand(NumOperands - 1); + Value *GpPtr = Gep->getPointerOperand(); + // If this GEP value is a consecutive pointer induction variable and all of + // the indices are constant then we know it is consecutive. We can + Phi = dyn_cast(GpPtr); + if (Phi && Inductions.count(Phi)) { + + // Make sure that the pointer does not point to structs. + PointerType *GepPtrType = cast(GpPtr->getType()); + if (GepPtrType->getElementType()->isAggregateType()) + return 0; + + // Make sure that all of the index operands are loop invariant. + for (unsigned i = 1; i < NumOperands; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + + InductionInfo II = Inductions[Phi]; + if (IK_PtrInduction == II.IK) + return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; + } + // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) @@ -782,8 +1034,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap.splat(V, B); - return WidenMap.get(V); + return WidenMap.splat(V, B); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -797,6 +1048,117 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } + +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal) { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + + assert((LI || SI) && "Invalid Load/Store instruction"); + + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *DataTy = VectorType::get(ScalarDataTy, VF); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + + if (ScalarAllocatedSize != VectorElementSize) + return scalarizeInstruction(Instr); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + bool UniformLoad = LI && Legal->isUniform(Ptr); + if (!ConsecutiveStride || UniformLoad) + return scalarizeInstruction(Instr); + + Constant *Zero = Builder.getInt32(0); + VectorParts &Entry = WidenMap.get(Instr); + + // Handle consecutive loads/stores. 
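
In the consecutive load/store handling that follows, the reverse-access addressing is easier to see in scalar form: for unroll part Part, a forward wide access starts Part * VF elements past the base, while a reverse one must start at the last element of that part's window. A hypothetical helper with int elements:

const int *partBase(const int *Ptr, int VF, int Part, bool Reverse) {
  if (!Reverse)
    return Ptr + Part * VF;        // CreateGEP(Ptr, Part * VF)
  // CreateGEP(Ptr, -Part * VF) followed by CreateGEP(..., 1 - VF):
  return Ptr - Part * VF + (1 - VF);
}

The vector loaded or stored through such a pointer is additionally element-reversed with reverseVector.
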
+ GetElementPtrInst *Gep = dyn_cast(Ptr); + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), + OrigLoop) && "Base ptr must be invariant"); + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Gep2->setName("gep.indvar.idx"); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + // Handle Stores: + if (SI) { + assert(!Legal->isUniform(SI->getPointerOperand()) && + "We do not allow storing to uniform addresses"); + + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } + } + + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } +} + void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. @@ -840,10 +1202,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Create a new entry in the WidenMap and initialize it to Undef or Null. 
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Width = 0; Width < VF; ++Width) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -906,29 +1268,27 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } } + IRBuilder<> ChkBuilder(Loc); + for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { - Instruction::CastOps Op = Instruction::BitCast; - Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); - Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); - Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); - Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); - - Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start0, End1, "bound0", Loc); - Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start1, End0, "bound1", Loc); - Instruction *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, - Cmp1, "found.conflict", - Loc); + // No need to check if two readonly pointers intersect. + if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) + continue; + + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); if (MemoryRuntimeCheck) - MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, - MemoryRuntimeCheck, - IsConflict, - "conflict.rdx", Loc); - else - MemoryRuntimeCheck = IsConflict; + IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, + "conflict.rdx"); + MemoryRuntimeCheck = cast(IsConflict); } } @@ -971,13 +1331,17 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); + // Mark the old scalar loop with metadata that tells us not to vectorize this + // loop again if we run into it. + MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None); + OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. OldInduction = Legal->getInduction(); - Type *IdxTy = OldInduction ? OldInduction->getType() : - DL->getIntPtrType(SE->getContext()); + Type *IdxTy = Legal->getWidestInductionType(); // Find the loop boundaries. const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); @@ -998,9 +1362,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. 
If we don't have an induction variable // then we know that it starts at zero. - Value *StartIdx = OldInduction ? - OldInduction->getIncomingValueForBlock(BypassBlock): - ConstantInt::get(IdxTy, 0); + Builder.SetInsertPoint(BypassBlock->getTerminator()); + Value *StartIdx = ExtendedIdx = OldInduction ? + Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), + IdxTy): + ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); LoopBypassBlocks.push_back(BypassBlock); @@ -1015,10 +1381,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); - // This is the location in which we add all of the logic for bypassing - // the new vector loop. - Instruction *Loc = BypassBlock->getTerminator(); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -1029,43 +1391,46 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // This is the IR builder that we use to add all of the logic for bypassing + // the new vector loop. + IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. - unsigned IdxTyBW = IdxTy->getScalarSizeInBits(); if (Count->getType() != IdxTy) { // The exit count can be of pointer type. Convert it to the correct // integer type. if (ExitCount->getType()->isPointerTy()) - Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); - else if (IdxTyBW < Count->getType()->getScalarSizeInBits()) - Count = CastInst::CreateTruncOrBitCast(Count, IdxTy, "tr.cnt", Loc); + Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); else - Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); } // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. - Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); - Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); - Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down", Loc); + Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); + Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); + Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down"); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - IdxEndRoundDown, - StartIdx, - "cmp.zero", Loc); + Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, + "cmp.zero"); + + BasicBlock *LastBypassBlock = BypassBlock; // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. 
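
The bypass-block arithmetic above (end.idx, n.mod.vf, n.vec, end.idx.rnd.down) in plain scalar form, as a hypothetical helper:

#include <cstdint>

struct LoopBounds {
  uint64_t IdxEnd;          // end.idx:          Count + StartIdx
  uint64_t CountRoundDown;  // n.vec:            Count - Count % Step
  uint64_t IdxEndRoundDown; // end.idx.rnd.down: n.vec + StartIdx
};

LoopBounds computeBounds(uint64_t Count, uint64_t StartIdx,
                         unsigned VF, unsigned UF) {
  uint64_t Step = uint64_t(VF) * UF; // elements per wide iteration
  uint64_t R = Count % Step;         // n.mod.vf
  return LoopBounds{Count + StartIdx, Count - R, (Count - R) + StartIdx};
}

As for the overlap checks that follow: every pair of pointer ranges recorded by RuntimePointerCheck (skipping pairs where both pointers are read-only) is tested with the bound0/bound1 comparisons, which amount to this sketch, assuming plain integer bounds:

bool mayConflict(uint64_t Start0, uint64_t End0,
                 uint64_t Start1, uint64_t End1) {
  bool Bound0 = Start0 <= End1; // range 0 starts before range 1 ends
  bool Bound1 = Start1 <= End0; // range 1 starts before range 0 ends
  return Bound0 && Bound1;      // "found.conflict"
}

The number of such pairs grows quadratically with the number of pointers, which is what RuntimeMemoryCheckThreshold bounds.
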
- if (Instruction *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc)) { + Instruction *MemRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + if (MemRuntimeCheck) { // Create a new block containing the memory check. - BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemoryRuntimeCheck, + BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); LoopBypassBlocks.push_back(CheckBlock); @@ -1075,13 +1440,13 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); OldTerm->eraseFromParent(); - Cmp = MemoryRuntimeCheck; - assert(Loc == CheckBlock->getTerminator()); + Cmp = MemRuntimeCheck; + LastBypassBlock = CheckBlock; } - BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); - // Remove the old terminator. - Loc->eraseFromParent(); + LastBypassBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, + LastBypassBlock); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -1095,64 +1460,101 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { PHINode *ResumeIndex = 0; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + // Set builder to point to last bypass block. + BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; - PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + + Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); + PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", MiddleBlock->getTerminator()); + // We might have extended the type of the induction variable but we need a + // truncated version for the scalar loop. + PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? + PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", + MiddleBlock->getTerminator()) : 0; + Value *EndValue = 0; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter: + // Handle the integer induction counter. assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - assert(OrigPhi == OldInduction && "Unknown integer PHI"); - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; + + // We have the canonical induction variable. + if (OrigPhi == OldInduction) { + // Create a truncated version of the resume value for the scalar loop, + // we might have promoted the type to a larger width. + EndValue = + BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + TruncResumeVal->addIncoming(EndValue, VecBody); + + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + + // Not the canonical induction variable - add the vector loop count to the + // start value. 
+ Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); break; } case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. - unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); - unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); - Value *CRD = CountRoundDown; - if (CRDSize > IISize) - CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), "tr.crd", - LoopBypassBlocks.back()->getTerminator()); - else if (CRDSize < IISize) - CRD = CastInst::Create(Instruction::SExt, CountRoundDown, - II.StartValue->getType(), - "sext.crd", - LoopBypassBlocks.back()->getTerminator()); - // Handle reverse integer induction counter: - EndValue = - BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + // Handle reverse integer induction counter. + EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = - GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, + "ptr.ind.end"); + break; + } + case LoopVectorizationLegality::IK_ReversePtrInduction: { + // The value at the end of the loop for the reverse pointer is calculated + // by creating a GEP with a negative index starting from the start value. + Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); + Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, + "rev.ind.end"); + EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, + "rev.ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + if (OrigPhi == OldInduction) + ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); + else + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + } ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + // The old inductions phi node in the scalar body needs the truncated value. + if (OrigPhi == OldInduction) + OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); + else + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); } // If we are generating a new induction variable then we also need to @@ -1226,24 +1628,24 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. 
-static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+Constant*
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
   switch (K) {
-  case LoopVectorizationLegality:: RK_IntegerXor:
-  case LoopVectorizationLegality:: RK_IntegerAdd:
-  case LoopVectorizationLegality:: RK_IntegerOr:
+  case RK_IntegerXor:
+  case RK_IntegerAdd:
+  case RK_IntegerOr:
     // Adding, Xoring, Oring zero to a number does not change it.
     return ConstantInt::get(Tp, 0);
-  case LoopVectorizationLegality:: RK_IntegerMult:
+  case RK_IntegerMult:
     // Multiplying a number by 1 does not change it.
     return ConstantInt::get(Tp, 1);
-  case LoopVectorizationLegality:: RK_IntegerAnd:
+  case RK_IntegerAnd:
     // AND-ing a number with an all-1 value does not change it.
     return ConstantInt::get(Tp, -1, true);
-  case LoopVectorizationLegality:: RK_FloatMult:
+  case RK_FloatMult:
     // Multiplying a number by 1 does not change it.
     return ConstantFP::get(Tp, 1.0L);
-  case LoopVectorizationLegality:: RK_FloatAdd:
+  case RK_FloatAdd:
     // Adding zero to a number does not change it.
     return ConstantFP::get(Tp, 0.0L);
   default:
@@ -1251,38 +1653,112 @@
   }
 }
 
-static bool
-isTriviallyVectorizableIntrinsic(Instruction *Inst) {
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
-  if (!II)
-    return false;
-  switch (II->getIntrinsicID()) {
-  case Intrinsic::sqrt:
-  case Intrinsic::sin:
-  case Intrinsic::cos:
-  case Intrinsic::exp:
-  case Intrinsic::exp2:
-  case Intrinsic::log:
-  case Intrinsic::log10:
-  case Intrinsic::log2:
-  case Intrinsic::fabs:
-  case Intrinsic::floor:
-  case Intrinsic::ceil:
-  case Intrinsic::trunc:
-  case Intrinsic::rint:
-  case Intrinsic::nearbyint:
-  case Intrinsic::pow:
-  case Intrinsic::fma:
-  case Intrinsic::fmuladd:
-    return true;
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+  // If we have an intrinsic call, check if it is trivially vectorizable.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::sqrt:
+    case Intrinsic::sin:
+    case Intrinsic::cos:
+    case Intrinsic::exp:
+    case Intrinsic::exp2:
+    case Intrinsic::log:
+    case Intrinsic::log10:
+    case Intrinsic::log2:
+    case Intrinsic::fabs:
+    case Intrinsic::floor:
+    case Intrinsic::ceil:
+    case Intrinsic::trunc:
+    case Intrinsic::rint:
+    case Intrinsic::nearbyint:
+    case Intrinsic::pow:
+    case Intrinsic::fma:
+    case Intrinsic::fmuladd:
+      return II->getIntrinsicID();
+    default:
+      return Intrinsic::not_intrinsic;
+    }
+  }
+
+  if (!TLI)
+    return Intrinsic::not_intrinsic;
+
+  LibFunc::Func Func;
+  Function *F = CI->getCalledFunction();
+  // We're going to make assumptions on the semantics of the functions, check
+  // that the target knows that it's available in this environment.
+  if (!F || !TLI->getLibFunc(F->getName(), Func))
+    return Intrinsic::not_intrinsic;
+
+  // Otherwise check if we have a call to a function that can be turned into a
+  // vector intrinsic.
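
Before the library-call switch below, a quick sanity check on the identities returned by getReductionIdentity above; each one leaves any value unchanged under its operation (hypothetical test, integer kinds only):

#include <cassert>

int main() {
  int x = 42;
  assert((x + 0) == x);  // RK_IntegerAdd:  identity 0
  assert((x ^ 0) == x);  // RK_IntegerXor:  identity 0
  assert((x | 0) == x);  // RK_IntegerOr:   identity 0
  assert((x * 1) == x);  // RK_IntegerMult: identity 1
  assert((x & -1) == x); // RK_IntegerAnd:  identity all-ones
  return 0;
}
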
+ switch (Func) { default: - return false; + break; + case LibFunc::sin: + case LibFunc::sinf: + case LibFunc::sinl: + return Intrinsic::sin; + case LibFunc::cos: + case LibFunc::cosf: + case LibFunc::cosl: + return Intrinsic::cos; + case LibFunc::exp: + case LibFunc::expf: + case LibFunc::expl: + return Intrinsic::exp; + case LibFunc::exp2: + case LibFunc::exp2f: + case LibFunc::exp2l: + return Intrinsic::exp2; + case LibFunc::log: + case LibFunc::logf: + case LibFunc::logl: + return Intrinsic::log; + case LibFunc::log10: + case LibFunc::log10f: + case LibFunc::log10l: + return Intrinsic::log10; + case LibFunc::log2: + case LibFunc::log2f: + case LibFunc::log2l: + return Intrinsic::log2; + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + return Intrinsic::fabs; + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + return Intrinsic::floor; + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + return Intrinsic::ceil; + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + return Intrinsic::trunc; + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + return Intrinsic::rint; + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + return Intrinsic::nearbyint; + case LibFunc::pow: + case LibFunc::powf: + case LibFunc::powl: + return Intrinsic::pow; } - return false; + + return Intrinsic::not_intrinsic; } /// This function translates the reduction kind to an LLVM binary operator. -static Instruction::BinaryOps +static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: @@ -1299,11 +1775,53 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FMul; case LoopVectorizationLegality::RK_FloatAdd: return Instruction::FAdd; + case LoopVectorizationLegality::RK_IntegerMinMax: + return Instruction::ICmp; + case LoopVectorizationLegality::RK_FloatMinMax: + return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } } +Value *createMinMaxOp(IRBuilder<> &Builder, + LoopVectorizationLegality::MinMaxReductionKind RK, + Value *Left, + Value *Right) { + CmpInst::Predicate P = CmpInst::ICMP_NE; + switch (RK) { + default: + llvm_unreachable("Unknown min/max reduction kind"); + case LoopVectorizationLegality::MRK_UIntMin: + P = CmpInst::ICMP_ULT; + break; + case LoopVectorizationLegality::MRK_UIntMax: + P = CmpInst::ICMP_UGT; + break; + case LoopVectorizationLegality::MRK_SIntMin: + P = CmpInst::ICMP_SLT; + break; + case LoopVectorizationLegality::MRK_SIntMax: + P = CmpInst::ICMP_SGT; + break; + case LoopVectorizationLegality::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case LoopVectorizationLegality::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; + } + + Value *Cmp; + if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); + return Select; +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1313,9 +1831,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the cost-model. 
 //
 //===------------------------------------------------===//
-  BasicBlock &BB = *OrigLoop->getHeader();
-  Constant *Zero =
-  ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
+  Constant *Zero = Builder.getInt32(0);
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@@ -1361,7 +1877,7 @@
     // To do so, we need to generate the 'identity' vector and override
     // one of the elements with the incoming scalar reduction. We need
     // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
+    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
 
     // This is the vector-clone of the value that leaves the loop.
     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -1369,13 +1885,24 @@
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
-    Constant *Identity = ConstantVector::getSplat(VF, Iden);
-
-    // This vector is the Identity vector where the first element is the
-    // incoming scalar reduction.
-    Value *VectorStart = Builder.CreateInsertElement(Identity,
-                                                     RdxDesc.StartValue, Zero);
+    Value *Identity;
+    Value *VectorStart;
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
+        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
+      // MinMax reductions have the start value as their identity.
+      VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
+                                                         "minmax.ident");
+    } else {
+      Constant *Iden =
+        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                        VecTy->getScalarType());
+      Identity = ConstantVector::getSplat(VF, Iden);
+
+      // This vector is the Identity vector where the first element is the
+      // incoming scalar reduction.
+      VectorStart = Builder.CreateInsertElement(Identity,
+                                                RdxDesc.StartValue, Zero);
+    }
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
@@ -1417,10 +1944,15 @@
     // Reduce all of the unrolled parts into a single vector.
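
The code that follows first folds the UF unrolled partial results into one vector ("bin.rdx") and then collapses that vector to a scalar with log2(VF) shuffles, each folding the upper half into the lower half. A scalar model of the shuffle ladder for an add reduction (hypothetical; assumes a power-of-two lane count):

#include <vector>

int reduceAdd(std::vector<int> V) { // V.size() must be a power of two
  for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
    for (size_t i = 0; i < Half; ++i)
      V[i] += V[i + Half]; // one "bin.rdx" per halving step
  return V[0];             // the result is in the first element
}
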
Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, - "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, + RdxParts[part], ReducedPartRdx, + "bin.rdx"); + else + ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, + ReducedPartRdx, RdxParts[part]); } // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles @@ -1445,8 +1977,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } // The result is in the first element of the vector. @@ -1552,8 +2087,6 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV) { - Constant *Zero = Builder.getInt32(0); - // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); @@ -1581,18 +2114,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // We know that all PHIs in non header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. - // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), - P->getParent()); - - for (unsigned part = 0; part < UF; ++part) { - VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); - VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], - "predphi"); + + unsigned NumIncoming = P->getNumIncomingValues(); + assert(NumIncoming > 1 && "Invalid PHI"); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We don't need to 'select' the first PHI operand because it is + // the default value if all of the other masks don't match. + if (In == 0) + Entry[part] = In0[part]; + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. 
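
The select chain being built here has simple scalar semantics: the first incoming value of the PHI is the default, and each later incoming value overrides it when its edge mask is true. A hypothetical model:

// Mask[i] is the edge mask of incoming value i; Mask[0] is unused
// because the first incoming value is the default.
int blendPhi(const bool Mask[], const int In[], int N) {
  int Result = In[0];
  for (int i = 1; i < N; ++i)
    Result = Mask[i] ? In[i] : Result; // one "predphi" select per edge
  return Result;
}
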
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); + } } continue; } @@ -1609,27 +2157,34 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - assert(P == OldInduction && "Unexpected PHI"); - Value *Broadcasted = getBroadcastInstrs(Induction); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding 0, 1, 2 ... + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable. We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); continue; } case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. - Value *StartIdx = 0; - // If we have a single integer induction variable then use it. - // Otherwise, start counting at zero. - if (OldInduction) { - LoopVectorizationLegality::InductionInfo OldII = - Legal->getInductionVars()->lookup(OldInduction); - StartIdx = OldII.StartValue; - } else { - StartIdx = ConstantInt::get(Induction->getType(), 0); - } + Value *StartIdx = ExtendedIdx; // This is the normalized GEP that starts counting at zero. Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, "normalized.idx"); @@ -1647,22 +2202,31 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); continue; } // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), - i + part * VF); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, - "gep.idx"); + int EltIndex = (i + part * VF) * (Reverse ? 
-1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -1759,124 +2323,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } - case Instruction::Store: { - // Attempt to issue a wide store. - StoreInst *SI = dyn_cast(it); - Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); - Value *Ptr = SI->getPointerOperand(); - unsigned Alignment = SI->getAlignment(); - - assert(!Legal->isUniform(Ptr) && - "We do not allow storing to uniform addresses"); - - - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Stride == 0) { - scalarizeInstruction(it); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(it, Legal); break; - } - - // Handle consecutive stores. - - GetElementPtrInst *Gep = dyn_cast(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If we store to reverse consecutive memory locations then we need - // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); - } - break; - } - case Instruction::Load: { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast(it); - Type *RetTy = VectorType::get(LI->getType(), VF); - Value *Ptr = LI->getPointerOperand(); - unsigned Alignment = LI->getAlignment(); - - // If the pointer is loop invariant or if it is non consecutive, - // scalarize the load. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Legal->isUniform(Ptr) || Stride == 0) { - scalarizeInstruction(it); - break; - } - - GetElementPtrInst *Gep = dyn_cast(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. 
For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; - } - break; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -1913,17 +2363,21 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } case Instruction::Call: { - assert(isTriviallyVectorizableIntrinsic(it)); + // Ignore dbg intrinsics. + if (isa(it)) + break; + Module *M = BB->getParent()->getParent(); - IntrinsicInst *II = cast(it); - Intrinsic::ID ID = II->getIntrinsicID(); + CallInst *CI = cast(it); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + assert(ID && "Not an intrinsic call!"); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; - for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) { - VectorParts &Arg = getVectorValue(II->getArgOperand(i)); + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } - Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) }; Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } @@ -1973,12 +2427,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa(BB->getTerminator())) return false; - // We must have at most two predecessors because we need to convert - // all PHIs to selects. - unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); - if (Preds > 2) - return false; - // We must be able to predicate all blocks that need to be predicated. 
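  // Editorial example (not part of this patch): dropping the two-predecessor
  // limit above lets joins with three or more incoming edges be if-converted,
  // e.g. the merge point of
  //   if (c1)      x = a;
  //   else if (c2) x = b;
  //   else         x = c;
  // whose three-input PHI is now flattened into a select chain by the PHI
  // widening code earlier in this patch.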
if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; @@ -2056,10 +2504,38 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty->getContext()); + return Ty; +} + +static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); + // If we marked the scalar loop as "already vectorized" then no need + // to vectorize it again. + if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { + DEBUG(dbgs() << "LV: This loop was vectorized before\n"); + return false; + } + + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = F.getAttributes().getAttribute( + AttributeSet::FunctionIndex, + "no-nans-fp-math").getValueAsString() == "true"; + // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2069,16 +2545,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ++it) { if (PHINode *Phi = dyn_cast(it)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - + Type *PhiTy = Phi->getType(); // Check that this PHI type is allowed. - if (!Phi->getType()->isIntegerTy() && - !Phi->getType()->isFloatingPointTy() && - !Phi->getType()->isPointerTy()) { + if (!PhiTy->isIntegerTy() && + !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -2089,19 +2560,31 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) continue; + // We only allow if-converted PHIs with more than two incoming values. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } + // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); // Check if this is an induction variable. InductionKind IK = isInductionVariable(Phi); if (IK_NoInduction != IK) { + // Get the widest type. + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); + else + WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); + // Int inductions are special because we only allow one IV. if (IK == IK_IntInduction) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); - return false; - } - Induction = Phi; + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). 
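        // Illustrative example (editorial, not part of this patch): with an
        // i32 IV and an i64 IV in the same loop, WidestIndTy is i64 and the
        // i64 PHI below becomes the canonical induction (the last PHI whose
        // type matches WidestIndTy wins). A pointer IV is first mapped to the
        // target's intptr type by convertPointerToIntegerType, so on a 64-bit
        // target getWiderType(i32, i8*) yields i64; on a tie, getWiderType
        // returns its second argument.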
+        if (!Induction || PhiTy == WidestIndTy)
+          Induction = Phi;
       }
 
       DEBUG(dbgs() << "LV: Found an induction variable.\n");
@@ -2129,6 +2612,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
         continue;
       }
+      if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+        DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
       if (AddReductionVar(Phi, RK_FloatMult)) {
         DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
         continue;
@@ -2137,14 +2624,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
         continue;
       }
+      if (AddReductionVar(Phi, RK_FloatMinMax)) {
+        DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
 
       DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
       return false;
     }// end of PHI handling
 
-    // We still don't handle functions.
+    // We still don't handle functions. However, we can ignore dbg intrinsic
+    // calls and we do handle certain intrinsic and libm functions.
     CallInst *CI = dyn_cast<CallInst>(it);
-    if (CI && !isTriviallyVectorizableIntrinsic(it)) {
+    if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
       DEBUG(dbgs() << "LV: Found a call site.\n");
       return false;
     }
@@ -2182,7 +2674,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
-    assert(getInductionVars()->size() && "No induction variables");
+    if (Inductions.empty())
+      return false;
   }
 
   return true;
@@ -2217,7 +2710,44 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   }
 }
 
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+    return AA->getLocation(Store);
+  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+    return AA->getLocation(Load);
+
+  llvm_unreachable("Should be either load or store instruction");
+}
+
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+                                            Value *Object,
+                                            Instruction *Inst,
+                                            AliasMultiMap& WriteObjects,
+                                            unsigned MaxByteWidth) {
+
+  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+  std::vector<Instruction*>::iterator
+              it = WriteObjects[Object].begin(),
+              end = WriteObjects[Object].end();
+
+  for (; it != end; ++it) {
+    Instruction* I = *it;
+    if (I == Inst)
+      continue;
+
+    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+                  ThatLoc.getWithNewSize(MaxByteWidth)))
+      return true;
+  }
+  return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeMemory() {
+
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
   // Holds the Load and Store *instructions*.
@@ -2226,6 +2756,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
+  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
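  // Illustrative source loop for the new RK_IntegerMinMax path above (a
  // sketch, not part of this patch). The icmp/select pair in the loop body is
  // what AddReductionVar recognizes as a signed-max (MRK_SIntMax) reduction:
  //
  //   int smax(const int *A, int N) {
  //     int Max = A[0];                    // reduction PHI start value
  //     for (int i = 1; i < N; ++i)
  //       Max = A[i] > Max ? A[i] : Max;   // icmp sgt + select
  //     return Max;                        // single out-of-loop user
  //   }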
  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
 
@@ -2240,7 +2772,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayReadFromMemory()) {
         LoadInst *Ld = dyn_cast<LoadInst>(it);
         if (!Ld) return false;
-        if (!Ld->isSimple()) {
+        if (!Ld->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
@@ -2252,7 +2784,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayWriteToMemory()) {
         StoreInst *St = dyn_cast<StoreInst>(it);
         if (!St) return false;
-        if (!St->isSimple()) {
+        if (!St->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
           return false;
         }
@@ -2271,9 +2803,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  // Holds the read and read-write *pointers* that we find.
-  ValueVector Reads;
-  ValueVector ReadWrites;
+  // Holds the read and read-write *pointers* that we find. These maps hold
+  // unique values for pointers (so no need for multi-map).
+  AliasMap Reads;
+  AliasMap ReadWrites;
 
   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once
@@ -2295,7 +2828,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // If we did *not* see this pointer before, insert it to
     // the read-write list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr))
-      ReadWrites.push_back(Ptr);
+      ReadWrites.insert(std::make_pair(Ptr, ST));
+  }
+
+  if (IsAnnotatedParallel) {
+    DEBUG(dbgs()
+          << "LV: Loop is annotated parallel; ignoring memory dependency "
+          << "checks.\n");
+    return true;
   }
 
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
@@ -2310,7 +2850,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
     if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
-      Reads.push_back(Ptr);
+      Reads.insert(std::make_pair(Ptr, LD));
   }
 
   // If we write (or read-write) to a single destination and there are no
@@ -2320,29 +2860,41 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
  }
 
+  unsigned NumReadPtrs = 0;
+  unsigned NumWritePtrs = 0;
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
-  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
-    if (hasComputableBounds(*I)) {
-      PtrRtCheck.insert(SE, TheLoop, *I);
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+  AliasMap::iterator MI, ME;
+  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V, true);
+      NumWritePtrs++;
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
    } else {
      CanDoRT = false;
      break;
    }
-  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
-    if (hasComputableBounds(*I)) {
-      PtrRtCheck.insert(SE, TheLoop, *I);
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+  }
+  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V, false);
+      NumReadPtrs++;
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
    } else {
      CanDoRT = false;
      break;
    }
+  }
 
   // Check that we did not collect too many pointers or found an
   // unsizeable pointer.
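  // Worked example for the runtime-check budget computed below (editorial):
  // every write pointer must be checked against every other write pointer and
  // every read pointer, giving
  //   NumComparisons = NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)
  // so 2 writes and 3 reads need 2 * (3 + 2 - 1) = 8 comparisons, which is
  // exactly RuntimeMemoryCheckThreshold and therefore still accepted.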
- if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); + DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; } @@ -2353,47 +2905,104 @@ bool LoopVectorizationLegality::canVectorizeMemory() { bool NeedRTCheck = false; + // Biggest vectorized access possible, vector width * unroll factor. + // TODO: We're being very pessimistic here, find a way to know the + // real access width before getting here. + unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * + TTI->getMaximumUnrollFactor(); // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. - ValueSet WriteObjects; + // Note that WriteObjects duplicates the stores (indexed now by underlying + // objects) to avoid pointing to elements inside ReadWrites. + // TODO: Maybe create a new type where they can interact without duplication. + AliasMultiMap WriteObjects; ValueVector TempObjects; // Check that the read-writes do not conflict with other read-write // pointers. bool AllWritesIdentified = true; - for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { - if (!isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); + for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + Instruction *Inst = (*MI).second; + + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { + if (!isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); NeedRTCheck = true; AllWritesIdentified = false; } - if (!WriteObjects.insert(*it)) { + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) { + DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); + WriteObjects[*UI].push_back(Inst); + continue; + } + // Direct alias found. + if (!AA || dyn_cast(*UI) == NULL) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **it <<"\n"); + << **UI <<"\n"); return false; } + DEBUG(dbgs() << "LV: Found a conflicting global value:" + << **UI <<"\n"); + DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. + if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI + << "\n"); + return false; + } + + // Didn't alias, insert into map for further reference. + WriteObjects[*UI].push_back(Inst); } TempObjects.clear(); } /// Check that the reads don't conflict with the read-writes. 
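  // Worked example for the MaxByteWidth bound above (editorial): with 256-bit
  // vector registers and a maximum unroll factor of 2,
  //   MaxByteWidth = (256 / 8) * 2 = 64
  // so each pair of underlying objects is alias-queried as if every access
  // touched 64 bytes.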
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { + for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { // If all of the writes are identified then we don't care if the read // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); + if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); NeedRTCheck = true; } - if (WriteObjects.count(*it)) { - DEBUG(dbgs() << "LV: Found a possible read/write reorder:" - << **it <<"\n"); + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) + continue; + // Direct alias found. + if (!AA || dyn_cast(*UI) == NULL) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **UI <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found a global value: " + << **UI <<"\n"); + Instruction *Inst = (*MI).second; + DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. + if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI + << "\n"); return false; } } @@ -2413,6 +3022,26 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } +static bool hasMultipleUsesOf(Instruction *I, + SmallPtrSet &Insts) { + unsigned NumUses = 0; + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { + if (Insts.count(dyn_cast(*Use))) + ++NumUses; + if (NumUses > 1) + return true; + } + + return false; +} + +static bool areAllUsesIn(Instruction *I, SmallPtrSet &Set) { + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast(*Use))) + return false; + return true; +} + bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) @@ -2431,122 +3060,235 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = 0; - // Indicates that we found a binary operation in our scan. - bool FoundBinOp = false; - - // Iter is our iterator. We start with the PHI node and scan for all of the - // users of this instruction. All users must be instructions that can be - // used as reduction variables (such as ADD). We may have a single - // out-of-block user. The cycle must end with the original PHI. - Instruction *Iter = Phi; - while (true) { - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) + // Indicates that we found a reduction operation in our scan. + bool FoundReduxOp = false; + + // We start with the PHI node and scan for all of the users of this + // instruction. All users must be instructions that can be used as reduction + // variables (such as ADD). We must have a single out-of-block user. The cycle + // must include the original PHI. 
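  // Illustrative source loop for the worklist scan below (a sketch, not part
  // of this patch). The cycle of this sum reduction passes through an extra
  // loop-body PHI that merges S and S + A[i]; the rules below accept it
  // because every input of that PHI is itself a reduction value:
  //
  //   int sumPositive(const int *A, int N) {
  //     int S = 0;
  //     for (int i = 0; i < N; ++i)
  //       if (A[i] > 0)
  //         S += A[i];
  //     return S;
  //   }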
+  bool FoundStartPHI = false;
+
+  // To recognize min/max patterns formed by an icmp/select sequence, we store
+  // the number of instructions we saw from the recognized min/max pattern,
+  // to make sure we only see exactly the two instructions.
+  unsigned NumCmpSelectPatternInst = 0;
+  ReductionInstDesc ReduxDesc(false, 0);
+
+  SmallPtrSet<Instruction *, 8> VisitedInsts;
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(Phi);
+  VisitedInsts.insert(Phi);
+
+  // A value in the reduction can be used:
+  //  - By the reduction:
+  //      - Reduction operation:
+  //        - One use of the reduction value (safe).
+  //        - Multiple uses of the reduction value (not safe).
+  //      - PHI:
+  //        - All uses of the PHI must be the reduction (safe).
+  //        - Otherwise, not safe.
+  //  - By one instruction outside of the loop (safe).
+  //  - By further instructions outside of the loop (not safe).
+  //  - By an instruction that is not part of the reduction (not safe).
+  //    This is either:
+  //      * An instruction type other than PHI or the reduction operation.
+  //      * A PHI in the header other than the initial PHI.
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.back();
+    Worklist.pop_back();
+
+    // No users.
+    // If the instruction has no users then this is a broken chain and can't
+    // be a reduction variable.
+    if (Cur->use_empty())
       return false;
 
-    // Did we find a user inside this loop already ?
-    bool FoundInBlockUser = false;
-    // Did we reach the initial PHI node already ?
-    bool FoundStartPHI = false;
+    bool IsAPhi = isa<PHINode>(Cur);
 
-    // Is this a bin op ?
-    FoundBinOp |= !isa<PHINode>(Iter);
+    // A header PHI use other than the original PHI.
+    if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
+      return false;
 
-    // For each of the *users* of iter.
-    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
-         it != e; ++it) {
-      Instruction *U = cast<Instruction>(*it);
-      // We already know that the PHI is a user.
-      if (U == Phi) {
-        FoundStartPHI = true;
-        continue;
-      }
+    // Reductions of instructions such as Div and Sub are only possible if the
+    // LHS is the reduction variable.
+    if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
+        !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
+        !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
+      return false;
+
+    // Any reduction instruction must be of one of the allowed kinds.
+    ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
+    if (!ReduxDesc.IsReduction)
+      return false;
+
+    // A reduction operation must only have one use of the reduction value.
+    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
+        hasMultipleUsesOf(Cur, VisitedInsts))
+      return false;
+
+    // All inputs to a PHI node must be a reduction value.
+    if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
+      return false;
+
+    if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
+                                     isa<SelectInst>(Cur)))
+      ++NumCmpSelectPatternInst;
+    if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
+                                   isa<SelectInst>(Cur)))
+      ++NumCmpSelectPatternInst;
+
+    // Check whether we found a reduction operator.
+    FoundReduxOp |= !IsAPhi;
+
+    // Process users of the current instruction. Push non-PHI nodes after PHI
+    // nodes onto the stack. This way we are going to have seen all inputs to
+    // PHI nodes once we get to them.
+    SmallVector<Instruction *, 8> NonPHIs;
+    SmallVector<Instruction *, 8> PHIs;
+    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
+         ++UI) {
+      Instruction *Usr = cast<Instruction>(*UI);
 
       // Check if we found the exit user.
-      BasicBlock *Parent = U->getParent();
+      BasicBlock *Parent = Usr->getParent();
       if (!TheLoop->contains(Parent)) {
         // Exit if you find multiple outside users.
if (ExitInstruction != 0) return false; - ExitInstruction = Iter; + ExitInstruction = Cur; + continue; } - // We allow in-loop PHINodes which are not the original reduction PHI - // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE - // structure) then don't skip this PHI. - if (isa(Iter) && isa(U) && - U->getParent() != TheLoop->getHeader() && - TheLoop->contains(U) && - Iter->getNumUses() > 1) - continue; + // Process instructions only once (termination). + if (VisitedInsts.insert(Usr)) { + if (isa(Usr)) + PHIs.push_back(Usr); + else + NonPHIs.push_back(Usr); + } + // Remember that we completed the cycle. + if (Usr == Phi) + FoundStartPHI = true; + } + Worklist.append(PHIs.begin(), PHIs.end()); + Worklist.append(NonPHIs.begin(), NonPHIs.end()); + } - // We can't have multiple inside users. - if (FoundInBlockUser) - return false; - FoundInBlockUser = true; + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) + return false; - // Any reduction instr must be of one of the allowed kinds. - if (!isReductionInstr(U, Kind)) - return false; + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + return false; - // Reductions of instructions such as Div, and Sub is only - // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa(U) && U->getOperand(0) != Iter) - return false; + // We found a reduction var if we have reached the original phi node and we + // only have a single instruction with out-of-loop users. - Iter = U; - } + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); - // We found a reduction var if we have reached the original - // phi node and we only have a single instruction with out-of-loop - // users. - if (FoundStartPHI) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - return FoundBinOp && ExitInstruction; - } + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); + Reductions[Phi] = RD; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op. + + return true; +} + +/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction +/// pattern corresponding to a min(X, Y) or max(X, Y). +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { + + assert((isa(I) || isa(I) || isa(I)) && + "Expect a select instruction"); + Instruction *Cmp = 0; + SelectInst *Select = 0; + + // We must handle the select(cmp()) as a single instruction. Advance to the + // select. + if ((Cmp = dyn_cast(I)) || (Cmp = dyn_cast(I))) { + if (!Cmp->hasOneUse() || !(Select = dyn_cast(*I->use_begin()))) + return ReductionInstDesc(false, I); + return ReductionInstDesc(Select, Prev.MinMaxKind); } + + // Only handle single use cases for now. 
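  // Editorial note on NumCmpSelectPatternInst above: a well-formed min/max
  // update contributes exactly two counted instructions (illustrative IR):
  //
  //   %cmp  = icmp sgt i32 %x, %max             ; counted (1)
  //   %max2 = select i1 %cmp, i32 %x, i32 %max  ; counted (2)
  //
  // A compare with extra users, or a second compare/select in the cycle,
  // makes the count differ from two and the reduction is rejected.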
+ if (!(Select = dyn_cast(I))) + return ReductionInstDesc(false, I); + if (!(Cmp = dyn_cast(I->getOperand(0))) && + !(Cmp = dyn_cast(I->getOperand(0)))) + return ReductionInstDesc(false, I); + if (!Cmp->hasOneUse()) + return ReductionInstDesc(false, I); + + Value *CmpLeft; + Value *CmpRight; + + // Look for a min/max pattern. + if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMin); + else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMax); + else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMax); + else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + + return ReductionInstDesc(false, I); } -bool +LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind) { + ReductionKind Kind, + ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = (FP && I->isCommutative() && I->isAssociative()); - switch (I->getOpcode()) { default: - return false; + return ReductionInstDesc(false, I); case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) - return false; - // possibly. - return true; + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && + Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: case Instruction::Add: - return Kind == RK_IntegerAdd; - case Instruction::SDiv: - case Instruction::UDiv: + return ReductionInstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: - return Kind == RK_IntegerMult; + return ReductionInstDesc(Kind == RK_IntegerMult, I); case Instruction::And: - return Kind == RK_IntegerAnd; + return ReductionInstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: - return Kind == RK_IntegerOr; + return ReductionInstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: - return Kind == RK_IntegerXor; + return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return Kind == RK_FloatMult && FastMath; + return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: - return Kind == RK_FloatAdd && FastMath; - } + return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return isMinMaxSelectCmpPattern(I, Prev); + } } LoopVectorizationLegality::InductionKind @@ -2556,7 +3298,7 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) return IK_NoInduction; - // Check that the PHI is consecutive and starts at zero. + // Check that the PHI is consecutive. 
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
   if (!AR) {
@@ -2583,6 +3325,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
   if (C->getValue()->equalsInt(Size))
     return IK_PtrInduction;
+  else if (C->getValue()->equalsInt(0 - Size))
+    return IK_ReversePtrInduction;
 
   return IK_NoInduction;
 }
@@ -2606,8 +3350,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    // We don't predicate loads/stores at the moment.
-    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+    // We might be able to hoist the load.
+    if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
+      return false;
+
+    // We don't predicate stores at the moment.
+    if (it->mayWriteToMemory() || it->mayThrow())
       return false;
 
     // The instructions below can trap.
@@ -2621,6 +3369,10 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
     }
   }
 
+  // Check that we can actually speculate the hoistable loads.
+  if (!LoadSpeculation.canHoistAllLoads())
+    return false;
+
   return true;
 }
 
@@ -2633,12 +3385,14 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
   return AR->isAffine();
 }
 
-unsigned
+LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                       unsigned UserVF) {
+  // Width 1 means no vectorization.
+  VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
-    return 1;
+    return Factor;
   }
 
   // Find the trip count.
@@ -2657,7 +3411,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
   }
 
   assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
-         " into one vector.");
+         " into one vector!");
 
   unsigned VF = MaxVectorSize;
 
@@ -2666,7 +3420,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     // If we are unable to calculate the trip count then don't try to vectorize.
     if (TC < 2) {
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
-      return 1;
+      return Factor;
    }
 
     // Find the maximum SIMD width that can fit within the trip count.
@@ -2679,7 +3433,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     // zero then we require a tail.
     if (VF < 2) {
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
-      return 1;
+      return Factor;
    }
  }
 
   if (UserVF != 0) {
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
     DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
 
       if (StoreInst *ST = dyn_cast<StoreInst>(it))
         T = ST->getValueOperand()->getType();
 
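// Illustrative source loop for the new IK_ReversePtrInduction kind above (a
// sketch, not part of this patch; assuming 4-byte int, the pointer IV's SCEV
// step is -4, i.e. minus the element size):
//
//   int sumBackwards(const int *A, int N) {
//     int S = 0;
//     for (const int *P = A + N; P != A; )
//       S += *--P;
//     return S;
//   }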
-      // Ignore stored/loaded pointer types.
-      if (T->isPointerTy())
+      // Ignore loaded pointer types and stored pointer types that are not
+      // consecutive. However, we do want to take consecutive stores/loads of
+      // pointer vectors into account.
+      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
         continue;
 
-      MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
+      MaxWidth = std::max(MaxWidth,
+                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
     }
   }
 
@@ -2748,7 +3508,24 @@ unsigned LoopVectorizationCostModel::getWidestType() {
 
 unsigned
 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
-                                               unsigned UserUF) {
+                                               unsigned UserUF,
+                                               unsigned VF,
+                                               unsigned LoopCost) {
+
+  // -- The unroll heuristics --
+  // We unroll the loop in order to expose ILP and reduce the loop overhead.
+  // There are many micro-architectural considerations that we can't predict
+  // at this level. For example, frontend pressure (on decode or fetch) due to
+  // code size, or the number and capabilities of the execution ports.
+  //
+  // We use the following heuristics to select the unroll factor:
+  // 1. If the code has reductions then we unroll in order to break the
+  //    cross-iteration dependency.
+  // 2. If the loop is really small then we unroll in order to reduce the loop
+  //    overhead.
+  // 3. We don't unroll if we think that we will spill registers to memory due
+  //    to the increased register pressure.
+
   // Use the user preference, unless 'auto' is selected.
   if (UserUF != 0)
     return UserUF;
@@ -2781,19 +3558,39 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // fit without causing spills.
   unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
 
-  // We don't want to unroll the loops to the point where they do not fit into
-  // the decoded cache. Assume that we only allow 32 IR instructions.
-  UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions));
-
   // Clamp the unroll factor ranges to reasonable factors.
   unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
-
+
+  // If we did not calculate the cost for VF (because the user selected the VF)
+  // then we calculate the cost of VF here.
+  if (LoopCost == 0)
+    LoopCost = expectedCost(VF);
+
+  // Clamp the calculated UF to be between 1 and the max unroll factor
+  // that the target allows.
   if (UF > MaxUnrollSize)
     UF = MaxUnrollSize;
   else if (UF < 1)
     UF = 1;
 
-  return UF;
+  if (Legal->getReductionVars()->size()) {
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
+    return UF;
+  }
+
+  // We want to unroll tiny loops in order to reduce the loop overhead.
+  // We assume that the cost overhead is 1 and we use the cost model
+  // to estimate the cost of the loop and unroll until the cost of the
+  // loop overhead is about 5% of the cost of the loop.
+  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<".\n");
+  if (LoopCost < 20) {
+    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
+    unsigned NewUF = 20/LoopCost + 1;
+    return std::min(NewUF, UF);
+  }
+
+  DEBUG(dbgs() << "LV: Not unrolling.\n");
+  return 1;
 }
 
 LoopVectorizationCostModel::RegisterUsage
@@ -2920,6 +3717,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
     // For each instruction in the old loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      // Skip dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(it))
+        continue;
+
       unsigned C = getInstructionCost(it, VF);
       Cost += C;
       DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
@@ -2951,9 +3752,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
   // TODO: We need to estimate the cost of intrinsic calls.
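  // Worked example for the small-loop rule above (editorial): a vectorized
  // body with LoopCost = 9 gives NewUF = 20 / 9 + 1 = 3 (integer division),
  // so the loop is unrolled 3x unless the register-pressure bound UF is
  // smaller; at LoopCost >= 20 the loop is not unrolled at all unless it
  // contains reductions.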
switch (I->getOpcode()) { case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't - // generate vector geps. + // We mark this instruction as zero-cost because the cost of GEPs in + // vectorized code depends on whether the corresponding memory instruction + // is scalarized or not. Therefore, we handle GEPs with the memory + // instruction cost. return 0; case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); @@ -2978,14 +3780,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Xor: { + // Certain instructions can be cheaper to vectorize if they have a constant + // second vector operand. One example of this are shifts on x86. + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + + if (isa(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + } case Instruction::Select: { SelectInst *SI = cast(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (ScalarCond) + if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); @@ -2996,83 +3809,61 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } - case Instruction::Store: { - StoreInst *SI = cast(I); - Type *ValTy = SI->getValueOperand()->getType(); + case Instruction::Store: + case Instruction::Load: { + StoreInst *SI = dyn_cast(I); + LoadInst *LI = dyn_cast(I); + Type *ValTy = (SI ? SI->getValueOperand()->getType() : + LI->getType()); VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); + unsigned AS = SI ? SI->getPointerAddressSpace() : + LI->getPointerAddressSpace(); + Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); + // We add the cost of address computation here instead of with the gep + // instruction because only here we know whether the operation is + // scalarized. if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); - - // Scalarized stores. - int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); - bool Reverse = Stride < 0; - if (0 == Stride) { + return TTI.getAddressComputationCost(VectorTy) + + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + + // Scalarized loads/stores. + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { unsigned Cost = 0; - // The cost of extracting from the value vector and pointer vector. 
- Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); for (unsigned i = 0; i < VF; ++i) { - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - i); + // The cost of extracting the pointer operand. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); + // In case of STORE, the cost of ExtractElement from the vector. + // In case of LOAD, the cost of InsertElement into the returned + // vector. + Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : + Instruction::InsertElement, + VectorTy, i); } - // The cost of the scalar stores. + // The cost of the scalar loads/stores. + Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType()); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); + Alignment, AS); return Cost; } - // Wide stores. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + // Wide load/stores. + unsigned Cost = TTI.getAddressComputationCost(VectorTy); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } - case Instruction::Load: { - LoadInst *LI = cast(I); - - if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), - LI->getPointerAddressSpace()); - - // Scalarized loads. - int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); - bool Reverse = Stride < 0; - if (0 == Stride) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); - - // The cost of extracting from the pointer vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); - - // The cost of inserting data to the result vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i); - - // The cost of the scalar stores. - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); - return Cost; - } - - // Wide loads. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); - if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - return Cost; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -3096,13 +3887,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { - assert(isTriviallyVectorizableIntrinsic(I)); - IntrinsicInst *II = cast(I); - Type *RetTy = ToVectorTy(II->getType(), VF); + CallInst *CI = cast(I); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + assert(ID && "Not an intrinsic call!"); + Type *RetTy = ToVectorTy(CI->getType(), VF); SmallVector Tys; - for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) - Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); - return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); + return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } default: { // We are scalarizing the instruction. 
Return the cost of the scalar @@ -3150,4 +3942,14 @@ namespace llvm { } } +bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { + // Check for a store. + if (StoreInst *ST = dyn_cast(Inst)) + return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; + + // Check for a load. + if (LoadInst *LI = dyn_cast(Inst)) + return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; + return false; +}
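
// Editorial appendix: a self-contained model of the scalarized memory-op cost
// computed in getInstructionCost above. The unit costs are illustrative
// parameters, not values queried from TargetTransformInfo.
unsigned scalarizedMemOpCost(unsigned VF,        // vectorization factor
                             unsigned ExtractC,  // extract one pointer lane
                             unsigned LaneC,     // insert/extract one data lane
                             unsigned AddrC,     // scalar address computation
                             unsigned MemC) {    // one scalar load/store
  // Mirrors the per-lane loop: extract the pointer lane, move one data lane,
  // then pay VF scalar address computations and VF scalar memory ops.
  return VF * (ExtractC + LaneC) + VF * AddrC + VF * MemC;
}
// For VF = 4 with unit costs of 1 each: 4 * (1 + 1) + 4 + 4 = 16.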