diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 19dcccc6eec..17c25dfffc1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,7 +48,6 @@
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
@@ -63,6 +62,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -101,6 +101,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include <algorithm>
+#include <functional>
 #include <map>
 #include <tuple>
 
@@ -125,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                                       "trip count that is smaller than this "
                                       "value."));
 
+static cl::opt<bool> MaximizeBandwidth(
+    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+    cl::desc("Maximize bandwidth when selecting vectorization factor which "
+             "will be determined by the smallest type in loop."));
+
 /// This enables versioning on the strides of symbolically striding memory
 /// accesses in code like the following.
 ///   for (i = 0; i < N; ++i)
@@ -138,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
 /// ...
 static cl::opt<bool> EnableMemAccessVersioning(
     "enable-mem-access-versioning", cl::init(true), cl::Hidden,
-    cl::desc("Enable symblic stride memory access versioning"));
+    cl::desc("Enable symbolic stride memory access versioning"));
 
 static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
@@ -221,6 +227,15 @@ static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
     cl::desc("The maximum allowed number of runtime memory checks with a "
              "vectorize(enable) pragma."));
 
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed with a "
+             "vectorize(enable) pragma"));
+
 namespace {
 
 // Forward declarations.
@@ -253,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
+/// A helper function that returns the GEP instruction and knows to skip a
+/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
+/// pointee types of the 'bitcast' have the same size.
+/// For example:
+/// bitcast double** %var to i64* - can be skipped
+/// bitcast double** %var to i8* - cannot
+static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
+
+  if (isa<GetElementPtrInst>(Ptr))
+    return cast<GetElementPtrInst>(Ptr);
+
+  if (isa<BitCastInst>(Ptr) &&
+      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
+    Type *BitcastTy = Ptr->getType();
+    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
+    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
+      return nullptr;
+    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
+    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
+    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
+    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
+      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
+  }
+  return nullptr;
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -269,18 +310,24 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {
 /// and reduction variables that were found to a given vectorization factor.
 class InnerLoopVectorizer {
 public:
-  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, const TargetLibraryInfo *TLI,
+  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                      LoopInfo *LI, DominatorTree *DT,
+                      const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, unsigned VecWidth,
                       unsigned UnrollFactor)
-      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
         TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
         AddedSafetyChecks(false) {}
 
   // Perform the actual loop widening (vectorization).
-  void vectorize(LoopVectorizationLegality *L) {
+  // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
+  // can be validly truncated to. The cost model has assumed this truncation
+  // will happen when vectorizing.
+  void vectorize(LoopVectorizationLegality *L,
+                 MapVector<Instruction *, uint64_t> MinimumBitWidths) {
+    MinBWs = MinimumBitWidths;
     Legal = L;
     // Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop();
@@ -309,12 +356,6 @@ protected:
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts>
     EdgeMaskCache;
 
-  /// \brief Add checks for strides that were assumed to be 1.
-  ///
-  /// Returns the last check instruction and the first check instruction in the
-  /// pair as (first, last).
-  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
-
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop();
   /// Create a new induction variable inside L.
@@ -329,6 +370,9 @@
   /// See PR14725.
   void fixLCSSAPHIs();
 
+  /// Shrinks vector element sizes based on information in "MinBWs".
+  void truncateToMinimalBitwidths();
+
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True. It returns the *entry*
   /// mask for the block BB.
@@ -339,7 +383,7 @@
 
   /// A helper function to vectorize a single BB within the innermost loop.
   void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
-  
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -395,11 +439,12 @@ protected: void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); /// Emit a bypass check to see if the vector trip count is nonzero. void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); - /// Emit bypass checks to check if strides we've assumed to be one really are. - void emitStrideChecks(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if all of the SCEV assumptions we've + /// had to make are correct. + void emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); - + /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts @@ -442,8 +487,10 @@ protected: /// The original loop. Loop *OrigLoop; - /// Scev analysis to use. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + PredicatedScalarEvolution &PSE; /// Loop Info. LoopInfo *LI; /// Dominator Tree. @@ -499,6 +546,10 @@ protected: /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) Value *VectorTripCount; + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector MinBWs; LoopVectorizationLegality *Legal; // Record whether runtime check is added. @@ -507,10 +558,11 @@ protected: class InnerLoopUnroller : public InnerLoopVectorizer { public: - InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, + InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned UnrollFactor) - : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {} private: void scalarizeInstruction(Instruction *Instr, @@ -591,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) { } /// \brief Propagate known metadata from one instruction to a vector of others. -static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast(V)) propagateMetadata(I, From); @@ -731,8 +784,9 @@ private: /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) - : SE(SE), TheLoop(L), DT(DT) {} + InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, + DominatorTree *DT) + : PSE(PSE), TheLoop(L), DT(DT) {} ~InterleavedAccessInfo() { SmallSet DelSet; @@ -762,7 +816,11 @@ public: } private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. + /// Simplifies SCEV expressions in the context of existing SCEV assumptions. + /// The interleaved access analysis can also add new predicates (for example + /// by versioning strides of pointers). 
+ PredicatedScalarEvolution &PSE; Loop *TheLoop; DominatorTree *DT; @@ -1123,14 +1181,15 @@ static void emitMissedWarning(Function *F, Loop *L, /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, const TargetTransformInfo *TTI, + LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, LoopAccessAnalysis *LAA, LoopVectorizationRequirements *R, const LoopVectorizeHints *H) - : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H) {} @@ -1162,6 +1221,9 @@ public: /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1213,12 +1275,12 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -1276,8 +1338,12 @@ private: /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. 
TargetLibraryInfo *TLI; /// Parent function @@ -1331,7 +1397,7 @@ private: /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic - SmallPtrSet MaskedOp; + SmallPtrSet MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1346,10 +1412,11 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints, + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints, SmallPtrSetImpl &ValuesToIgnore) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} /// Information about vectorization costs @@ -1363,10 +1430,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1393,8 +1460,10 @@ public: unsigned NumInstructions; }; - /// \return information about the register usage of the loop. - RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector + calculateRegisterUsage(const SmallVector &VFs); private: /// Returns the expected execution cost. The unit of the cost does @@ -1419,6 +1488,12 @@ private: emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector MinBWs; + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -1431,6 +1506,8 @@ private: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis + DemandedBits *DB; const Function *TheFunction; // Loop Vectorize Hint. const LoopVectorizeHints *Hints; @@ -1523,6 +1600,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1542,6 +1620,7 @@ struct LoopVectorize : public FunctionPass { AA = &getAnalysis().getAAResults(); AC = &getAnalysis().getAssumptionCache(F); LAA = &getAnalysis(); + DB = &getAnalysis(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1666,9 +1745,11 @@ struct LoopVectorize : public FunctionPass { } } + PredicatedScalarEvolution PSE(*SE); + // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); @@ -1687,8 +1768,8 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints, - ValuesToIgnore); + LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC, + F, &Hints, ValuesToIgnore); // Check the function attributes to find out if this function should be // optimized for size. @@ -1799,16 +1880,16 @@ struct LoopVectorize : public FunctionPass { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop then // interleave it. - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); - Unroller.vectorize(&LVL); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); + Unroller.vectorize(&LVL, CM.MinBWs); emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), Twine("interleaved loop (interleaved count: ") + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1842,6 +1923,7 @@ struct LoopVectorize : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -1907,6 +1989,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1918,7 +2001,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -1936,7 +2019,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; InductionDescriptor II = Inductions[Phi]; @@ -1949,14 +2032,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. 
@@ -1967,7 +2050,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast(Last)) Last = @@ -2311,7 +2394,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2325,8 +2408,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; @@ -2343,7 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2371,7 +2456,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = @@ -2432,7 +2517,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -2494,7 +2580,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2525,56 +2612,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast(V)) - return I->getParent() == Loc->getParent() ? 
I : nullptr; - return nullptr; -} - -std::pair -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair(tnullptr, tnullptr); - - IRBuilder<> ChkBuilder(Loc); - - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); - - return std::make_pair(FirstInst, TheCheck); -} - -PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, - Value *Start, - Value *End, - Value *Step, +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, + Value *End, Value *Step, Instruction *DL) { BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -2582,8 +2621,8 @@ PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, // yet. If so, use the header as this will be a single block loop. if (!Latch) Latch = Header; - - IRBuilder<> Builder(Header->getFirstInsertionPt()); + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); @@ -2609,8 +2648,10 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); - assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count"); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -2625,9 +2666,8 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); // Get the total trip count from the count by adding 1. - const SCEV *ExitCount = - SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); @@ -2710,26 +2750,28 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, LoopBypassBlocks.push_back(BB); } -void InnerLoopVectorizer::emitStrideChecks(Loop *L, - BasicBlock *Bypass) { +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { BasicBlock *BB = L->getLoopPreheader(); - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a + + // Generate the code to check that the SCEV assumptions that we made. 
+ // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. - Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(BB->getTerminator()); - if (!StrideCheck) - return; + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + + if (auto *C = dyn_cast(SCEVCheck)) + if (C->isZero()) + return; // Create a new block containing the stride check. - BB->setName("vector.stridecheck"); + BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, StrideCheck)); + BranchInst::Create(Bypass, NewBB, SCEVCheck)); LoopBypassBlocks.push_back(BB); AddedSafetyChecks = true; } @@ -2849,10 +2891,10 @@ void InnerLoopVectorizer::createEmptyLoop() { // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. emitVectorLoopEnteredCheck(Lp, ScalarPH); - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - emitStrideChecks(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, ScalarPH); + // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. @@ -2925,7 +2967,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BranchInst::Create(ExitBlock, ScalarPH, CmpN)); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. LoopVectorPreHeader = Lp->getLoopPreheader(); @@ -2981,7 +3023,7 @@ static void cse(SmallVector &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3103,6 +3145,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. 
+ // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3133,6 +3286,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. 
The PHI nodes are currently empty because we did @@ -3148,7 +3306,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3223,7 +3381,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); @@ -3247,7 +3405,7 @@ void InnerLoopVectorizer::vectorizeLoop() { ++UI; } } - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); for (unsigned part = 0; part < UF; ++part) RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } @@ -3361,8 +3519,8 @@ void InnerLoopVectorizer::vectorizeLoop() { // Predicate any stores. for (auto KV : PredicatedStores) { BasicBlock::iterator I(KV.first); - auto *BB = SplitBlock(I->getParent(), std::next(I), DT, LI); - auto *T = SplitBlockAndInsertIfThen(KV.second, I, /*Unreachable=*/false, + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT); I->moveBefore(T); I->getParent()->setName("pred.store.if"); @@ -3443,18 +3601,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3510,7 +3668,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: { - assert(P->getType() == II.getStartValue()->getType() && "Types must match"); + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); // Handle other induction variables that are now based on the // canonical one. Value *V = Induction; @@ -3565,7 +3724,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. 
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3573,7 +3733,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. - widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3611,16 +3771,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); - setDebugLocFromInst(Builder, it); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3629,7 +3790,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3640,7 +3801,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3649,25 +3810,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3682,7 +3845,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. 
sext/zext may wrap, @@ -3692,12 +3855,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.getStepValue()->getSExtValue()); + InductionDescriptor II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3707,7 +3871,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3715,7 +3879,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast(it); @@ -3731,7 +3895,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3742,7 +3906,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3783,13 +3947,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3797,7 +3961,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -3949,10 +4113,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. 
- const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3986,7 +4150,19 @@ bool LoopVectorizationLegality::canVectorize() { // Analyze interleaved memory accesses. if (UseInterleaved) - InterleaveInfo.analyzeInterleaving(Strides); + InterleaveInfo.analyzeInterleaving(Strides); + + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + emitAnalysis(VectorizationReport() + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + return false; + } // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with @@ -4057,7 +4233,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -4069,9 +4245,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -4079,14 +4255,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) @@ -4111,8 +4287,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. 
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4121,16 +4297,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, - Reductions[Phi])) { - if (Reductions[Phi].hasUnsafeAlgebra()) - Requirements->addUnsafeAlgebraInst( - Reductions[Phi].getUnsafeAlgebraInst()); - AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { + if (RedDes.hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4145,8 +4321,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4155,8 +4331,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4167,7 +4344,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4191,8 +4368,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4228,7 +4405,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4254,7 +4431,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4292,6 +4469,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); return true; } @@ -4347,8 +4525,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4406,7 +4584,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4415,7 +4593,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4461,6 +4639,8 @@ void InterleavedAccessInfo::analyzeInterleaving( // Holds all interleaved store groups temporarily. SmallSetVector StoreGroups; + // Holds all interleaved load groups temporarily. + SmallSetVector LoadGroups; // Search the load-load/write-write pair B-A in bottom-up order and try to // insert B into the interleave group of A according to 3 rules: @@ -4488,6 +4668,8 @@ void InterleavedAccessInfo::analyzeInterleaving( if (A->mayWriteToMemory()) StoreGroups.insert(Group); + else + LoadGroups.insert(Group); for (auto II = std::next(I); II != E; ++II) { Instruction *B = II->first; @@ -4502,12 +4684,12 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; - int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + int DistanceToA = DistToA->getAPInt().getSExtValue(); // Skip if the distance is not multiple of size as they are not in the // same group. 
@@ -4535,6 +4717,12 @@ void InterleavedAccessInfo::analyzeInterleaving( for (InterleaveGroup *Group : StoreGroups) if (Group->getNumMembers() != Group->getFactor()) releaseGroup(Group); + + // Remove interleaved load groups that don't have the first and last member. + // This guarantees that we won't do speculative out of bounds loads. + for (InterleaveGroup *Group : LoadGroups) + if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1)) + releaseGroup(Group); } LoopVectorizationCostModel::VectorizationFactor @@ -4562,7 +4750,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4570,7 +4760,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4583,6 +4775,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4658,7 +4870,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4672,7 +4886,7 @@ unsigned LoopVectorizationCostModel::getWidestType() { Type *T = it->getType(); // Skip ignored values. - if (ValuesToIgnore.count(it)) + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. @@ -4682,7 +4896,7 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Examine PHI nodes that are reduction variables. Update the type to // account for the recurrence type. 
if (PHINode *PN = dyn_cast(it)) { - if (!Legal->getReductionVars()->count(PN)) + if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; T = RdxDesc.getRecurrenceType(); @@ -4695,15 +4909,17 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4749,7 +4965,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4847,8 +5063,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4859,8 +5074,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4881,8 +5097,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -4901,15 +5117,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() { unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + RU.NumInstructions += (*bb)->size(); + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. 
@@ -4938,10 +5152,26 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
       TransposeEnds[it->second].push_back(it->first);
 
   SmallSet<Instruction *, 8> OpenIntervals;
-  unsigned MaxUsage = 0;
 
+  // Get the size of the widest register.
+  unsigned MaxSafeDepDist = -1U;
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+  unsigned WidestRegister =
+      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+  SmallVector<RegisterUsage, 8> RUs(VFs.size());
+  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
 
   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+  // A lambda that gets the register usage for the given type and VF.
+  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
+    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
+    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
+  };
+
   for (unsigned int i = 0; i < Index; ++i) {
     Instruction *I = IdxToInstr[i];
     // Ignore instructions that are never used within the loop.
@@ -4953,27 +5183,50 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
     // Remove all of the instructions that end at this location.
     InstrList &List = TransposeEnds[i];
-    for (unsigned int j=0, e = List.size(); j < e; ++j)
+    for (unsigned int j = 0, e = List.size(); j < e; ++j)
       OpenIntervals.erase(List[j]);
 
-    // Count the number of live interals.
-    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+    // For each VF find the maximum usage of registers.
+    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+      if (VFs[j] == 1) {
+        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
+        continue;
+      }
+
+      // Count the number of live intervals.
+      unsigned RegUsage = 0;
+      for (auto Inst : OpenIntervals)
+        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
+    }
 
-    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
-          OpenIntervals.size() << '\n');
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+                 << OpenIntervals.size() << '\n');
 
     // Add the current instruction to the list of open intervals.
     OpenIntervals.insert(I);
   }
 
-  unsigned Invariant = LoopInvariants.size();
-  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
-  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
-  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
+  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+    unsigned Invariant = 0;
+    if (VFs[i] == 1)
+      Invariant = LoopInvariants.size();
+    else {
+      for (auto Inst : LoopInvariants)
+        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+    }
+
+    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
+    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
+    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
+
+    RU.LoopInvariantRegs = Invariant;
+    RU.MaxLocalUsers = MaxUsages[i];
+    RUs[i] = RU;
+  }
 
-  R.LoopInvariantRegs = Invariant;
-  R.MaxLocalUsers = MaxUsage;
-  return R;
+  return RUs;
 }
 
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
@@ -4992,10 +5245,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
       continue;
 
       // Skip ignored values.
-      if (ValuesToIgnore.count(it))
+      if (ValuesToIgnore.count(&*it))
        continue;
 
-      unsigned C = getInstructionCost(it, VF);
+      unsigned C = getInstructionCost(&*it, VF);
 
      // Check if we should override the cost.
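Editor's note: the GetRegUsage lambda above is the core of the new per-VF pressure estimate; a value whose scalar type is TypeSize bits wide, vectorized by VF, is charged VF * TypeSize / WidestRegister registers, and always at least one. A standalone C++ sketch with illustrative numbers (the 128-bit register width is hypothetical):

#include <algorithm>
#include <cstdio>

static unsigned regUsage(unsigned TypeSizeInBits, unsigned VF,
                         unsigned WidestRegister) {
  // std::max needs the explicit <unsigned> because 1 is an int literal.
  return std::max<unsigned>(1, VF * TypeSizeInBits / WidestRegister);
}

int main() {
  const unsigned WidestRegister = 128; // hypothetical 128-bit registers
  // An i32 live value at VF=8 spans two 128-bit registers; an i8 value at
  // the same VF still occupies one (8*8 = 64 bits divides down to 0).
  printf("i32 x VF=8 -> %u registers\n", regUsage(32, 8, WidestRegister)); // 2
  printf("i8  x VF=8 -> %u registers\n", regUsage(8, 8, WidestRegister));  // 1
  return 0;
}

This is why the narrow-type VFs enabled by MaximizeBandwidth can still fit the register budget: each narrow live value keeps costing one register until its vector width actually exceeds a physical register.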
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -5060,7 +5313,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
   if (!C)
     return true;
 
-  const APInt &APStepVal = C->getValue()->getValue();
+  const APInt &APStepVal = C->getAPInt();
 
   // Huge step value - give up.
   if (APStepVal.getBitWidth() > 64)
@@ -5072,9 +5325,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,
 }
 
 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
-  if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
-    return true;
-  return false;
+  return Legal->hasStride(I->getOperand(0)) ||
+         Legal->hasStride(I->getOperand(1));
 }
 
 unsigned
@@ -5085,6 +5337,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     VF = 1;
 
   Type *RetTy = I->getType();
+  if (VF > 1 && MinBWs.count(I))
+    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
   // TODO: We need to estimate the cost of intrinsic calls.
@@ -5167,6 +5421,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
+    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+    auto It = MinBWs.find(Op0AsInstruction);
+    if (VF > 1 && It != MinBWs.end())
+      ValTy = IntegerType::get(ValTy->getContext(), It->second);
     VectorTy = ToVectorTy(ValTy, VF);
     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
   }
@@ -5290,8 +5548,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
         Legal->isInductionVariable(I->getOperand(0)))
       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                   I->getOperand(0)->getType());
-
-    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+
+    Type *SrcScalarTy = I->getOperand(0)->getType();
+    Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
+    if (VF > 1 && MinBWs.count(I)) {
+      // This cast is going to be shrunk. This may remove the cast or it might
+      // turn it into a slightly different cast. For example, if MinBW == 16,
+      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+      //
+      // Calculate the modified src and dest types.
+      Type *MinVecTy = VectorTy;
+      if (I->getOpcode() == Instruction::Trunc) {
+        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+        VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
+                                            MinVecTy);
+      } else if (I->getOpcode() == Instruction::ZExt ||
+                 I->getOpcode() == Instruction::SExt) {
+        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+        VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
+                                             MinVecTy);
+      }
+    }
+
     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
@@ -5342,6 +5620,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBits)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
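Editor's note: the cast-cost hunk above re-costs a cast that the MinBWs analysis has proven shrinkable. Following only the example given in the patch's own comment, an extension whose result needs just MinBW bits is costed with its destination clamped down to MinBW, so "zext i8 to i32" with MinBW == 16 is modelled as "zext i8 to i16". A minimal C++ sketch of that clamp; the widths and the helper name are hypothetical, standing in for the patch's smallest/largestIntegerVectorType helpers:

#include <algorithm>
#include <cstdio>

static unsigned clampExtDest(unsigned DstBits, unsigned MinBW) {
  // Mirrors the "smallest type" choice: keep the narrower of the two widths.
  return std::min(DstBits, MinBW);
}

int main() {
  unsigned SrcBits = 8, DstBits = 32, MinBW = 16; // hypothetical inputs
  printf("cost modelled as zext i%u to i%u\n", SrcBits,
         clampExtDest(DstBits, MinBW)); // zext i8 to i16
  return 0;
}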