lib/Transforms/Vectorize/LoopVectorize.h

   1 //===- LoopVectorize.h --- A Loop Vectorizer ------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
  11 // and generates target-independent LLVM-IR. Legalization of the IR is done
  12 // in the codegen. However, the vectorizes uses (will use) the codegen
  13 // interfaces to generate IR that is likely to result in an optimal binary.
  14 //
  15 // The loop vectorizer combines consecutive loop iteration into a single
  16 // 'wide' iteration. After this transformation the index is incremented
  17 // by the SIMD vector width, and not by one.
  18 //
  19 // This pass has three parts:
  20 // 1. The main loop pass that drives the different parts.
  21 // 2. LoopVectorizationLegality - A unit that checks for the legality
  22 //    of the vectorization.
  23 // 3. InnerLoopVectorizer - A unit that performs the actual
  24 //    widening of instructions.
  25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
  26 //    of vectorization. It decides on the optimal vector width, which
  27 //    can be one, if vectorization is not profitable.
  28 //
  29 //===----------------------------------------------------------------------===//
  30 //
  31 // The reduction-variable vectorization is based on the paper:
  32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
  33 //
  34 // Variable uniformity checks are inspired by:
  35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
  36 //
  37 // Other ideas/concepts are from:
  38 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  39 //
  40 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
  41 //  Vectorizing Compilers.
  42 //
  43 //===----------------------------------------------------------------------===//
  44 #ifndef LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
  45 #define LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
  46
  47 #define LV_NAME "loop-vectorize"
  48 #define DEBUG_TYPE LV_NAME
  49
  50 #include "llvm/ADT/DenseMap.h"
  51 #include "llvm/ADT/MapVector.h"
  52 #include "llvm/ADT/SmallPtrSet.h"
  53 #include "llvm/ADT/SmallVector.h"
  54 #include "llvm/Analysis/ScalarEvolution.h"
  55 #include "llvm/IR/IRBuilder.h"
  56 #include <algorithm>
  57 #include <map>
  58
  59 using namespace llvm;
  60
  61 /// We don't vectorize loops with a known constant trip count below this number.
  62 const unsigned TinyTripCountThreshold = 16;
  63
  64 /// When performing a runtime memory check, do not check more than this
  65 /// number of pointers. Notice that the check is quadratic!
  66 const unsigned RuntimeMemoryCheckThreshold = 4;
  67
  68 /// This is the highest vector width that we try to generate.
  69 const unsigned MaxVectorSize = 8;
  70
  71 /// This is the highest Unroll Factor.
  72 const unsigned MaxUnrollSize = 4;
  73
  74 namespace llvm {
  75
  76 // Forward declarations.
  77 class LoopVectorizationLegality;
  78 class LoopVectorizationCostModel;
  79 class TargetTransformInfo;
  80
  81 /// InnerLoopVectorizer vectorizes loops which contain only one basic
  82 /// block to a specified vectorization factor (VF).
  83 /// This class performs the widening of scalars into vectors, or multiple
  84 /// scalars. This class also implements the following features:
  85 /// * It inserts an epilogue loop for handling loops that don't have iteration
  86 ///   counts that are known to be a multiple of the vectorization factor.
  87 /// * It handles the code generation for reduction variables.
  88 /// * Scalarization (implementation using scalars) of un-vectorizable
  89 ///   instructions.
  90 /// InnerLoopVectorizer does not perform any vectorization-legality
  91 /// checks, and relies on the caller to check for the different legality
  92 /// aspects. The InnerLoopVectorizer relies on the
  93 /// LoopVectorizationLegality class to provide information about the induction
  94 /// and reduction variables that were found to a given vectorization factor.
  95 class InnerLoopVectorizer {
  96 public:
  97   /// Ctor.
  98   InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
  99                       DominatorTree *Dt, DataLayout *Dl,
 100                       unsigned VecWidth, unsigned UnrollFactor):
 101   OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
 102   UF(UnrollFactor), Builder(Se->getContext()), Induction(0), OldInduction(0),
 103   WidenMap(UnrollFactor) { }
 104
 105   // Perform the actual loop widening (vectorization).
 106   void vectorize(LoopVectorizationLegality *Legal) {
 107     // Create a new empty loop. Unlink the old loop and connect the new one.
 108     createEmptyLoop(Legal);
 109     // Widen each instruction in the old loop to a new one in the new loop.
 110     // Use the Legality module to find the induction and reduction variables.
 111     vectorizeLoop(Legal);
 112     // Register the new loop and update the analysis passes.
 113     updateAnalysis();
 114   }
 115
 116 private:
 117   /// A small list of PHINodes.
 118   typedef SmallVector<PHINode*, 4> PhiVector;
 119   /// When we unroll loops we have multiple vector values for each scalar.
 120   /// This data structure holds the unrolled and vectorized values that
 121   /// originated from one scalar instruction.
 122   typedef SmallVector<Value*, 2> VectorParts;
 123
 124   /// Add code that checks at runtime if the accessed arrays overlap.
 125   /// Returns the comparator value or NULL if no check is needed.
 126   Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
 127                          Instruction *Loc);
 128   /// Create an empty loop, based on the loop ranges of the old loop.
 129   void createEmptyLoop(LoopVectorizationLegality *Legal);
 130   /// Copy and widen the instructions from the old loop.
 131   void vectorizeLoop(LoopVectorizationLegality *Legal);
 132
 133   /// A helper function that computes the predicate of the block BB, assuming
 134   /// that the header block of the loop is set to True. It returns the *entry*
 135   /// mask for the block BB.
 136   VectorParts createBlockInMask(BasicBlock *BB);
 137   /// A helper function that computes the predicate of the edge between SRC
 138   /// and DST.
 139   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 140
 141   /// A helper function to vectorize a single BB within the innermost loop.
 142   void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
 143                             PhiVector *PV);
 144
 145   /// Insert the new loop to the loop hierarchy and pass manager
 146   /// and update the analysis passes.
 147   void updateAnalysis();
 148
 149   /// This instruction is un-vectorizable. Implement it as a sequence
 150   /// of scalars.
 151   void scalarizeInstruction(Instruction *Instr);
 152
 153   /// Create a broadcast instruction. This method generates a broadcast
 154   /// instruction (shuffle) for loop invariant values and for the induction
 155   /// value. If this is the induction variable then we extend it to N, N+1, ...
 156   /// this is needed because each iteration in the loop corresponds to a SIMD
 157   /// element.
 158   Value *getBroadcastInstrs(Value *V);
 159
 160   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
 161   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
 162   /// The sequence starts at StartIndex.
 163   Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
 164
 165   /// When we go over instructions in the basic block we rely on previous
 166   /// values within the current basic block or on loop invariant values.
 167   /// When we widen (vectorize) values we place them in the map. If the values
 168   /// are not within the map, they have to be loop invariant, so we simply
 169   /// broadcast them into a vector.
 170   VectorParts &getVectorValue(Value *V);
 171
 172   /// Get a uniform vector of constant integers. We use this to get
 173   /// vectors of ones and zeros for the reduction code.
 174   Constant* getUniformVector(unsigned Val, Type* ScalarTy);
 175
 176   /// Generate a shuffle sequence that will reverse the vector Vec.
 177   Value *reverseVector(Value *Vec);
 178
 179   /// This is a helper class that holds the vectorizer state. It maps scalar
 180   /// instructions to vector instructions. When the code is 'unrolled' then
 181   /// then a single scalar value is mapped to multiple vector parts. The parts
 182   /// are stored in the VectorPart type.
 183   struct ValueMap {
 184     /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
 185     /// are mapped.
 186     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
 187
 188     /// \return True if 'Key' is saved in the Value Map.
 189     bool has(Value *Key) { return MapStoreage.count(Key); }
 190
 191     /// Initializes a new entry in the map. Sets all of the vector parts to the
 192     /// save value in 'Val'.
 193     /// \return A reference to a vector with splat values.
 194     VectorParts &splat(Value *Key, Value *Val) {
 195         MapStoreage[Key].clear();
 196         MapStoreage[Key].append(UF, Val);
 197         return MapStoreage[Key];
 198     }
 199
 200     ///\return A reference to the value that is stored at 'Key'.
 201     VectorParts &get(Value *Key) {
 202       if (!has(Key))
 203         MapStoreage[Key].resize(UF);
 204       return MapStoreage[Key];
 205     }
 206
 207     /// The unroll factor. Each entry in the map stores this number of vector
 208     /// elements.
 209     unsigned UF;
 210
 211     /// Map storage. We use std::map and not DenseMap because insertions to a
 212     /// dense map invalidates its iterators.
 213     std::map<Value*, VectorParts> MapStoreage;
 214   };
 215
 216   /// The original loop.
 217   Loop *OrigLoop;
 218   /// Scev analysis to use.
 219   ScalarEvolution *SE;
 220   /// Loop Info.
 221   LoopInfo *LI;
 222   /// Dominator Tree.
 223   DominatorTree *DT;
 224   /// Data Layout.
 225   DataLayout *DL;
 226   /// The vectorization SIMD factor to use. Each vector will have this many
 227   /// vector elements.
 228   unsigned VF;
 229   /// The vectorization unroll factor to use. Each scalar is vectorized to this
 230   /// many different vector instructions.
 231   unsigned UF;
 232
 233   /// The builder that we use
 234   IRBuilder<> Builder;
 235
 236   // --- Vectorization state ---
 237
 238   /// The vector-loop preheader.
 239   BasicBlock *LoopVectorPreHeader;
 240   /// The scalar-loop preheader.
 241   BasicBlock *LoopScalarPreHeader;
 242   /// Middle Block between the vector and the scalar.
 243   BasicBlock *LoopMiddleBlock;
 244   ///The ExitBlock of the scalar loop.
 245   BasicBlock *LoopExitBlock;
 246   ///The vector loop body.
 247   BasicBlock *LoopVectorBody;
 248   ///The scalar loop body.
 249   BasicBlock *LoopScalarBody;
 250   ///The first bypass block.
 251   BasicBlock *LoopBypassBlock;
 252
 253   /// The new Induction variable which was added to the new block.
 254   PHINode *Induction;
 255   /// The induction variable of the old basic block.
 256   PHINode *OldInduction;
 257   /// Maps scalars to widened vectors.
 258   ValueMap WidenMap;
 259 };
 260
 261 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 262 /// to what vectorization factor.
 263 /// This class does not look at the profitability of vectorization, only the
 264 /// legality. This class has two main kinds of checks:
 265 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
 266 ///   will change the order of memory accesses in a way that will change the
 267 ///   correctness of the program.
 268 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
 269 /// checks for a number of different conditions, such as the availability of a
 270 /// single induction variable, that all types are supported and vectorize-able,
 271 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
 272 /// This class is also used by InnerLoopVectorizer for identifying
 273 /// induction variable and the different reduction variables.
 274 class LoopVectorizationLegality {
 275 public:
 276   LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
 277                             DominatorTree *Dt):
 278   TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
 279
 280   /// This enum represents the kinds of reductions that we support.
 281   enum ReductionKind {
 282     NoReduction, /// Not a reduction.
 283     IntegerAdd,  /// Sum of numbers.
 284     IntegerMult, /// Product of numbers.
 285     IntegerOr,   /// Bitwise or logical OR of numbers.
 286     IntegerAnd,  /// Bitwise or logical AND of numbers.
 287     IntegerXor   /// Bitwise or logical XOR of numbers.
 288   };
 289
 290   /// This enum represents the kinds of inductions that we support.
 291   enum InductionKind {
 292     NoInduction,         /// Not an induction variable.
 293     IntInduction,        /// Integer induction variable. Step = 1.
 294     ReverseIntInduction, /// Reverse int induction variable. Step = -1.
 295     PtrInduction         /// Pointer induction variable. Step = sizeof(elem).
 296   };
 297
 298   /// This POD struct holds information about reduction variables.
 299   struct ReductionDescriptor {
 300     // Default C'tor
 301     ReductionDescriptor():
 302     StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
 303
 304     // C'tor.
 305     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
 306     StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
 307
 308     // The starting value of the reduction.
 309     // It does not have to be zero!
 310     Value *StartValue;
 311     // The instruction who's value is used outside the loop.
 312     Instruction *LoopExitInstr;
 313     // The kind of the reduction.
 314     ReductionKind Kind;
 315   };
 316
 317   // This POD struct holds information about the memory runtime legality
 318   // check that a group of pointers do not overlap.
 319   struct RuntimePointerCheck {
 320     RuntimePointerCheck(): Need(false) {}
 321
 322     /// Reset the state of the pointer runtime information.
 323     void reset() {
 324       Need = false;
 325       Pointers.clear();
 326       Starts.clear();
 327       Ends.clear();
 328     }
 329
 330     /// Insert a pointer and calculate the start and end SCEVs.
 331     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
 332
 333     /// This flag indicates if we need to add the runtime check.
 334     bool Need;
 335     /// Holds the pointers that we need to check.
 336     SmallVector<Value*, 2> Pointers;
 337     /// Holds the pointer value at the beginning of the loop.
 338     SmallVector<const SCEV*, 2> Starts;
 339     /// Holds the pointer value at the end of the loop.
 340     SmallVector<const SCEV*, 2> Ends;
 341   };
 342
 343   /// A POD for saving information about induction variables.
 344   struct InductionInfo {
 345     /// Ctors.
 346     InductionInfo(Value *Start, InductionKind K):
 347     StartValue(Start), IK(K) {};
 348     InductionInfo(): StartValue(0), IK(NoInduction) {};
 349     /// Start value.
 350     Value *StartValue;
 351     /// Induction kind.
 352     InductionKind IK;
 353   };
 354
 355   /// ReductionList contains the reduction descriptors for all
 356   /// of the reductions that were found in the loop.
 357   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 358
 359   /// InductionList saves induction variables and maps them to the
 360   /// induction descriptor.
 361   typedef MapVector<PHINode*, InductionInfo> InductionList;
 362
 363   /// Returns true if it is legal to vectorize this loop.
 364   /// This does not mean that it is profitable to vectorize this
 365   /// loop, only that it is legal to do so.
 366   bool canVectorize();
 367
 368   /// Returns the Induction variable.
 369   PHINode *getInduction() {return Induction;}
 370
 371   /// Returns the reduction variables found in the loop.
 372   ReductionList *getReductionVars() { return &Reductions; }
 373
 374   /// Returns the induction variables found in the loop.
 375   InductionList *getInductionVars() { return &Inductions; }
 376
 377   /// Returns True if V is an induction variable in this loop.
 378   bool isInductionVariable(const Value *V);
 379
 380   /// Return true if the block BB needs to be predicated in order for the loop
 381   /// to be vectorized.
 382   bool blockNeedsPredication(BasicBlock *BB);
 383
 384   /// Check if this  pointer is consecutive when vectorizing. This happens
 385   /// when the last index of the GEP is the induction variable, or that the
 386   /// pointer itself is an induction variable.
 387   /// This check allows us to vectorize A[idx] into a wide load/store.
 388   /// Returns:
 389   /// 0 - Stride is unknown or non consecutive.
 390   /// 1 - Address is consecutive.
 391   /// -1 - Address is consecutive, and decreasing.
 392   int isConsecutivePtr(Value *Ptr);
 393
 394   /// Returns true if the value V is uniform within the loop.
 395   bool isUniform(Value *V);
 396
 397   /// Returns true if this instruction will remain scalar after vectorization.
 398   bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
 399
 400   /// Returns the information that we collected about runtime memory check.
 401   RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
 402 private:
 403   /// Check if a single basic block loop is vectorizable.
 404   /// At this point we know that this is a loop with a constant trip count
 405   /// and we only need to check individual instructions.
 406   bool canVectorizeInstrs();
 407
 408   /// When we vectorize loops we may change the order in which
 409   /// we read and write from memory. This method checks if it is
 410   /// legal to vectorize the code, considering only memory constrains.
 411   /// Returns true if the loop is vectorizable
 412   bool canVectorizeMemory();
 413
 414   /// Return true if we can vectorize this loop using the IF-conversion
 415   /// transformation.
 416   bool canVectorizeWithIfConvert();
 417
 418   /// Collect the variables that need to stay uniform after vectorization.
 419   void collectLoopUniforms();
 420
 421   /// Return true if all of the instructions in the block can be speculatively
 422   /// executed.
 423   bool blockCanBePredicated(BasicBlock *BB);
 424
 425   /// Returns True, if 'Phi' is the kind of reduction variable for type
 426   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
 427   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
 428   /// Returns true if the instruction I can be a reduction variable of type
 429   /// 'Kind'.
 430   bool isReductionInstr(Instruction *I, ReductionKind Kind);
 431   /// Returns the induction kind of Phi. This function may return NoInduction
 432   /// if the PHI is not an induction variable.
 433   InductionKind isInductionVariable(PHINode *Phi);
 434   /// Return true if can compute the address bounds of Ptr within the loop.
 435   bool hasComputableBounds(Value *Ptr);
 436
 437   /// The loop that we evaluate.
 438   Loop *TheLoop;
 439   /// Scev analysis.
 440   ScalarEvolution *SE;
 441   /// DataLayout analysis.
 442   DataLayout *DL;
 443   // Dominators.
 444   DominatorTree *DT;
 445
 446   //  ---  vectorization state --- //
 447
 448   /// Holds the integer induction variable. This is the counter of the
 449   /// loop.
 450   PHINode *Induction;
 451   /// Holds the reduction variables.
 452   ReductionList Reductions;
 453   /// Holds all of the induction variables that we found in the loop.
 454   /// Notice that inductions don't need to start at zero and that induction
 455   /// variables can be pointers.
 456   InductionList Inductions;
 457
 458   /// Allowed outside users. This holds the reduction
 459   /// vars which can be accessed from outside the loop.
 460   SmallPtrSet<Value*, 4> AllowedExit;
 461   /// This set holds the variables which are known to be uniform after
 462   /// vectorization.
 463   SmallPtrSet<Instruction*, 4> Uniforms;
 464   /// We need to check that all of the pointers in this list are disjoint
 465   /// at runtime.
 466   RuntimePointerCheck PtrRtCheck;
 467 };
 468
 469 /// LoopVectorizationCostModel - estimates the expected speedups due to
 470 /// vectorization.
 471 /// In many cases vectorization is not profitable. This can happen because of
 472 /// a number of reasons. In this class we mainly attempt to predict the
 473 /// expected speedup/slowdowns due to the supported instruction set. We use the
 474 /// TargetTransformInfo to query the different backends for the cost of
 475 /// different operations.
 476 class LoopVectorizationCostModel {
 477 public:
 478   /// C'tor.
 479   LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo *Li,
 480                              LoopVectorizationLegality *Leg,
 481                              const TargetTransformInfo *Tti):
 482   TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), TTI(Tti) { }
 483
 484   /// \return The most profitable vectorization factor.
 485   /// This method checks every power of two up to VF. If UserVF is not ZERO
 486   /// then this vectorization factor will be selected if vectorization is
 487   /// possible.
 488   unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
 489
 490
 491   /// \return The most profitable unroll factor.
 492   /// If UserUF is non-zero then this method finds the best unroll-factor
 493   /// based on register pressure and other parameters.
 494   unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
 495
 496   /// \brief A struct that represents some properties of the register usage
 497   /// of a loop.
 498   struct RegisterUsage {
 499     /// Holds the number of loop invariant values that are used in the loop.
 500     unsigned LoopInvariantRegs;
 501     /// Holds the maximum number of concurrent live intervals in the loop.
 502     unsigned MaxLocalUsers;
 503     /// Holds the number of instructions in the loop.
 504     unsigned NumInstructions;
 505   };
 506
 507   /// \return  information about the register usage of the loop.
 508   RegisterUsage calculateRegisterUsage();
 509
 510 private:
 511   /// Returns the expected execution cost. The unit of the cost does
 512   /// not matter because we use the 'cost' units to compare different
 513   /// vector widths. The cost that is returned is *not* normalized by
 514   /// the factor width.
 515   unsigned expectedCost(unsigned VF);
 516
 517   /// Returns the execution time cost of an instruction for a given vector
 518   /// width. Vector width of one means scalar.
 519   unsigned getInstructionCost(Instruction *I, unsigned VF);
 520
 521   /// A helper function for converting Scalar types to vector types.
 522   /// If the incoming type is void, we return void. If the VF is 1, we return
 523   /// the scalar type.
 524   static Type* ToVectorTy(Type *Scalar, unsigned VF);
 525
 526   /// The loop that we evaluate.
 527   Loop *TheLoop;
 528   /// Scev analysis.
 529   ScalarEvolution *SE;
 530   /// Loop Info analysis.
 531   LoopInfo *LI;
 532   /// Vectorization legality.
 533   LoopVectorizationLegality *Legal;
 534   /// Vector target information.
 535   const TargetTransformInfo *TTI;
 536 };
 537
 538 }// namespace llvm
 539
 540 #endif //LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
 541