1 //===- LoopVectorize.h --- A Loop Vectorizer ------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR. Legalization of the IR is done
12 // in the codegen. However, the vectorizer uses (will use) the codegen
13 // interfaces to generate IR that is likely to result in an optimal binary.
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 // of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 // widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 // of vectorization. It decides on the optimal vector width, which
27 // can be one, if vectorization is not profitable.
29 //===----------------------------------------------------------------------===//
31 // The reduction-variable vectorization is based on the paper:
32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
34 // Variable uniformity checks are inspired by:
35 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
37 // Other ideas/concepts are from:
38 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
40 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
41 // Vectorizing Compilers.
43 //===----------------------------------------------------------------------===//
44 #ifndef LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
45 #define LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
47 #define LV_NAME "loop-vectorize"
48 #define DEBUG_TYPE LV_NAME
50 #include "llvm/ADT/DenseMap.h"
51 #include "llvm/ADT/MapVector.h"
52 #include "llvm/ADT/SmallPtrSet.h"
53 #include "llvm/ADT/SmallVector.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/IR/IRBuilder.h"
61 /// We don't vectorize loops with a known constant trip count below this number.
62 const unsigned TinyTripCountThreshold = 16;
64 /// When performing a runtime memory check, do not check more than this
65 /// number of pointers. Notice that the check is quadratic in the number of pointers!
66 const unsigned RuntimeMemoryCheckThreshold = 4;
68 /// This is the highest vector width that we try to generate.
69 const unsigned MaxVectorSize = 8;
71 /// This is the highest unroll factor (UF).
72 const unsigned MaxUnrollSize = 4;
76 // Forward declarations.
77 class LoopVectorizationLegality;
78 class LoopVectorizationCostModel;
79 class TargetTransformInfo;
81 /// InnerLoopVectorizer vectorizes loops which contain only one basic
82 /// block to a specified vectorization factor (VF).
83 /// This class performs the widening of scalars into vectors, or multiple
84 /// scalars. This class also implements the following features:
85 /// * It inserts an epilogue loop for handling loops that don't have iteration
86 /// counts that are known to be a multiple of the vectorization factor.
87 /// * It handles the code generation for reduction variables.
88 /// * Scalarization (implementation using scalars) of un-vectorizable instructions.
90 /// InnerLoopVectorizer does not perform any vectorization-legality
91 /// checks, and relies on the caller to check for the different legality
92 /// aspects. The InnerLoopVectorizer relies on the
93 /// LoopVectorizationLegality class to provide information about the induction
94 /// and reduction variables that were found for a given vectorization factor.
95 class InnerLoopVectorizer {
98 InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
99 DominatorTree *Dt, DataLayout *Dl,
100 unsigned VecWidth, unsigned UnrollFactor):
101 OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
102 UF(UnrollFactor), Builder(Se->getContext()), Induction(0), OldInduction(0),
103 WidenMap(UnrollFactor) { }
105 // Perform the actual loop widening (vectorization).
106 void vectorize(LoopVectorizationLegality *Legal) {
107 // Create a new empty loop. Unlink the old loop and connect the new one.
108 createEmptyLoop(Legal);
109 // Widen each instruction in the old loop to a new one in the new loop.
110 // Use the Legality module to find the induction and reduction variables.
111 vectorizeLoop(Legal);
112 // Register the new loop and update the analysis passes.
117 /// A small list of PHINodes.
118 typedef SmallVector<PHINode*, 4> PhiVector;
119 /// When we unroll loops we have multiple vector values for each scalar.
120 /// This data structure holds the unrolled and vectorized values that
121 /// originated from one scalar instruction.
122 typedef SmallVector<Value*, 2> VectorParts;
124 /// Add code that checks at runtime if the accessed arrays overlap.
125 /// Returns the comparator value or NULL if no check is needed.
126 Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
128 /// Create an empty loop, based on the loop ranges of the old loop.
129 void createEmptyLoop(LoopVectorizationLegality *Legal);
130 /// Copy and widen the instructions from the old loop.
131 void vectorizeLoop(LoopVectorizationLegality *Legal);
133 /// A helper function that computes the predicate of the block BB, assuming
134 /// that the header block of the loop is set to True. It returns the *entry*
135 /// mask for the block BB.
136 VectorParts createBlockInMask(BasicBlock *BB);
137 /// A helper function that computes the predicate of the edge between SRC and DST.
139 VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
141 /// A helper function to vectorize a single BB within the innermost loop.
142 void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
145 /// Insert the new loop to the loop hierarchy and pass manager
146 /// and update the analysis passes.
147 void updateAnalysis();
149 /// This instruction is un-vectorizable. Implement it as a sequence of scalars.
151 void scalarizeInstruction(Instruction *Instr);
153 /// Create a broadcast instruction. This method generates a broadcast
154 /// instruction (shuffle) for loop invariant values and for the induction
155 /// value. If this is the induction variable then we extend it to N, N+1, ...
156 /// This is needed because each iteration in the loop corresponds to a SIMD element.
158 Value *getBroadcastInstrs(Value *V);
160 /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
161 /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
162 /// The sequence starts at StartIdx.
163 Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
165 /// When we go over instructions in the basic block we rely on previous
166 /// values within the current basic block or on loop invariant values.
167 /// When we widen (vectorize) values we place them in the map. If the values
168 /// are not within the map, they have to be loop invariant, so we simply
169 /// broadcast them into a vector.
170 VectorParts &getVectorValue(Value *V);
172 /// Get a uniform vector of constant integers. We use this to get
173 /// vectors of ones and zeros for the reduction code.
174 Constant* getUniformVector(unsigned Val, Type* ScalarTy);
176 /// Generate a shuffle sequence that will reverse the vector Vec.
177 Value *reverseVector(Value *Vec);
179 /// This is a helper class that holds the vectorizer state. It maps scalar
180 /// instructions to vector instructions. When the code is 'unrolled' then
181 /// a single scalar value is mapped to multiple vector parts. The parts
182 /// are stored in the VectorParts type.
184 /// C'tor. UnrollFactor controls the number of vectors ('parts') that each entry holds.
186 ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
188 /// \return True if 'Key' is saved in the Value Map.
189 bool has(Value *Key) { return MapStoreage.count(Key); }
191 /// Initializes a new entry in the map. Sets all of the vector parts to the
192 /// same value in 'Val'.
193 /// \return A reference to a vector with splat values.
194 VectorParts &splat(Value *Key, Value *Val) {
195 MapStoreage[Key].clear();
196 MapStoreage[Key].append(UF, Val);
197 return MapStoreage[Key];
200 /// \return A reference to the value that is stored at 'Key'.
201 VectorParts &get(Value *Key) {
203 MapStoreage[Key].resize(UF);
204 return MapStoreage[Key];
207 /// The unroll factor. Each entry in the map stores this number of vector parts.
211 /// Map storage. We use std::map and not DenseMap because insertions to a
212 /// dense map invalidate its iterators.
213 std::map<Value*, VectorParts> MapStoreage;
216 /// The original loop.
218 /// Scalar Evolution (SCEV) analysis to use.
226 /// The vectorization SIMD factor to use. Each vector will have this many elements.
229 /// The vectorization unroll factor to use. Each scalar is vectorized to this
230 /// many different vector instructions.
233 /// The builder that we use.
236 // --- Vectorization state ---
238 /// The vector-loop preheader.
239 BasicBlock *LoopVectorPreHeader;
240 /// The scalar-loop preheader.
241 BasicBlock *LoopScalarPreHeader;
242 /// Middle Block between the vector and the scalar.
243 BasicBlock *LoopMiddleBlock;
244 /// The ExitBlock of the scalar loop.
245 BasicBlock *LoopExitBlock;
246 /// The vector loop body.
247 BasicBlock *LoopVectorBody;
248 /// The scalar loop body.
249 BasicBlock *LoopScalarBody;
250 /// The first bypass block.
251 BasicBlock *LoopBypassBlock;
253 /// The new Induction variable which was added to the new block.
255 /// The induction variable of the old basic block.
256 PHINode *OldInduction;
257 /// Maps scalars to widened vectors.
261 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
262 /// to what vectorization factor.
263 /// This class does not look at the profitability of vectorization, only the
264 /// legality. This class has two main kinds of checks:
265 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
266 /// will change the order of memory accesses in a way that will change the
267 /// correctness of the program.
268 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
269 /// checks for a number of different conditions, such as the availability of a
270 /// single induction variable, that all types are supported and vectorizable,
271 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
272 /// This class is also used by InnerLoopVectorizer for identifying the
273 /// induction variable and the different reduction variables.
274 class LoopVectorizationLegality {
276 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
278 TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
280 /// This enum represents the kinds of reductions that we support.
282 NoReduction, /// Not a reduction.
283 IntegerAdd, /// Sum of numbers.
284 IntegerMult, /// Product of numbers.
285 IntegerOr, /// Bitwise or logical OR of numbers.
286 IntegerAnd, /// Bitwise or logical AND of numbers.
287 IntegerXor /// Bitwise or logical XOR of numbers.
290 /// This enum represents the kinds of inductions that we support.
292 NoInduction, /// Not an induction variable.
293 IntInduction, /// Integer induction variable. Step = 1.
294 ReverseIntInduction, /// Reverse int induction variable. Step = -1.
295 PtrInduction /// Pointer induction variable. Step = sizeof(elem).
298 /// This POD struct holds information about reduction variables.
299 struct ReductionDescriptor {
301 ReductionDescriptor():
302 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
305 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
306 StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
308 // The starting value of the reduction.
309 // It does not have to be zero!
311 // The instruction whose value is used outside the loop.
312 Instruction *LoopExitInstr;
313 // The kind of the reduction.
317 // This POD struct holds information about the memory runtime legality
318 // check that a group of pointers do not overlap.
319 struct RuntimePointerCheck {
320 RuntimePointerCheck(): Need(false) {}
322 /// Reset the state of the pointer runtime information.
330 /// Insert a pointer and calculate the start and end SCEVs.
331 void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
333 /// This flag indicates if we need to add the runtime check.
335 /// Holds the pointers that we need to check.
336 SmallVector<Value*, 2> Pointers;
337 /// Holds the pointer value at the beginning of the loop.
338 SmallVector<const SCEV*, 2> Starts;
339 /// Holds the pointer value at the end of the loop.
340 SmallVector<const SCEV*, 2> Ends;
343 /// A POD for saving information about induction variables.
344 struct InductionInfo {
346 InductionInfo(Value *Start, InductionKind K):
347 StartValue(Start), IK(K) {};
348 InductionInfo(): StartValue(0), IK(NoInduction) {};
355 /// ReductionList contains the reduction descriptors for all
356 /// of the reductions that were found in the loop.
357 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
359 /// InductionList saves induction variables and maps them to the
360 /// induction descriptor.
361 typedef MapVector<PHINode*, InductionInfo> InductionList;
363 /// Returns true if it is legal to vectorize this loop.
364 /// This does not mean that it is profitable to vectorize this
365 /// loop, only that it is legal to do so.
368 /// Returns the Induction variable.
369 PHINode *getInduction() {return Induction;}
371 /// Returns the reduction variables found in the loop.
372 ReductionList *getReductionVars() { return &Reductions; }
374 /// Returns the induction variables found in the loop.
375 InductionList *getInductionVars() { return &Inductions; }
377 /// Returns True if V is an induction variable in this loop.
378 bool isInductionVariable(const Value *V);
380 /// Return true if the block BB needs to be predicated in order for the loop
381 /// to be vectorized.
382 bool blockNeedsPredication(BasicBlock *BB);
384 /// Check if this pointer is consecutive when vectorizing. This happens
385 /// when the last index of the GEP is the induction variable, or that the
386 /// pointer itself is an induction variable.
387 /// This check allows us to vectorize A[idx] into a wide load/store.
389 /// 0 - Stride is unknown or non consecutive.
390 /// 1 - Address is consecutive.
391 /// -1 - Address is consecutive, and decreasing.
392 int isConsecutivePtr(Value *Ptr);
394 /// Returns true if the value V is uniform within the loop.
395 bool isUniform(Value *V);
397 /// Returns true if this instruction will remain scalar after vectorization.
398 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
400 /// Returns the information that we collected about runtime memory check.
401 RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
403 /// Check if a single basic block loop is vectorizable.
404 /// At this point we know that this is a loop with a constant trip count
405 /// and we only need to check individual instructions.
406 bool canVectorizeInstrs();
408 /// When we vectorize loops we may change the order in which
409 /// we read and write from memory. This method checks if it is
410 /// legal to vectorize the code, considering only memory constraints.
411 /// Returns true if the loop is vectorizable.
412 bool canVectorizeMemory();
414 /// Return true if we can vectorize this loop using the IF-conversion transformation.
416 bool canVectorizeWithIfConvert();
418 /// Collect the variables that need to stay uniform after vectorization.
419 void collectLoopUniforms();
421 /// Return true if all of the instructions in the block can be speculatively executed.
423 bool blockCanBePredicated(BasicBlock *BB);
425 /// Returns True, if 'Phi' is the kind of reduction variable for type
426 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
427 bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
428 /// Returns true if the instruction I can be a reduction variable of type 'Kind'.
430 bool isReductionInstr(Instruction *I, ReductionKind Kind);
431 /// Returns the induction kind of Phi. This function may return NoInduction
432 /// if the PHI is not an induction variable.
433 InductionKind isInductionVariable(PHINode *Phi);
434 /// Return true if we can compute the address bounds of Ptr within the loop.
435 bool hasComputableBounds(Value *Ptr);
437 /// The loop that we evaluate.
441 /// DataLayout analysis.
446 // --- vectorization state --- //
448 /// Holds the integer induction variable. This is the counter of the loop.
451 /// Holds the reduction variables.
452 ReductionList Reductions;
453 /// Holds all of the induction variables that we found in the loop.
454 /// Notice that inductions don't need to start at zero and that induction
455 /// variables can be pointers.
456 InductionList Inductions;
458 /// Allowed outside users. This holds the reduction
459 /// vars which can be accessed from outside the loop.
460 SmallPtrSet<Value*, 4> AllowedExit;
461 /// This set holds the variables which are known to be uniform after vectorization.
463 SmallPtrSet<Instruction*, 4> Uniforms;
464 /// We need to check that all of the pointers in this list are disjoint.
466 RuntimePointerCheck PtrRtCheck;
469 /// LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
471 /// In many cases vectorization is not profitable. This can happen because of
472 /// a number of reasons. In this class we mainly attempt to predict the
473 /// expected speedup/slowdowns due to the supported instruction set. We use the
474 /// TargetTransformInfo to query the different backends for the cost of
475 /// different operations.
476 class LoopVectorizationCostModel {
479 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo *Li,
480 LoopVectorizationLegality *Leg,
481 const TargetTransformInfo *Tti):
482 TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), TTI(Tti) { }
484 /// \return The most profitable vectorization factor.
485 /// This method checks every power of two up to VF. If UserVF is not ZERO
486 /// then this vectorization factor will be selected if vectorization is possible.
488 unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
491 /// \return The most profitable unroll factor.
492 /// If UserUF is non-zero then this method finds the best unroll-factor
493 /// based on register pressure and other parameters.
494 unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
496 /// \brief A struct that represents some properties of the register usage of a loop.
498 struct RegisterUsage {
499 /// Holds the number of loop invariant values that are used in the loop.
500 unsigned LoopInvariantRegs;
501 /// Holds the maximum number of concurrent live intervals in the loop.
502 unsigned MaxLocalUsers;
503 /// Holds the number of instructions in the loop.
504 unsigned NumInstructions;
507 /// \return information about the register usage of the loop.
508 RegisterUsage calculateRegisterUsage();
511 /// Returns the expected execution cost. The unit of the cost does
512 /// not matter because we use the 'cost' units to compare different
513 /// vector widths. The cost that is returned is *not* normalized by
514 /// the factor width.
515 unsigned expectedCost(unsigned VF);
517 /// Returns the execution time cost of an instruction for a given vector
518 /// width. Vector width of one means scalar.
519 unsigned getInstructionCost(Instruction *I, unsigned VF);
521 /// A helper function for converting Scalar types to vector types.
522 /// If the incoming type is void, we return void. If the VF is 1, we return the scalar type.
524 static Type* ToVectorTy(Type *Scalar, unsigned VF);
526 /// The loop that we evaluate.
530 /// Loop Info analysis.
532 /// Vectorization legality.
533 LoopVectorizationLegality *Legal;
534 /// Vector target information.
535 const TargetTransformInfo *TTI;
540 #endif //LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H