//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<unsigned>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));
namespace {

// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the LLVM benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;
    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}
/// \returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}
/// \returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

/// \returns ShuffleVector instruction if instructions in \p VL have
/// alternate fadd,fsub / fsub,fadd / add,sub / sub,add sequence.
/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
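/// Illustrative example: the bundle {fadd, fsub, fadd, fsub} matches the
/// alternating pattern, so it can later be emitted as one vector fadd, one
/// vector fsub, and a shufflevector selecting the even lanes of the first
/// result and the odd lanes of the second.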
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
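/// Illustrative example: if every scalar in the bundle is an 'add nsw', the
/// vector add keeps the nsw flag; if even one scalar lacks nsw, the
/// intersection drops it from the vector instruction.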
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}
/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int i = 1, e = VL.size(); MD && i != e; i++) {
      Instruction *I = cast<Instruction>(VL[i]);
      MDNode *IMD = I->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;
  return Ty;
}
/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
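/// Illustrative example: the bundle {extractelement %v, 0; extractelement
/// %v, 1; extractelement %v, 2; extractelement %v, 3} extracts lanes 0..3 of
/// the same source, so the tree can simply reuse %v instead of rebuilding it
/// with a gather sequence.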
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1))
      return (CI->getArgOperand(1) == Scalar);
    return false;
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }
  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();
  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();
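  // Illustrative example (hypothetical numbers): if four scalar loads cost 4
  // and the replacing <4 x i32> load costs 1, the tree cost is -3 before
  // extract and spill costs are added, so vectorization proceeds.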
  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }
  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();
  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;
  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This can happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);
  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree();
  /// Reorder commutative operands in an alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(false) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser (Value *S, llvm::User *U, int L) :
      Scalar(S), User(U), Lane(L){};
    // Which scalar in our function.
    Value *Scalar;
    // The user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;
  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {

    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }
  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }
  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
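  /// Illustrative example: when four scalar adds form a bundle, the first
  /// ScheduleData is the bundle head (its FirstInBundle points to itself)
  /// and the remaining three are chained through NextInBundle.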
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }
    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }
    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }
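    // Illustrative example: for a two-instruction bundle where each member
    // has one unscheduled operand, UnscheduledDepsInBundle starts at 2; each
    // operand that gets scheduled decrements it by one, and the bundle
    // becomes ready only when it reaches zero.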
    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }
    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;
    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}
    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    void extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;
    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;
    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };
    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;
  // Number of load-bundles, which contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load-bundles of size 2, which are consecutive loads if reversed.
  int NumLoadsWantToChangeOrder;
  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;
};

raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}

} // end anonymous namespace
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);
  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }
  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }
  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }
  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }
  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar then
  // we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }
  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }
  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
  case Instruction::PHI: {
    PHINode *PH = dyn_cast<PHINode>(VL0);

    // Check for terminator values (e.g. invoke).
    for (unsigned j = 0; j < VL.size(); ++j)
      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        TerminatorInst *Term = dyn_cast<TerminatorInst>(
            cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
        if (Term) {
          DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
            PH->getIncomingBlock(i)));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ExtractElement: {
    bool Reuse = CanReuseExtract(VL);
    if (Reuse) {
      DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
    } else {
      BS.cancelScheduling(VL);
    }
    newTreeEntry(VL, Reuse);
    return;
  }
  case Instruction::Load: {
    // Check if the loads are consecutive or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
      LoadInst *L = cast<LoadInst>(VL[i]);
      if (!L->isSimple()) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
        return;
      }
      const DataLayout &DL = F->getParent()->getDataLayout();
      if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
        if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
          ++NumLoadsWantToChangeOrder;
        }
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
        return;
      }
    }
    ++NumLoadsWantToKeepOrder;
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of loads.\n");
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (unsigned i = 0; i < VL.size(); ++i) {
      Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
        return;
      }
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of casts.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
      CmpInst *Cmp = cast<CmpInst>(VL[i]);
      if (Cmp->getPredicate() != P0 ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of compares.\n");

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

    // Sort operands of the instructions so that each side is more likely to
    // have the same opcode.
    if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
      ValueList Left, Right;
      reorderInputsAccordingToOpcode(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth+1);
    }
    return;
  }
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (unsigned j = 0; j < VL.size(); ++j) {
      if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
    for (unsigned j = 0; j < VL.size(); ++j) {
      Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
      if (Ty0 != CurTy) {
        DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    for (unsigned j = 0; j < VL.size(); ++j) {
      auto Op = cast<Instruction>(VL[j])->getOperand(1);
      if (!isa<ConstantInt>(Op)) {
        DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        return;
      }
    }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
    for (unsigned i = 0, e = 2; i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::Store: {
    const DataLayout &DL = F->getParent()->getDataLayout();
    // Check if the stores are consecutive or if we need to swizzle them.
    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
      if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
        return;
      }

    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a vector of stores.\n");

    ValueList Operands;
    for (unsigned j = 0; j < VL.size(); ++j)
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));

    buildTree_rec(Operands, Depth + 1);
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic.
    CallInst *CI = cast<CallInst>(VL[0]);
    // Check if this is an Intrinsic call or something that can be
    // represented by an intrinsic call
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (!isTriviallyVectorizable(ID)) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return;
    }
    Function *Int = CI->getCalledFunction();
    Value *A1I = nullptr;
    if (hasVectorInstrinsicScalarOpd(ID, 1))
      A1I = CI->getArgOperand(1);
    for (unsigned i = 1, e = VL.size(); i != e; ++i) {
      CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
      if (!CI2 || CI2->getCalledFunction() != Int ||
          getIntrinsicIDForCall(CI2, TLI) != ID) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                     << "\n");
        return;
      }
      // ctlz,cttz and powi are special intrinsics whose second argument
      // should be same in order for them to be vectorized.
      if (hasVectorInstrinsicScalarOpd(ID, 1)) {
        Value *A1J = CI2->getArgOperand(1);
        if (A1I != A1J) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                       << " argument "<< A1I<<"!=" << A1J
                       << "\n");
          return;
        }
      }
    }

    newTreeEntry(VL, true);
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
        Operands.push_back(CI2->getArgOperand(i));
      }
      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcode like add-sub
    // then do not vectorize this instruction.
    if (!isAltShuffle) {
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return;
    }
    newTreeEntry(VL, true);
    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

    // Reorder operands if reordering would enable vectorization.
    if (isa<BinaryOperator>(VL0)) {
      ValueList Left, Right;
      reorderAltShuffleOperands(VL, Left, Right);
      buildTree_rec(Left, Depth + 1);
      buildTree_rec(Right, Depth + 1);
      return;
    }

    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      buildTree_rec(Operands, Depth + 1);
    }
    return;
  }
  default:
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return;
  }
}
int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value*> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL)) {
      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    }
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  Instruction *VL0 = cast<Instruction>(VL[0]);
  switch (Opcode) {
  case Instruction::PHI: {
    return 0;
  }
  case Instruction::ExtractElement: {
    if (CanReuseExtract(VL)) {
      int DeadCost = 0;
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
        ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
        if (E->hasOneUse())
          // Take credit for instruction that will become dead.
          DeadCost +=
              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
      }
      return -DeadCost;
    }
    return getGatherCost(VecTy);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();

    // Calculate the cost of this instruction.
    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                       VL0->getType(), SrcTy);

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
    return VecCost - ScalarCost;
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Calculate the cost of this instruction.
    int ScalarCost = 0;
    int VecCost = 0;
    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
        Opcode == Instruction::Select) {
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      ScalarCost = VecTy->getNumElements() *
          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
    } else {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand.
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
      TargetTransformInfo::OperandValueProperties Op2VP =
          TargetTransformInfo::OP_None;

      // If all operands are exactly the same ConstantInt then set the
      // operand kind to OK_UniformConstantValue.
      // If instead not all operands are constants, then set the operand kind
      // to OK_AnyValue. If all operands are constants but not the same,
      // then set the operand kind to OK_NonUniformConstantValue.
      ConstantInt *CInt = nullptr;
      for (unsigned i = 0; i < VL.size(); ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        if (!isa<ConstantInt>(I->getOperand(1))) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          break;
        }
        if (i == 0) {
          CInt = cast<ConstantInt>(I->getOperand(1));
          continue;
        }
        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
            CInt != cast<ConstantInt>(I->getOperand(1)))
          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      }
      // FIXME: Currently cost model modification for division by
      // power of 2 is handled only for X86. Add support for other targets.
      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
          CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;

      ScalarCost = VecTy->getNumElements() *
                   TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
                                               Op1VP, Op2VP);
      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
                                            Op1VP, Op2VP);
    }
    return VecCost - ScalarCost;
  }
  case Instruction::GetElementPtr: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_UniformConstantValue;

    int ScalarCost =
        VecTy->getNumElements() *
        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
    int VecCost =
        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

    return VecCost - ScalarCost;
  }
  case Instruction::Load: {
    // Cost of wide load - cost of scalar loads.
    int ScalarLdCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
    return VecLdCost - ScalarLdCost;
  }
  case Instruction::Store: {
    // We know that we can merge the stores. Calculate the cost.
    int ScalarStCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
    return VecStCost - ScalarStCost;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

    // Calculate the cost of the scalar and vector calls.
    SmallVector<Type*, 4> ScalarTys, VecTys;
    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
      ScalarTys.push_back(CI->getArgOperand(op)->getType());
      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
                                       VecTy->getNumElements()));
    }

    int ScalarCallCost = VecTy->getNumElements() *
        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);

    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);

    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
          << " for " << *CI << "\n");

    return VecCallCost - ScalarCallCost;
  }
  case Instruction::ShuffleVector: {
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
    int ScalarCost = 0;
    int VecCost = 0;
    for (unsigned i = 0; i < VL.size(); ++i) {
      Instruction *I = cast<Instruction>(VL[i]);
      if (!I)
        break;
      ScalarCost +=
          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
    }
    // VecCost is equal to sum of the cost of creating 2 vectors
    // and the cost of creating shuffle.
    Instruction *I0 = cast<Instruction>(VL[0]);
    VecCost =
        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
    Instruction *I1 = cast<Instruction>(VL[1]);
    VecCost +=
        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
    VecCost +=
        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
    return VecCost - ScalarCost;
  }
  default:
    llvm_unreachable("Unknown instruction");
  }
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
        VectorizableTree.size() << " is fully vectorizable.\n");

  // We only handle trees of height 2.
  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores.
  if (!VectorizableTree[0].NeedToGather &&
      (allConstant(VectorizableTree[1].Scalars) ||
       isSplat(VectorizableTree[1].Scalars)))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
    return false;

  return true;
}
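
// Illustrative example for the spill cost computed below: if a call to an
// external function sits between the loads and the stores of a vectorized
// tree, the wide values that are live across the call may have to be spilled
// and refilled, and TTI charges that against the vectorization profit.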
int BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  int Cost = 0;

  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
    Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
      );

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    // Now find the sequence of instructions between PrevInst and Inst.
    BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
    --PrevInstIt;
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
  return Cost;
}
int BoUpSLP::getTreeCost() {
  int Cost = 0;
  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
        VectorizableTree.size() << ".\n");

  // We only vectorize tiny trees if they are fully vectorizable.
  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
    if (VectorizableTree.empty()) {
      assert(!ExternalUses.size() && "We should not have any external users");
    }
    return INT_MAX;
  }

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
    int C = getEntryCost(&VectorizableTree[i]);
    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
                 << *VectorizableTree[i].Scalars[0] << ".\n");
    Cost += C;
  }

  SmallSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
       I != E; ++I) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(I->Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(I->User))
      continue;

    VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                           I->Lane);
  }

  Cost += getSpillCost();

  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
  return Cost + ExtractCost;
}
int BoUpSLP::getGatherCost(Type *Ty) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  return Cost;
}
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  return getGatherCost(VecTy);
}
Value *BoUpSLP::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}
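
// Illustrative example for the check below: for two i32 loads from %p and
// %p + 4 bytes, Size is 4 (the store size of i32) and OffsetDelta is 4, so
// the accesses are recognized as consecutive without consulting SCEV; SCEV
// is only needed when the two base pointers differ.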
1800 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
1801 Value *PtrA = getPointerOperand(A);
1802 Value *PtrB = getPointerOperand(B);
1803 unsigned ASA = getAddressSpaceOperand(A);
1804 unsigned ASB = getAddressSpaceOperand(B);
1806 // Check that the address spaces match and that the pointers are valid.
1807 if (!PtrA || !PtrB || (ASA != ASB))
1808 return false;
1810 // Make sure that A and B are different pointers of the same type.
1811 if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1812 return false;
1814 unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
1815 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1816 APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
1818 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1819 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1820 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1822 APInt OffsetDelta = OffsetB - OffsetA;
1824 // Check if they are based on the same pointer. That makes the offsets
1825 // simpler.
1826 if (PtrA == PtrB)
1827 return OffsetDelta == Size;
1829 // Compute the necessary base pointer delta to have the necessary final delta
1830 // equal to the size.
1831 APInt BaseDelta = Size - OffsetDelta;
1833 // Otherwise compute the distance with SCEV between the base pointers.
1834 const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1835 const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1836 const SCEV *C = SE->getConstant(BaseDelta);
1837 const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1838 return X == PtrSCEVB;
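// Illustrative example (hypothetical IR, not from the original source):
//   %p0 = getelementptr inbounds i32, i32* %base, i64 0
//   %p1 = getelementptr inbounds i32, i32* %base, i64 1
// Both pointers strip to %base with OffsetA = 0 and OffsetB = 4, so
// OffsetDelta equals the 4-byte store size of i32 and the accesses are
// consecutive. The SCEV fallback covers bases whose constant distance is
// only visible to ScalarEvolution.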
1841 // Reorder commutative operations in alternate shuffle if the resulting vectors
1842 // are consecutive loads. This would allow us to vectorize the tree.
1843 // If we have something like-
1844 // load a[0] - load b[0]
1845 // load b[1] + load a[1]
1846 // load a[2] - load b[2]
1847 // load a[3] + load b[3]
1848 // Reordering the second load b[1] load a[1] would allow us to vectorize this
1849 // tree.
1850 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1851 SmallVectorImpl<Value *> &Left,
1852 SmallVectorImpl<Value *> &Right) {
1853 const DataLayout &DL = F->getParent()->getDataLayout();
1855 // Push left and right operands of binary operation into Left and Right
1856 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1857 Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1858 Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1861 // Reorder if we have a commutative operation and consecutive accesses
1862 // are on either side of the alternate instructions.
1863 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1864 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1865 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1866 Instruction *VL1 = cast<Instruction>(VL[j]);
1867 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1868 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1869 std::swap(Left[j], Right[j]);
1870 continue;
1871 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1872 std::swap(Left[j + 1], Right[j + 1]);
1873 continue;
1874 }
1878 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1879 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1880 Instruction *VL1 = cast<Instruction>(VL[j]);
1881 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1882 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1883 std::swap(Left[j], Right[j]);
1884 continue;
1885 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1886 std::swap(Left[j + 1], Right[j + 1]);
1887 continue;
1888 }
1895 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1896 SmallVectorImpl<Value *> &Left,
1897 SmallVectorImpl<Value *> &Right) {
1899 SmallVector<Value *, 16> OrigLeft, OrigRight;
1901 bool AllSameOpcodeLeft = true;
1902 bool AllSameOpcodeRight = true;
1903 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1904 Instruction *I = cast<Instruction>(VL[i]);
1905 Value *VLeft = I->getOperand(0);
1906 Value *VRight = I->getOperand(1);
1908 OrigLeft.push_back(VLeft);
1909 OrigRight.push_back(VRight);
1911 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1912 Instruction *IRight = dyn_cast<Instruction>(VRight);
1914 // Check whether all operands on one side have the same opcode. In this case
1915 // we want to preserve the original order and not make things worse by
1916 // reordering.
1917 if (i && AllSameOpcodeLeft && ILeft) {
1918 if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1919 if (PLeft->getOpcode() != ILeft->getOpcode())
1920 AllSameOpcodeLeft = false;
1921 } else
1922 AllSameOpcodeLeft = false;
1924 if (i && AllSameOpcodeRight && IRight) {
1925 if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1926 if (PRight->getOpcode() != IRight->getOpcode())
1927 AllSameOpcodeRight = false;
1928 } else
1929 AllSameOpcodeRight = false;
1932 // Sort two opcodes. In the code below we try to preserve the ability to use
1933 // broadcast of values instead of individual inserts.
1934 // vl1 = load
1935 // vl2 = phi
1936 // vr1 = load
1937 // vr2 = vr1
1938 //         = load load
1939 //         = add add
1940 // If we just sorted according to opcode we would leave the first line
1941 // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1942 //         = load load
1943 //         = phi  load
1944 // Because vr2 and vr1 are from the same load we lose the opportunity of a
1945 // broadcast for the packed right side in the backend: we have [vr1, vl2]
1946 // instead of [vr1, vr2=vr1].
1947 if (ILeft && IRight) {
1948 if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1949 Left.push_back(IRight);
1950 Right.push_back(ILeft);
1951 } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1952 Right[i - 1] != IRight) {
1953 // Try not to destroy a broadcast for no apparent benefit.
1954 Left.push_back(IRight);
1955 Right.push_back(ILeft);
1956 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1957 Right[i - 1] == ILeft) {
1958 // Try to preserve broadcasts.
1959 Left.push_back(IRight);
1960 Right.push_back(ILeft);
1961 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1962 Left[i - 1] == IRight) {
1963 // Try to preserve broadcasts.
1964 Left.push_back(IRight);
1965 Right.push_back(ILeft);
1967 Left.push_back(ILeft);
1968 Right.push_back(IRight);
1972 // One opcode, put the instruction on the right.
1973 if (ILeft) {
1974 Left.push_back(VRight);
1975 Right.push_back(ILeft);
1976 continue;
1977 }
1978 Left.push_back(VLeft);
1979 Right.push_back(VRight);
1982 bool LeftBroadcast = isSplat(Left);
1983 bool RightBroadcast = isSplat(Right);
1985 // If operands end up being broadcast return this operand order.
1986 if (LeftBroadcast || RightBroadcast)
1987 return;
1989 // Don't reorder if the operands were good to begin with.
1990 if (AllSameOpcodeRight || AllSameOpcodeLeft) {
1991 Left = OrigLeft;
1992 Right = OrigRight;
1993 }
1995 const DataLayout &DL = F->getParent()->getDataLayout();
1997 // Finally check if we can get longer vectorizable chain by reordering
1998 // without breaking the good operand order detected above.
1999 // E.g. If we have something like-
2000 // load a[0] load b[0]
2001 // load b[1] load a[1]
2002 // load a[2] load b[2]
2003 // load a[3] load b[3]
2004 // Reordering the second load b[1] load a[1] would allow us to vectorize
2005 // this code and we still retain AllSameOpcode property.
2006 // FIXME: This load reordering might break AllSameOpcode in some rare cases
2007 // such as-
2008 // add a[0],c[0]  load b[0]
2009 // add a[1],c[2]  load b[1]
2010 // b[2]           load b[2]
2011 // add a[3],c[3]  load b[3]
2012 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2013 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2014 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2015 if (isConsecutiveAccess(L, L1, DL)) {
2016 std::swap(Left[j + 1], Right[j + 1]);
2021 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2022 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2023 if (isConsecutiveAccess(L, L1, DL)) {
2024 std::swap(Left[j + 1], Right[j + 1]);
2033 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2034 Instruction *VL0 = cast<Instruction>(VL[0]);
2035 BasicBlock::iterator NextInst = VL0;
2036 ++NextInst;
2037 Builder.SetInsertPoint(VL0->getParent(), NextInst);
2038 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2041 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2042 Value *Vec = UndefValue::get(Ty);
2043 // Generate the 'InsertElement' instruction.
2044 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2045 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2046 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2047 GatherSeq.insert(Insrt);
2048 CSEBlocks.insert(Insrt->getParent());
2050 // Add to our 'need-to-extract' list.
2051 if (ScalarToTreeEntry.count(VL[i])) {
2052 int Idx = ScalarToTreeEntry[VL[i]];
2053 TreeEntry *E = &VectorizableTree[Idx];
2054 // Find which lane we need to extract.
2055 int FoundLane = -1;
2056 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2057 // Is this the lane of the scalar that we are looking for?
2058 if (E->Scalars[Lane] == VL[i]) {
2059 FoundLane = Lane;
2060 break;
2061 }
2062 }
2063 assert(FoundLane >= 0 && "Could not find the correct lane");
2064 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
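// The emitted gather is a chain of insertelements, e.g. (illustrative):
//   %g0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %g1 = insertelement <4 x i32> %g0, i32 %s1, i32 1
//   %g2 = insertelement <4 x i32> %g1, i32 %s2, i32 2
//   %g3 = insertelement <4 x i32> %g2, i32 %s3, i32 3
// Each insert is recorded in GatherSeq and CSEBlocks so that
// optimizeGatherSequence() can later hoist or CSE the sequence.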
2072 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2073 SmallDenseMap<Value*, int>::const_iterator Entry
2074 = ScalarToTreeEntry.find(VL[0]);
2075 if (Entry != ScalarToTreeEntry.end()) {
2076 int Idx = Entry->second;
2077 const TreeEntry *En = &VectorizableTree[Idx];
2078 if (En->isSame(VL) && En->VectorizedValue)
2079 return En->VectorizedValue;
2080 }
2081 return nullptr;
2082 }
2084 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2085 if (ScalarToTreeEntry.count(VL[0])) {
2086 int Idx = ScalarToTreeEntry[VL[0]];
2087 TreeEntry *E = &VectorizableTree[Idx];
2088 if (E->isSame(VL))
2089 return vectorizeTree(E);
2090 }
2092 Type *ScalarTy = VL[0]->getType();
2093 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2094 ScalarTy = SI->getValueOperand()->getType();
2095 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2097 return Gather(VL, VecTy);
2100 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2101 IRBuilder<>::InsertPointGuard Guard(Builder);
2103 if (E->VectorizedValue) {
2104 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2105 return E->VectorizedValue;
2108 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2109 Type *ScalarTy = VL0->getType();
2110 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2111 ScalarTy = SI->getValueOperand()->getType();
2112 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2114 if (E->NeedToGather) {
2115 setInsertPointAfterBundle(E->Scalars);
2116 return Gather(E->Scalars, VecTy);
2119 const DataLayout &DL = F->getParent()->getDataLayout();
2120 unsigned Opcode = getSameOpcode(E->Scalars);
2122 switch (Opcode) {
2123 case Instruction::PHI: {
2124 PHINode *PH = dyn_cast<PHINode>(VL0);
2125 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2126 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2127 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2128 E->VectorizedValue = NewPhi;
2130 // PHINodes may have multiple entries from the same block. We want to
2131 // visit every block once.
2132 SmallSet<BasicBlock*, 4> VisitedBBs;
2134 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2135 ValueList Operands;
2136 BasicBlock *IBB = PH->getIncomingBlock(i);
2138 if (!VisitedBBs.insert(IBB).second) {
2139 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2140 continue;
2141 }
2143 // Prepare the operand vector.
2144 for (Value *V : E->Scalars)
2145 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2147 Builder.SetInsertPoint(IBB->getTerminator());
2148 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2149 Value *Vec = vectorizeTree(Operands);
2150 NewPhi->addIncoming(Vec, IBB);
2153 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2154 "Invalid number of incoming values");
2158 case Instruction::ExtractElement: {
2159 if (CanReuseExtract(E->Scalars)) {
2160 Value *V = VL0->getOperand(0);
2161 E->VectorizedValue = V;
2162 return V;
2163 }
2164 return Gather(E->Scalars, VecTy);
2166 case Instruction::ZExt:
2167 case Instruction::SExt:
2168 case Instruction::FPToUI:
2169 case Instruction::FPToSI:
2170 case Instruction::FPExt:
2171 case Instruction::PtrToInt:
2172 case Instruction::IntToPtr:
2173 case Instruction::SIToFP:
2174 case Instruction::UIToFP:
2175 case Instruction::Trunc:
2176 case Instruction::FPTrunc:
2177 case Instruction::BitCast: {
2179 for (Value *V : E->Scalars)
2180 INVL.push_back(cast<Instruction>(V)->getOperand(0));
2182 setInsertPointAfterBundle(E->Scalars);
2184 Value *InVec = vectorizeTree(INVL);
2186 if (Value *V = alreadyVectorized(E->Scalars))
2187 return V;
2189 CastInst *CI = dyn_cast<CastInst>(VL0);
2190 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2191 E->VectorizedValue = V;
2192 ++NumVectorInstructions;
2193 return V;
2194 }
2195 case Instruction::FCmp:
2196 case Instruction::ICmp: {
2197 ValueList LHSV, RHSV;
2198 for (Value *V : E->Scalars) {
2199 LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2200 RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2203 setInsertPointAfterBundle(E->Scalars);
2205 Value *L = vectorizeTree(LHSV);
2206 Value *R = vectorizeTree(RHSV);
2208 if (Value *V = alreadyVectorized(E->Scalars))
2209 return V;
2211 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2212 Value *V;
2213 if (Opcode == Instruction::FCmp)
2214 V = Builder.CreateFCmp(P0, L, R);
2215 else
2216 V = Builder.CreateICmp(P0, L, R);
2218 E->VectorizedValue = V;
2219 ++NumVectorInstructions;
2220 return V;
2221 }
2222 case Instruction::Select: {
2223 ValueList TrueVec, FalseVec, CondVec;
2224 for (Value *V : E->Scalars) {
2225 CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2226 TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2227 FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2230 setInsertPointAfterBundle(E->Scalars);
2232 Value *Cond = vectorizeTree(CondVec);
2233 Value *True = vectorizeTree(TrueVec);
2234 Value *False = vectorizeTree(FalseVec);
2236 if (Value *V = alreadyVectorized(E->Scalars))
2237 return V;
2239 Value *V = Builder.CreateSelect(Cond, True, False);
2240 E->VectorizedValue = V;
2241 ++NumVectorInstructions;
2242 return V;
2243 }
2244 case Instruction::Add:
2245 case Instruction::FAdd:
2246 case Instruction::Sub:
2247 case Instruction::FSub:
2248 case Instruction::Mul:
2249 case Instruction::FMul:
2250 case Instruction::UDiv:
2251 case Instruction::SDiv:
2252 case Instruction::FDiv:
2253 case Instruction::URem:
2254 case Instruction::SRem:
2255 case Instruction::FRem:
2256 case Instruction::Shl:
2257 case Instruction::LShr:
2258 case Instruction::AShr:
2259 case Instruction::And:
2260 case Instruction::Or:
2261 case Instruction::Xor: {
2262 ValueList LHSVL, RHSVL;
2263 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2264 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2265 else
2266 for (Value *V : E->Scalars) {
2267 LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2268 RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2271 setInsertPointAfterBundle(E->Scalars);
2273 Value *LHS = vectorizeTree(LHSVL);
2274 Value *RHS = vectorizeTree(RHSVL);
2276 if (LHS == RHS && isa<Instruction>(LHS)) {
2277 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2280 if (Value *V = alreadyVectorized(E->Scalars))
2281 return V;
2283 BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2284 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2285 E->VectorizedValue = V;
2286 propagateIRFlags(E->VectorizedValue, E->Scalars);
2287 ++NumVectorInstructions;
2289 if (Instruction *I = dyn_cast<Instruction>(V))
2290 return propagateMetadata(I, E->Scalars);
2292 return V;
2293 }
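// A note on flag propagation (sketch): propagateIRFlags intersects the IR
// flags of all scalars in the bundle, so four 'add nsw' instructions produce
// a vector 'add nsw', while a bundle mixing 'add nsw' with a plain 'add'
// drops the nsw flag from the vector instruction.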
2294 case Instruction::Load: {
2295 // Loads are inserted at the head of the tree because we don't want to
2296 // sink them all the way down past store instructions.
2297 setInsertPointAfterBundle(E->Scalars);
2299 LoadInst *LI = cast<LoadInst>(VL0);
2300 Type *ScalarLoadTy = LI->getType();
2301 unsigned AS = LI->getPointerAddressSpace();
2303 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2304 VecTy->getPointerTo(AS));
2306 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2307 // ExternalUses list to make sure that an extract will be generated in the
2308 // future.
2309 if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2310 ExternalUses.push_back(
2311 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2313 unsigned Alignment = LI->getAlignment();
2314 LI = Builder.CreateLoad(VecPtr);
2315 if (!Alignment) {
2316 Alignment = DL.getABITypeAlignment(ScalarLoadTy);
2317 }
2318 LI->setAlignment(Alignment);
2319 E->VectorizedValue = LI;
2320 ++NumVectorInstructions;
2321 return propagateMetadata(LI, E->Scalars);
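// The emitted sequence for a consecutive-load bundle looks like
// (illustrative):
//   %vp = bitcast i32* %p0 to <4 x i32>*
//   %vl = load <4 x i32>, <4 x i32>* %vp, align 4
// i.e. one wide load through a bitcast of the first scalar's pointer, with
// metadata such as !tbaa merged from the scalars by propagateMetadata.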
2323 case Instruction::Store: {
2324 StoreInst *SI = cast<StoreInst>(VL0);
2325 unsigned Alignment = SI->getAlignment();
2326 unsigned AS = SI->getPointerAddressSpace();
2328 ValueList ValueOp;
2329 for (Value *V : E->Scalars)
2330 ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2332 setInsertPointAfterBundle(E->Scalars);
2334 Value *VecValue = vectorizeTree(ValueOp);
2335 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2336 VecTy->getPointerTo(AS));
2337 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2339 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2340 // ExternalUses list to make sure that an extract will be generated in the
2341 // future.
2342 if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2343 ExternalUses.push_back(
2344 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2346 if (!Alignment) {
2347 Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
2348 }
2349 S->setAlignment(Alignment);
2350 E->VectorizedValue = S;
2351 ++NumVectorInstructions;
2352 return propagateMetadata(S, E->Scalars);
2354 case Instruction::GetElementPtr: {
2355 setInsertPointAfterBundle(E->Scalars);
2357 ValueList Op0VL;
2358 for (Value *V : E->Scalars)
2359 Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2361 Value *Op0 = vectorizeTree(Op0VL);
2363 std::vector<Value *> OpVecs;
2364 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2365 ++j) {
2366 ValueList OpVL;
2367 for (Value *V : E->Scalars)
2368 OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2370 Value *OpVec = vectorizeTree(OpVL);
2371 OpVecs.push_back(OpVec);
2374 Value *V = Builder.CreateGEP(
2375 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2376 E->VectorizedValue = V;
2377 ++NumVectorInstructions;
2379 if (Instruction *I = dyn_cast<Instruction>(V))
2380 return propagateMetadata(I, E->Scalars);
2382 return V;
2383 }
2384 case Instruction::Call: {
2385 CallInst *CI = cast<CallInst>(VL0);
2386 setInsertPointAfterBundle(E->Scalars);
2387 Function *FI;
2388 Intrinsic::ID IID = Intrinsic::not_intrinsic;
2389 Value *ScalarArg = nullptr;
2390 if (CI && (FI = CI->getCalledFunction())) {
2391 IID = FI->getIntrinsicID();
2393 std::vector<Value *> OpVecs;
2394 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2395 ValueList OpVL;
2396 // ctlz,cttz and powi are special intrinsics whose second argument is
2397 // a scalar. This argument should not be vectorized.
2398 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2399 CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2400 ScalarArg = CEI->getArgOperand(j);
2401 OpVecs.push_back(CEI->getArgOperand(j));
2402 continue;
2403 }
2404 for (Value *V : E->Scalars) {
2405 CallInst *CEI = cast<CallInst>(V);
2406 OpVL.push_back(CEI->getArgOperand(j));
2409 Value *OpVec = vectorizeTree(OpVL);
2410 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2411 OpVecs.push_back(OpVec);
2414 Module *M = F->getParent();
2415 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2416 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2417 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2418 Value *V = Builder.CreateCall(CF, OpVecs);
2420 // The scalar argument uses an in-tree scalar so we add the new vectorized
2421 // call to ExternalUses list to make sure that an extract will be
2422 // generated in the future.
2423 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2424 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2426 E->VectorizedValue = V;
2427 ++NumVectorInstructions;
2428 return V;
2429 }
2430 case Instruction::ShuffleVector: {
2431 ValueList LHSVL, RHSVL;
2432 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2433 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2434 setInsertPointAfterBundle(E->Scalars);
2436 Value *LHS = vectorizeTree(LHSVL);
2437 Value *RHS = vectorizeTree(RHSVL);
2439 if (Value *V = alreadyVectorized(E->Scalars))
2440 return V;
2442 // Create a vector of LHS op1 RHS
2443 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2444 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2446 // Create a vector of LHS op2 RHS
2447 Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2448 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2449 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2451 // Create shuffle to take alternate operations from the vector.
2452 // Also, gather up odd and even scalar ops to propagate IR flags to
2453 // each vector operation.
2454 ValueList OddScalars, EvenScalars;
2455 unsigned e = E->Scalars.size();
2456 SmallVector<Constant *, 8> Mask(e);
2457 for (unsigned i = 0; i < e; ++i) {
2458 if (i & 1) {
2459 Mask[i] = Builder.getInt32(e + i);
2460 OddScalars.push_back(E->Scalars[i]);
2461 } else {
2462 Mask[i] = Builder.getInt32(i);
2463 EvenScalars.push_back(E->Scalars[i]);
2464 }
2465 }
2467 Value *ShuffleMask = ConstantVector::get(Mask);
2468 propagateIRFlags(V0, EvenScalars);
2469 propagateIRFlags(V1, OddScalars);
2471 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2472 E->VectorizedValue = V;
2473 ++NumVectorInstructions;
2474 if (Instruction *I = dyn_cast<Instruction>(V))
2475 return propagateMetadata(I, E->Scalars);
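// For a 4-wide add/sub bundle the result is assembled as (illustrative):
//   %v0 = add <4 x i32> %L, %R
//   %v1 = sub <4 x i32> %L, %R
//   %r  = shufflevector <4 x i32> %v0, <4 x i32> %v1,
//                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>
// Even lanes are taken from the first opcode, odd lanes from the second.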
2479 default:
2480 llvm_unreachable("unknown inst");
2481 }
2482 return nullptr;
2483 }
2485 Value *BoUpSLP::vectorizeTree() {
2487 // All blocks must be scheduled before any instructions are inserted.
2488 for (auto &BSIter : BlocksSchedules) {
2489 scheduleBlock(BSIter.second.get());
2492 Builder.SetInsertPoint(F->getEntryBlock().begin());
2493 vectorizeTree(&VectorizableTree[0]);
2495 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2497 // Extract all of the elements with the external uses.
2498 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2499 it != e; ++it) {
2500 Value *Scalar = it->Scalar;
2501 llvm::User *User = it->User;
2503 // Skip users that we already RAUW. This happens when one instruction
2504 // has multiple uses of the same value.
2505 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2506 Scalar->user_end())
2507 continue;
2508 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2510 int Idx = ScalarToTreeEntry[Scalar];
2511 TreeEntry *E = &VectorizableTree[Idx];
2512 assert(!E->NeedToGather && "Extracting from a gather list");
2514 Value *Vec = E->VectorizedValue;
2515 assert(Vec && "Can't find vectorizable value");
2517 Value *Lane = Builder.getInt32(it->Lane);
2518 // Generate extracts for out-of-tree users.
2519 // Find the insertion point for the extractelement lane.
2520 if (isa<Instruction>(Vec)) {
2521 if (PHINode *PH = dyn_cast<PHINode>(User)) {
2522 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2523 if (PH->getIncomingValue(i) == Scalar) {
2524 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2525 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2526 CSEBlocks.insert(PH->getIncomingBlock(i));
2527 PH->setOperand(i, Ex);
2528 }
2529 }
2530 } else {
2531 Builder.SetInsertPoint(cast<Instruction>(User));
2532 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2533 CSEBlocks.insert(cast<Instruction>(User)->getParent());
2534 User->replaceUsesOfWith(Scalar, Ex);
2535 }
2536 } else {
2537 Builder.SetInsertPoint(F->getEntryBlock().begin());
2538 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2539 CSEBlocks.insert(&F->getEntryBlock());
2540 User->replaceUsesOfWith(Scalar, Ex);
2543 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2546 // For each vectorized value:
2547 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2548 TreeEntry *Entry = &VectorizableTree[EIdx];
2551 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2552 Value *Scalar = Entry->Scalars[Lane];
2553 // No need to handle users of gathered values.
2554 if (Entry->NeedToGather)
2555 continue;
2557 assert(Entry->VectorizedValue && "Can't find vectorizable value");
2559 Type *Ty = Scalar->getType();
2560 if (!Ty->isVoidTy()) {
2562 for (User *U : Scalar->users()) {
2563 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2565 assert((ScalarToTreeEntry.count(U) ||
2566 // It is legal to replace users in the ignorelist by undef.
2567 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2568 UserIgnoreList.end())) &&
2569 "Replacing out-of-tree value with undef");
2572 Value *Undef = UndefValue::get(Ty);
2573 Scalar->replaceAllUsesWith(Undef);
2575 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2576 eraseInstruction(cast<Instruction>(Scalar));
2580 Builder.ClearInsertionPoint();
2582 return VectorizableTree[0].VectorizedValue;
2585 void BoUpSLP::optimizeGatherSequence() {
2586 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2587 << " gather sequence instructions.\n");
2588 // LICM InsertElementInst sequences.
2589 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2590 e = GatherSeq.end(); it != e; ++it) {
2591 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2592 if (!Insert)
2593 continue;
2596 // Check if this block is inside a loop.
2597 Loop *L = LI->getLoopFor(Insert->getParent());
2598 if (!L)
2599 continue;
2601 // Check if it has a preheader.
2602 BasicBlock *PreHeader = L->getLoopPreheader();
2603 if (!PreHeader)
2604 continue;
2606 // If the vector or the element that we insert into it are
2607 // instructions that are defined in this basic block then we can't
2608 // hoist this instruction.
2609 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2610 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2611 if (CurrVec && L->contains(CurrVec))
2612 continue;
2613 if (NewElem && L->contains(NewElem))
2614 continue;
2616 // We can hoist this instruction. Move it to the pre-header.
2617 Insert->moveBefore(PreHeader->getTerminator());
2620 // Make a list of all reachable blocks in our CSE queue.
2621 SmallVector<const DomTreeNode *, 8> CSEWorkList;
2622 CSEWorkList.reserve(CSEBlocks.size());
2623 for (BasicBlock *BB : CSEBlocks)
2624 if (DomTreeNode *N = DT->getNode(BB)) {
2625 assert(DT->isReachableFromEntry(N));
2626 CSEWorkList.push_back(N);
2629 // Sort blocks by domination. This ensures we visit a block after all blocks
2630 // dominating it are visited.
2631 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2632 [this](const DomTreeNode *A, const DomTreeNode *B) {
2633 return DT->properlyDominates(A, B);
2636 // Perform O(N^2) search over the gather sequences and merge identical
2637 // instructions. TODO: We can further optimize this scan if we split the
2638 // instructions into different buckets based on the insert lane.
2639 SmallVector<Instruction *, 16> Visited;
2640 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2641 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2642 "Worklist not sorted properly!");
2643 BasicBlock *BB = (*I)->getBlock();
2644 // For all instructions in blocks containing gather sequences:
2645 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2646 Instruction *In = it++;
2647 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2648 continue;
2650 // Check if we can replace this instruction with any of the
2651 // visited instructions.
2652 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2653 ve = Visited.end();
2654 v != ve; ++v) {
2655 if (In->isIdenticalTo(*v) &&
2656 DT->dominates((*v)->getParent(), In->getParent())) {
2657 In->replaceAllUsesWith(*v);
2658 eraseInstruction(In);
2659 In = nullptr;
2660 break;
2661 }
2662 }
2663 if (In) {
2664 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2665 Visited.push_back(In);
2666 }
2673 // Groups the instructions into a bundle (which is then a single scheduling entity)
2674 // and schedules instructions until the bundle gets ready.
2675 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2676 BoUpSLP *SLP) {
2677 if (isa<PHINode>(VL[0]))
2678 return true;
2680 // Initialize the instruction bundle.
2681 Instruction *OldScheduleEnd = ScheduleEnd;
2682 ScheduleData *PrevInBundle = nullptr;
2683 ScheduleData *Bundle = nullptr;
2684 bool ReSchedule = false;
2685 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2686 for (Value *V : VL) {
2687 extendSchedulingRegion(V);
2688 ScheduleData *BundleMember = getScheduleData(V);
2689 assert(BundleMember &&
2690 "no ScheduleData for bundle member (maybe not in same basic block)");
2691 if (BundleMember->IsScheduled) {
2692 // A bundle member was scheduled as single instruction before and now
2693 // needs to be scheduled as part of the bundle. We just get rid of the
2694 // existing schedule.
2695 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2696 << " was already scheduled\n");
2697 ReSchedule = true;
2698 }
2699 assert(BundleMember->isSchedulingEntity() &&
2700 "bundle member already part of other bundle");
2701 if (PrevInBundle) {
2702 PrevInBundle->NextInBundle = BundleMember;
2703 } else {
2704 Bundle = BundleMember;
2705 }
2706 BundleMember->UnscheduledDepsInBundle = 0;
2707 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2709 // Group the instructions to a bundle.
2710 BundleMember->FirstInBundle = Bundle;
2711 PrevInBundle = BundleMember;
2713 if (ScheduleEnd != OldScheduleEnd) {
2714 // The scheduling region got new instructions at the lower end (or it is a
2715 // new region for the first bundle). This makes it necessary to
2716 // recalculate all dependencies.
2717 // It is seldom that this needs to be done a second time after adding the
2718 // initial bundle to the region.
2719 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2720 ScheduleData *SD = getScheduleData(I);
2721 SD->clearDependencies();
2722 }
2723 ReSchedule = true;
2724 }
2725 if (ReSchedule) {
2726 resetSchedule();
2727 initialFillReadyList(ReadyInsts);
2728 }
2730 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2731 << BB->getName() << "\n");
2733 calculateDependencies(Bundle, true, SLP);
2735 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2736 // means that there are no cyclic dependencies and we can schedule it.
2737 // Note that's important that we don't "schedule" the bundle yet (see
2738 // cancelScheduling).
2739 while (!Bundle->isReady() && !ReadyInsts.empty()) {
2741 ScheduleData *pickedSD = ReadyInsts.back();
2742 ReadyInsts.pop_back();
2744 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2745 schedule(pickedSD, ReadyInsts);
2748 return Bundle->isReady();
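// Illustrative failure case (hypothetical IR, not from the original source):
//   %x = load i32, i32* %p
//   store i32 0, i32* %p
//   %y = load i32, i32* %p
// The bundle {%x, %y} can never become ready: the store must stay after %x
// but before %y, so the two loads cannot execute back-to-back. The caller
// then calls cancelScheduling() and gathers the values instead.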
2751 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2752 if (isa<PHINode>(VL[0]))
2753 return;
2755 ScheduleData *Bundle = getScheduleData(VL[0]);
2756 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2757 assert(!Bundle->IsScheduled &&
2758 "Can't cancel bundle which is already scheduled");
2759 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2760 "tried to unbundle something which is not a bundle");
2762 // Un-bundle: make single instructions out of the bundle.
2763 ScheduleData *BundleMember = Bundle;
2764 while (BundleMember) {
2765 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2766 BundleMember->FirstInBundle = BundleMember;
2767 ScheduleData *Next = BundleMember->NextInBundle;
2768 BundleMember->NextInBundle = nullptr;
2769 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2770 if (BundleMember->UnscheduledDepsInBundle == 0) {
2771 ReadyInsts.insert(BundleMember);
2773 BundleMember = Next;
2777 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2778 if (getScheduleData(V))
2779 return;
2780 Instruction *I = dyn_cast<Instruction>(V);
2781 assert(I && "bundle member must be an instruction");
2782 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2783 if (!ScheduleStart) {
2784 // It's the first instruction in the new region.
2785 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2786 ScheduleStart = I;
2787 ScheduleEnd = I->getNextNode();
2788 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2789 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2790 return;
2791 }
2792 // Search up and down at the same time, because we don't know if the new
2793 // instruction is above or below the existing scheduling region.
2794 BasicBlock::reverse_iterator UpIter(ScheduleStart);
2795 BasicBlock::reverse_iterator UpperEnd = BB->rend();
2796 BasicBlock::iterator DownIter(ScheduleEnd);
2797 BasicBlock::iterator LowerEnd = BB->end();
2798 for (;;) {
2799 if (UpIter != UpperEnd) {
2800 if (&*UpIter == I) {
2801 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2802 ScheduleStart = I;
2803 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
2804 return;
2805 }
2806 UpIter++;
2807 }
2808 if (DownIter != LowerEnd) {
2809 if (&*DownIter == I) {
2810 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2812 ScheduleEnd = I->getNextNode();
2813 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2814 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
2815 return;
2816 }
2817 DownIter++;
2818 }
2819 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2820 "instruction not found in block");
2824 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2825 Instruction *ToI,
2826 ScheduleData *PrevLoadStore,
2827 ScheduleData *NextLoadStore) {
2828 ScheduleData *CurrentLoadStore = PrevLoadStore;
2829 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2830 ScheduleData *SD = ScheduleDataMap[I];
2831 if (!SD) {
2832 // Allocate a new ScheduleData for the instruction.
2833 if (ChunkPos >= ChunkSize) {
2834 ScheduleDataChunks.push_back(
2835 llvm::make_unique<ScheduleData[]>(ChunkSize));
2836 ChunkPos = 0;
2837 }
2838 SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2839 ScheduleDataMap[I] = SD;
2840 SD->Inst = I;
2841 }
2842 assert(!isInSchedulingRegion(SD) &&
2843 "new ScheduleData already in scheduling region");
2844 SD->init(SchedulingRegionID);
2846 if (I->mayReadOrWriteMemory()) {
2847 // Update the linked list of memory accessing instructions.
2848 if (CurrentLoadStore) {
2849 CurrentLoadStore->NextLoadStore = SD;
2850 } else {
2851 FirstLoadStoreInRegion = SD;
2852 }
2853 CurrentLoadStore = SD;
2856 if (NextLoadStore) {
2857 if (CurrentLoadStore)
2858 CurrentLoadStore->NextLoadStore = NextLoadStore;
2859 } else {
2860 LastLoadStoreInRegion = CurrentLoadStore;
2861 }
2864 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2865 bool InsertInReadyList,
2866 BoUpSLP *SLP) {
2867 assert(SD->isSchedulingEntity());
2869 SmallVector<ScheduleData *, 10> WorkList;
2870 WorkList.push_back(SD);
2872 while (!WorkList.empty()) {
2873 ScheduleData *SD = WorkList.back();
2874 WorkList.pop_back();
2876 ScheduleData *BundleMember = SD;
2877 while (BundleMember) {
2878 assert(isInSchedulingRegion(BundleMember));
2879 if (!BundleMember->hasValidDependencies()) {
2881 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
2882 BundleMember->Dependencies = 0;
2883 BundleMember->resetUnscheduledDeps();
2885 // Handle def-use chain dependencies.
2886 for (User *U : BundleMember->Inst->users()) {
2887 if (isa<Instruction>(U)) {
2888 ScheduleData *UseSD = getScheduleData(U);
2889 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2890 BundleMember->Dependencies++;
2891 ScheduleData *DestBundle = UseSD->FirstInBundle;
2892 if (!DestBundle->IsScheduled) {
2893 BundleMember->incrementUnscheduledDeps(1);
2895 if (!DestBundle->hasValidDependencies()) {
2896 WorkList.push_back(DestBundle);
2899 } else {
2900 // I'm not sure if this can ever happen. But we need to be safe.
2901 // This lets the instruction/bundle never be scheduled and
2902 // eventually disables vectorization.
2903 BundleMember->Dependencies++;
2904 BundleMember->incrementUnscheduledDeps(1);
2908 // Handle the memory dependencies.
2909 ScheduleData *DepDest = BundleMember->NextLoadStore;
2910 if (DepDest) {
2911 Instruction *SrcInst = BundleMember->Inst;
2912 MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
2913 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2914 unsigned numAliased = 0;
2915 unsigned DistToSrc = 1;
2917 while (DepDest) {
2918 assert(isInSchedulingRegion(DepDest));
2920 // We have two limits to reduce the complexity:
2921 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2922 // SLP->isAliased (which is the expensive part in this loop).
2923 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2924 // the whole loop (even if the loop is fast, it's quadratic).
2925 // It's important for the loop break condition (see below) to
2926 // check this limit even between two read-only instructions.
2927 if (DistToSrc >= MaxMemDepDistance ||
2928 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2929 (numAliased >= AliasedCheckLimit ||
2930 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2932 // We increment the counter only if the locations are aliased
2933 // (instead of counting all alias checks). This gives a better
2934 // balance between reduced runtime and accurate dependencies.
2935 numAliased++;
2937 DepDest->MemoryDependencies.push_back(BundleMember);
2938 BundleMember->Dependencies++;
2939 ScheduleData *DestBundle = DepDest->FirstInBundle;
2940 if (!DestBundle->IsScheduled) {
2941 BundleMember->incrementUnscheduledDeps(1);
2943 if (!DestBundle->hasValidDependencies()) {
2944 WorkList.push_back(DestBundle);
2947 DepDest = DepDest->NextLoadStore;
2949 // Example, explaining the loop break condition: Let's assume our
2950 // starting instruction is i0 and MaxMemDepDistance = 3.
2951 //
2952 //                      +--------v--v--v
2953 //             i0,i1,i2,i3,i4,i5,i6,i7,i8
2954 //             +--------^--^--^
2955 //
2956 // MaxMemDepDistance lets us stop alias-checking at i3 and we add
2957 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2958 // Previously we already added dependencies from i3 to i6,i7,i8
2959 // (because of MaxMemDepDistance). As we added a dependency from
2960 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2961 // and we can abort this loop at i6.
2962 if (DistToSrc >= 2 * MaxMemDepDistance)
2963 break;
2965 DistToSrc++;
2966 }
2967 }
2968 BundleMember = BundleMember->NextInBundle;
2970 if (InsertInReadyList && SD->isReady()) {
2971 ReadyInsts.push_back(SD);
2972 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2977 void BoUpSLP::BlockScheduling::resetSchedule() {
2978 assert(ScheduleStart &&
2979 "tried to reset schedule on block which has not been scheduled");
2980 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2981 ScheduleData *SD = getScheduleData(I);
2982 assert(isInSchedulingRegion(SD));
2983 SD->IsScheduled = false;
2984 SD->resetUnscheduledDeps();
2989 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2991 if (!BS->ScheduleStart)
2992 return;
2994 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2996 BS->resetSchedule();
2998 // For the real scheduling we use a more sophisticated ready-list: it is
2999 // sorted by the original instruction location. This lets the final schedule
3000 // be as close as possible to the original instruction order.
3001 struct ScheduleDataCompare {
3002 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3003 return SD2->SchedulingPriority < SD1->SchedulingPriority;
3004 }
3005 };
3006 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3008 // Ensure that all dependency data is updated and fill the ready-list with
3009 // initial instructions.
3010 int Idx = 0;
3011 int NumToSchedule = 0;
3012 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3013 I = I->getNextNode()) {
3014 ScheduleData *SD = BS->getScheduleData(I);
3015 assert(
3016 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3017 "scheduler and vectorizer have different opinion on what is a bundle");
3018 SD->FirstInBundle->SchedulingPriority = Idx++;
3019 if (SD->isSchedulingEntity()) {
3020 BS->calculateDependencies(SD, false, this);
3021 NumToSchedule++;
3022 }
3023 }
3024 BS->initialFillReadyList(ReadyInsts);
3026 Instruction *LastScheduledInst = BS->ScheduleEnd;
3028 // Do the "real" scheduling.
3029 while (!ReadyInsts.empty()) {
3030 ScheduleData *picked = *ReadyInsts.begin();
3031 ReadyInsts.erase(ReadyInsts.begin());
3033 // Move the scheduled instruction(s) to their dedicated places, if not
3034 // there yet.
3035 ScheduleData *BundleMember = picked;
3036 while (BundleMember) {
3037 Instruction *pickedInst = BundleMember->Inst;
3038 if (LastScheduledInst->getNextNode() != pickedInst) {
3039 BS->BB->getInstList().remove(pickedInst);
3040 BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
3042 LastScheduledInst = pickedInst;
3043 BundleMember = BundleMember->NextInBundle;
3046 BS->schedule(picked, ReadyInsts);
3047 NumToSchedule--;
3048 }
3049 assert(NumToSchedule == 0 && "could not schedule all instructions");
3051 // Avoid duplicate scheduling of the block.
3052 BS->ScheduleStart = nullptr;
3055 /// The SLPVectorizer Pass.
3056 struct SLPVectorizer : public FunctionPass {
3057 typedef SmallVector<StoreInst *, 8> StoreList;
3058 typedef MapVector<Value *, StoreList> StoreListMap;
3060 /// Pass identification, replacement for typeid
3061 static char ID;
3063 explicit SLPVectorizer() : FunctionPass(ID) {
3064 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3067 ScalarEvolution *SE;
3068 TargetTransformInfo *TTI;
3069 TargetLibraryInfo *TLI;
3070 AliasAnalysis *AA;
3071 LoopInfo *LI;
3072 DominatorTree *DT;
3073 AssumptionCache *AC;
3075 bool runOnFunction(Function &F) override {
3076 if (skipOptnoneFunction(F))
3077 return false;
3079 SE = &getAnalysis<ScalarEvolution>();
3080 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3081 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3082 TLI = TLIP ? &TLIP->getTLI() : nullptr;
3083 AA = &getAnalysis<AliasAnalysis>();
3084 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3085 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3086 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3088 StoreRefs.clear();
3089 bool Changed = false;
3091 // If the target claims to have no vector registers don't attempt
3092 // vectorization.
3093 if (!TTI->getNumberOfRegisters(true))
3094 return false;
3096 // Use the vector register size specified by the target unless overridden
3097 // by a command-line option.
3098 // TODO: It would be better to limit the vectorization factor based on
3099 // data type rather than just register size. For example, x86 AVX has
3100 // 256-bit registers, but it does not support integer operations
3101 // at that width (that requires AVX2).
3102 if (MaxVectorRegSizeOption.getNumOccurrences())
3103 MaxVecRegSize = MaxVectorRegSizeOption;
3104 else
3105 MaxVecRegSize = TTI->getRegisterBitWidth(true);
3107 // Don't vectorize when the attribute NoImplicitFloat is used.
3108 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3109 return false;
3111 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3113 // Use the bottom up slp vectorizer to construct chains that start with
3114 // store instructions.
3115 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
3117 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3118 // delete instructions.
3120 // Scan the blocks in the function in post order.
3121 for (auto BB : post_order(&F.getEntryBlock())) {
3122 // Vectorize trees that end at stores.
3123 if (unsigned count = collectStores(BB, R)) {
3124 (void)count;
3125 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
3126 Changed |= vectorizeStoreChains(R);
3129 // Vectorize trees that end at reductions.
3130 Changed |= vectorizeChainsInBlock(BB, R);
3131 }
3133 if (Changed) {
3134 R.optimizeGatherSequence();
3135 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3136 DEBUG(verifyFunction(F));
3137 }
3138 return Changed;
3139 }
3141 void getAnalysisUsage(AnalysisUsage &AU) const override {
3142 FunctionPass::getAnalysisUsage(AU);
3143 AU.addRequired<AssumptionCacheTracker>();
3144 AU.addRequired<ScalarEvolution>();
3145 AU.addRequired<AliasAnalysis>();
3146 AU.addRequired<TargetTransformInfoWrapperPass>();
3147 AU.addRequired<LoopInfoWrapperPass>();
3148 AU.addRequired<DominatorTreeWrapperPass>();
3149 AU.addPreserved<LoopInfoWrapperPass>();
3150 AU.addPreserved<DominatorTreeWrapperPass>();
3151 AU.setPreservesCFG();
3156 /// \brief Collect memory references and sort them according to their base
3157 /// object. We sort the stores to their base objects to reduce the cost of the
3158 /// quadratic search on the stores. TODO: We can further reduce this cost
3159 /// if we flush the chain creation every time we run into a memory barrier.
3160 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
3162 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3163 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3165 /// \brief Try to vectorize a list of operands.
3166 /// \param BuildVector A list of users to ignore for the purpose of
3167 /// scheduling and that don't need extracting.
3168 /// \returns true if a value was vectorized.
3169 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3170 ArrayRef<Value *> BuildVector = None,
3171 bool allowReorder = false);
3173 /// \brief Try to vectorize a chain that may start at the operands of \p V.
3174 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3176 /// \brief Vectorize the stores that were collected in StoreRefs.
3177 bool vectorizeStoreChains(BoUpSLP &R);
3179 /// \brief Scan the basic block and look for patterns that are likely to start
3180 /// a vectorization chain.
3181 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3183 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3184 BoUpSLP &R, unsigned VecRegSize);
3186 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3187 BoUpSLP &R);
3189 StoreListMap StoreRefs;
3190 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3193 /// \brief Check that the Values in the slice in VL array are still existent in
3194 /// the WeakVH array.
3195 /// Vectorization of part of the VL array may cause later values in the VL array
3196 /// to become invalid. We track when this has happened in the WeakVH array.
3197 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3198 unsigned SliceBegin, unsigned SliceSize) {
3199 VL = VL.slice(SliceBegin, SliceSize);
3200 VH = VH.slice(SliceBegin, SliceSize);
3201 return !std::equal(VL.begin(), VL.end(), VH.begin());
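// For example (illustrative): once the slice [0, 4) of VL has been
// vectorized, its scalars are erased and the corresponding WeakVH entries
// turn null, so a later slice that still names those values no longer
// matches and is skipped by the callers below.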
3204 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3205 int CostThreshold, BoUpSLP &R,
3206 unsigned VecRegSize) {
3207 unsigned ChainLen = Chain.size();
3208 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3209 << ".\n");
3210 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3211 auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
3212 unsigned Sz = DL.getTypeSizeInBits(StoreTy);
3213 unsigned VF = VecRegSize / Sz;
3215 if (!isPowerOf2_32(Sz) || VF < 2)
3216 return false;
3218 // Keep track of values that were deleted by vectorizing in the loop below.
3219 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3221 bool Changed = false;
3222 // Look for profitable vectorizable trees at all offsets, starting at zero.
3223 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3224 if (i + VF > e)
3225 break;
3227 // Check that a previous iteration of this loop did not delete the Value.
3228 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3229 continue;
3231 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3232 << "\n");
3233 ArrayRef<Value *> Operands = Chain.slice(i, VF);
3235 R.buildTree(Operands);
3237 int Cost = R.getTreeCost();
3239 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3240 if (Cost < CostThreshold) {
3241 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3242 R.vectorizeTree();
3244 // Move to the next bundle.
3245 i += VF - 1;
3246 Changed = true;
3247 }
3248 }
3250 return Changed;
3251 }
3253 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3254 int costThreshold, BoUpSLP &R) {
3255 SetVector<StoreInst *> Heads, Tails;
3256 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3258 // We may run into multiple chains that merge into a single chain. We mark the
3259 // stores that we vectorized so that we don't visit the same store twice.
3260 BoUpSLP::ValueSet VectorizedStores;
3261 bool Changed = false;
3263 // Do a quadratic search on all of the given stores and find
3264 // all of the pairs of stores that follow each other.
3265 for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3266 for (unsigned j = 0; j < e; ++j) {
3267 if (i == j)
3268 continue;
3269 const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
3270 if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
3271 Tails.insert(Stores[j]);
3272 Heads.insert(Stores[i]);
3273 ConsecutiveChain[Stores[i]] = Stores[j];
3278 // For stores that start but don't end a link in the chain:
3279 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3280 it != e; ++it) {
3281 if (Tails.count(*it))
3282 continue;
3284 // We found a store instr that starts a chain. Now follow the chain and try
3285 // to vectorize it.
3286 BoUpSLP::ValueList Operands;
3287 StoreInst *I = *it;
3288 // Collect the chain into a list.
3289 while (Tails.count(I) || Heads.count(I)) {
3290 if (VectorizedStores.count(I))
3291 break;
3292 Operands.push_back(I);
3293 // Move to the next value in the chain.
3294 I = ConsecutiveChain[I];
3297 // FIXME: Is division-by-2 the correct step? Should we assert that the
3298 // register size is a power-of-2?
3299 for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
3300 if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
3301 // Mark the vectorized stores so that we don't vectorize them again.
3302 VectorizedStores.insert(Operands.begin(), Operands.end());
3303 Changed = true;
3304 break;
3305 }
3306 }
3307 }
3309 return Changed;
3310 }
3313 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3314 unsigned count = 0;
3315 StoreRefs.clear();
3316 const DataLayout &DL = BB->getModule()->getDataLayout();
3317 for (Instruction &I : *BB) {
3318 StoreInst *SI = dyn_cast<StoreInst>(&I);
3319 if (!SI)
3320 continue;
3322 // Don't touch volatile stores.
3323 if (!SI->isSimple())
3324 continue;
3326 // Check that the pointer points to scalars.
3327 Type *Ty = SI->getValueOperand()->getType();
3328 if (!isValidElementType(Ty))
3329 continue;
3331 // Find the base pointer.
3332 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3334 // Save the store locations.
3335 StoreRefs[Ptr].push_back(SI);
3336 count++;
3337 }
3338 return count;
3339 }
3341 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3342 if (!A || !B)
3343 return false;
3344 Value *VL[] = { A, B };
3345 return tryToVectorizeList(VL, R, None, true);
3348 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3349 ArrayRef<Value *> BuildVector,
3350 bool allowReorder) {
3352 if (VL.size() < 2)
3353 return false;
3354 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3356 // Check that all of the parts are scalar instructions of the same type.
3357 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3358 if (!I0)
3359 return false;
3361 unsigned Opcode0 = I0->getOpcode();
3362 const DataLayout &DL = I0->getModule()->getDataLayout();
3364 Type *Ty0 = I0->getType();
3365 unsigned Sz = DL.getTypeSizeInBits(Ty0);
3366 // FIXME: Register size should be a parameter to this function, so we can
3367 // try different vectorization factors.
3368 unsigned VF = MinVecRegSize / Sz;
3370 for (Value *V : VL) {
3371 Type *Ty = V->getType();
3372 if (!isValidElementType(Ty))
3373 return false;
3374 Instruction *Inst = dyn_cast<Instruction>(V);
3375 if (!Inst || Inst->getOpcode() != Opcode0)
3376 return false;
3377 }
3379 bool Changed = false;
3381 // Keep track of values that were deleted by vectorizing in the loop below.
3382 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3384 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3385 unsigned OpsWidth = 0;
3387 if (i + VF > e)
3388 OpsWidth = e - i;
3389 else
3390 OpsWidth = VF;
3392 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3393 continue;
3395 // Check that a previous iteration of this loop did not delete the Value.
3396 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3397 continue;
3399 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3400 << "\n");
3401 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3403 ArrayRef<Value *> BuildVectorSlice;
3404 if (!BuildVector.empty())
3405 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3407 R.buildTree(Ops, BuildVectorSlice);
3408 // TODO: check if we can allow reordering also for other cases than
3409 // tryToVectorizePair()
3410 if (allowReorder && R.shouldReorder()) {
3411 assert(Ops.size() == 2);
3412 assert(BuildVectorSlice.empty());
3413 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3414 R.buildTree(ReorderedOps, None);
3416 int Cost = R.getTreeCost();
3418 if (Cost < -SLPCostThreshold) {
3419 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3420 Value *VectorizedRoot = R.vectorizeTree();
3422 // Reconstruct the build vector by extracting the vectorized root. This
3423 // way we handle the case where some elements of the vector are undefined.
3424 // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
3425 if (!BuildVectorSlice.empty()) {
3426 // The insert point is the last build vector instruction. The vectorized
3427 // root will precede it. This guarantees that we get an instruction. The
3428 // vectorized tree could have been constant folded.
3429 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3430 unsigned VecIdx = 0;
3431 for (auto &V : BuildVectorSlice) {
3432 IRBuilder<true, NoFolder> Builder(
3433 ++BasicBlock::iterator(InsertAfter));
3434 InsertElementInst *IE = cast<InsertElementInst>(V);
3435 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3436 VectorizedRoot, Builder.getInt32(VecIdx++)));
3437 IE->setOperand(1, Extract);
3438 IE->removeFromParent();
3439 IE->insertAfter(Extract);
3443 // Move to the next bundle.
3444 i += VF - 1;
3445 Changed = true;
3446 }
3447 }
3449 return Changed;
3450 }
3452 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3453 if (!V)
3454 return false;
3456 // Try to vectorize V.
3457 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3458 return true;
3460 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3461 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3463 if (B && B->hasOneUse()) {
3464 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3465 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3466 if (tryToVectorizePair(A, B0, R)) {
3467 return true;
3468 }
3469 if (tryToVectorizePair(A, B1, R)) {
3470 return true;
3471 }
3472 }
3475 if (A && A->hasOneUse()) {
3476 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3477 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3478 if (tryToVectorizePair(A0, B, R)) {
3479 return true;
3480 }
3481 if (tryToVectorizePair(A1, B, R)) {
3482 return true;
3483 }
3484 }
3485 return false;
3486 }
3488 /// \brief Generate a shuffle mask to be used in a reduction tree.
3490 /// \param VecLen The length of the vector to be reduced.
3491 /// \param NumEltsToRdx The number of elements that should be reduced in the
3492 ///        vector.
3493 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3494 /// reduction. A pairwise reduction will generate a mask of
3495 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3496 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3497 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3498 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3499 bool IsPairwise, bool IsLeft,
3500 IRBuilder<> &Builder) {
3501 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3503 SmallVector<Constant *, 32> ShuffleMask(
3504 VecLen, UndefValue::get(Builder.getInt32Ty()));
3506 if (IsPairwise)
3507 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3508 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3509 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3510 else
3511 // Move the upper half of the vector to the lower half.
3512 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3513 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3515 return ConstantVector::get(ShuffleMask);
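// Examples (illustrative) for VecLen = 4 and NumEltsToRdx = 2:
//   pairwise, left:  <i32 0, i32 2, undef, undef>
//   pairwise, right: <i32 1, i32 3, undef, undef>
//   splitting:       <i32 2, i32 3, undef, undef>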
3519 /// Model horizontal reductions.
3521 /// A horizontal reduction is a tree of reduction operations (currently add and
3522 /// fadd) that has operations that can be put into a vector as its leaf.
3523 /// For example, this tree:
3524 ///
3525 /// mul mul mul mul
3526 ///  \  /    \  /
3527 ///   +       +
3528 ///    \     /
3529 ///       +
3530 /// This tree has "mul" as its reduced values and "+" as its reduction
3531 /// operations. A reduction might be feeding into a store or a binary operation
3532 /// feeding a phi.
3546 class HorizontalReduction {
3547 SmallVector<Value *, 16> ReductionOps;
3548 SmallVector<Value *, 32> ReducedVals;
3550 BinaryOperator *ReductionRoot;
3551 PHINode *ReductionPHI;
3553 /// The opcode of the reduction.
3554 unsigned ReductionOpcode;
3555 /// The opcode of the values we perform a reduction on.
3556 unsigned ReducedValueOpcode;
3557 /// The width of one full horizontal reduction operation.
3558 unsigned ReduxWidth;
3559 /// Should we model this reduction as a pairwise reduction tree or a tree that
3560 /// splits the vector in halves and adds those halves.
3561 bool IsPairwiseReduction;
3564 HorizontalReduction()
3565 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3566 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3568 /// \brief Try to find a reduction tree.
3569 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
3570 assert((!Phi ||
3571 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3572 "This phi needs to use the binary operator");
3574 // We could have an initial reduction that is not an add.
3575 // r *= v1 + v2 + v3 + v4
3576 // In such a case start looking for a tree rooted in the first '+'.
3577 if (Phi) {
3578 if (B->getOperand(0) == Phi) {
3579 Phi = nullptr;
3580 B = dyn_cast<BinaryOperator>(B->getOperand(1));
3581 } else if (B->getOperand(1) == Phi) {
3582 Phi = nullptr;
3583 B = dyn_cast<BinaryOperator>(B->getOperand(0));
3584 }
3585 }
3587 if (!B)
3588 return false;
3590 Type *Ty = B->getType();
3591 if (!isValidElementType(Ty))
3592 return false;
3594 const DataLayout &DL = B->getModule()->getDataLayout();
3595 ReductionOpcode = B->getOpcode();
3596 ReducedValueOpcode = 0;
3597 // FIXME: Register size should be a parameter to this function, so we can
3598 // try different vectorization factors.
3599 ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
3600 ReductionRoot = B;
3601 ReductionPHI = Phi;
3603 if (ReduxWidth < 4)
3604 return false;
3606 // We currently only support adds.
3607 if (ReductionOpcode != Instruction::Add &&
3608 ReductionOpcode != Instruction::FAdd)
3609 return false;
    // Post order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators.
    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      BinaryOperator *TreeN = Stack.back().first;
      unsigned EdgeToVisit = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Only handle trees in the current basic block.
      if (TreeN->getParent() != B->getParent())
        return false;

      // Each tree node needs to have one user except for the ultimate
      // reduction.
      if (!TreeN->hasOneUse() && TreeN != B)
        return false;

      // Postorder visit.
      if (EdgeToVisit == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the adds.
          if (!TreeN->isAssociative())
            return false;
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVisit);
      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
      if (Next)
        Stack.push_back(std::make_pair(Next, 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
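  ///
  /// The reduced values are vectorized in chunks of ReduxWidth; each chunk is
  /// collapsed to a scalar by emitReduction(), and the per-chunk results plus
  /// any leftover scalar values are then combined with the reduction opcode
  /// before the original reduction root is replaced.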
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.SetFastMathFlags(Unsafe);
    unsigned i = 0;
    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else {
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
      }
    }
    return VectorizedTree != nullptr;
  }

private:
  /// \brief Calculate the cost of a reduction.
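  ///
  /// For example (illustrative numbers only): for a 4-wide add reduction where
  /// the target reports a pairwise cost of 5, a splitting cost of 3 and a
  /// per-element scalar cost of 1, this returns min(5, 3) - 4 * 1 = -1, i.e.
  /// the vector form is estimated to be one unit cheaper.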
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
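  ///
  /// For example, a splitting reduction with ReduxWidth = 4 emits two rounds:
  /// shuffle with mask <2, 3, undef, undef>, add, shuffle with mask
  /// <1, undef, undef, undef>, add, and finally an extract of element 0.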
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches.
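///
/// On success, BuildVector holds the insertelement chain (%ra..%rd above) and
/// BuildVectorOpds the inserted scalar operands (%s0..%s3).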
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
         ++instr) {
      PHINode *P = dyn_cast<PHINode>(instr);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort PHINodes based on their type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success. Start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;
      Value *Rdx =
          (P->getIncomingBlock(0) == BB
               ? (P->getIncomingValue(0))
               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
                                               : nullptr));
      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      HorizontalReduction HorRdx;
      if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
          HorRdx.tryToReduce(R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become an invalid value.
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          HorizontalReduction HorRdx;
          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
                HorRdx.tryToReduce(R, TTI)) ||
               tryToVectorize(BinOp, R))) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become an invalid value.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become an invalid value.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
       it != e; ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    // TODO: The limit of 16 inhibits greater vectorization factors.
    //       For example, AVX2 supports v32i8. Increasing this limit, however,
    //       may cause a significant compile-time increase.
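    // For instance, a group of 40 stores is handled as chunks of 16, 16 and
    // then 8 (Len = min(CE - CI, 16) on the final iteration).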
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}