lib/Transforms/Vectorize/SLPVectorizer.cpp

   1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  10 // stores that can be put together into vector-stores. Next, it attempts to
  11 // construct vectorizable tree using the use-def chains. If a profitable tree
  12 // was found, the SLP vectorizer performs vectorization on the tree.
  13 //
  14 // The pass is inspired by the work described in the paper:
  15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  16 //
  17 //===----------------------------------------------------------------------===//
  18 #include "llvm/Transforms/Vectorize.h"
  19 #include "llvm/ADT/MapVector.h"
  20 #include "llvm/ADT/PostOrderIterator.h"
  21 #include "llvm/ADT/SetVector.h"
  22 #include "llvm/ADT/Statistic.h"
  23 #include "llvm/Analysis/AliasAnalysis.h"
  24 #include "llvm/Analysis/LoopInfo.h"
  25 #include "llvm/Analysis/ScalarEvolution.h"
  26 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  27 #include "llvm/Analysis/TargetTransformInfo.h"
  28 #include "llvm/Analysis/ValueTracking.h"
  29 #include "llvm/IR/DataLayout.h"
  30 #include "llvm/IR/Dominators.h"
  31 #include "llvm/IR/IRBuilder.h"
  32 #include "llvm/IR/Instructions.h"
  33 #include "llvm/IR/IntrinsicInst.h"
  34 #include "llvm/IR/Module.h"
  35 #include "llvm/IR/NoFolder.h"
  36 #include "llvm/IR/Type.h"
  37 #include "llvm/IR/Value.h"
  38 #include "llvm/IR/Verifier.h"
  39 #include "llvm/Pass.h"
  40 #include "llvm/Support/CommandLine.h"
  41 #include "llvm/Support/Debug.h"
  42 #include "llvm/Support/raw_ostream.h"
  43 #include "llvm/Transforms/Utils/VectorUtils.h"
  44 #include <algorithm>
  45 #include <map>
  46 #include <memory>
  47
  48 using namespace llvm;
  49
  50 #define SV_NAME "slp-vectorizer"
  51 #define DEBUG_TYPE "SLP"
  52
  53 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
  54
  55 static cl::opt<int>
  56     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  57                      cl::desc("Only vectorize if you gain more than this "
  58                               "number "));
  59
  60 static cl::opt<bool>
  61 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
  62                    cl::desc("Attempt to vectorize horizontal reductions"));
  63
  64 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  65     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  66     cl::desc(
  67         "Attempt to vectorize horizontal reductions feeding into a store"));
  68
  69 namespace {
  70
  71 static const unsigned MinVecRegSize = 128;
  72
  73 static const unsigned RecursionMaxDepth = 12;
  74
  75 /// \returns the parent basic block if all of the instructions in \p VL
  76 /// are in the same block or null otherwise.
  77 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  78   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  79   if (!I0)
  80     return nullptr;
  81   BasicBlock *BB = I0->getParent();
  82   for (int i = 1, e = VL.size(); i < e; i++) {
  83     Instruction *I = dyn_cast<Instruction>(VL[i]);
  84     if (!I)
  85       return nullptr;
  86
  87     if (BB != I->getParent())
  88       return nullptr;
  89   }
  90   return BB;
  91 }
  92
  93 /// \returns True if all of the values in \p VL are constants.
  94 static bool allConstant(ArrayRef<Value *> VL) {
  95   for (unsigned i = 0, e = VL.size(); i < e; ++i)
  96     if (!isa<Constant>(VL[i]))
  97       return false;
  98   return true;
  99 }
 100
 101 /// \returns True if all of the values in \p VL are identical.
 102 static bool isSplat(ArrayRef<Value *> VL) {
 103   for (unsigned i = 1, e = VL.size(); i < e; ++i)
 104     if (VL[i] != VL[0])
 105       return false;
 106   return true;
 107 }
 108
 109 ///\returns Opcode that can be clubbed with \p Op to create an alternate
 110 /// sequence which can later be merged as a ShuffleVector instruction.
 111 static unsigned getAltOpcode(unsigned Op) {
 112   switch (Op) {
 113   case Instruction::FAdd:
 114     return Instruction::FSub;
 115   case Instruction::FSub:
 116     return Instruction::FAdd;
 117   case Instruction::Add:
 118     return Instruction::Sub;
 119   case Instruction::Sub:
 120     return Instruction::Add;
 121   default:
 122     return 0;
 123   }
 124 }
 125
 126 ///\returns bool representing if Opcode \p Op can be part
 127 /// of an alternate sequence which can later be merged as
 128 /// a ShuffleVector instruction.
 129 static bool canCombineAsAltInst(unsigned Op) {
 130   if (Op == Instruction::FAdd || Op == Instruction::FSub ||
 131       Op == Instruction::Sub || Op == Instruction::Add)
 132     return true;
 133   return false;
 134 }
 135
 136 /// \returns ShuffleVector instruction if intructions in \p VL have
 137 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
 138 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
 139 static unsigned isAltInst(ArrayRef<Value *> VL) {
 140   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 141   unsigned Opcode = I0->getOpcode();
 142   unsigned AltOpcode = getAltOpcode(Opcode);
 143   for (int i = 1, e = VL.size(); i < e; i++) {
 144     Instruction *I = dyn_cast<Instruction>(VL[i]);
 145     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
 146       return 0;
 147   }
 148   return Instruction::ShuffleVector;
 149 }
 150
 151 /// \returns The opcode if all of the Instructions in \p VL have the same
 152 /// opcode, or zero.
 153 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
 154   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 155   if (!I0)
 156     return 0;
 157   unsigned Opcode = I0->getOpcode();
 158   for (int i = 1, e = VL.size(); i < e; i++) {
 159     Instruction *I = dyn_cast<Instruction>(VL[i]);
 160     if (!I || Opcode != I->getOpcode()) {
 161       if (canCombineAsAltInst(Opcode) && i == 1)
 162         return isAltInst(VL);
 163       return 0;
 164     }
 165   }
 166   return Opcode;
 167 }
 168
 169 /// Get the intersection (logical and) of all of the potential IR flags
 170 /// of each scalar operation (VL) that will be converted into a vector (I).
 171 /// Flag set: NSW, NUW, exact, and all of fast-math.
 172 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
 173   if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
 174     if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
 175       // Intersection is initialized to the 0th scalar,
 176       // so start counting from index '1'.
 177       for (int i = 1, e = VL.size(); i < e; ++i) {
 178         if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
 179           Intersection->andIRFlags(Scalar);
 180       }
 181       VecOp->copyIRFlags(Intersection);
 182     }
 183   }
 184 }
 185
 186 /// \returns \p I after propagating metadata from \p VL.
 187 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
 188   Instruction *I0 = cast<Instruction>(VL[0]);
 189   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
 190   I0->getAllMetadataOtherThanDebugLoc(Metadata);
 191
 192   for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
 193     unsigned Kind = Metadata[i].first;
 194     MDNode *MD = Metadata[i].second;
 195
 196     for (int i = 1, e = VL.size(); MD && i != e; i++) {
 197       Instruction *I = cast<Instruction>(VL[i]);
 198       MDNode *IMD = I->getMetadata(Kind);
 199
 200       switch (Kind) {
 201       default:
 202         MD = nullptr; // Remove unknown metadata
 203         break;
 204       case LLVMContext::MD_tbaa:
 205         MD = MDNode::getMostGenericTBAA(MD, IMD);
 206         break;
 207       case LLVMContext::MD_alias_scope:
 208       case LLVMContext::MD_noalias:
 209         MD = MDNode::intersect(MD, IMD);
 210         break;
 211       case LLVMContext::MD_fpmath:
 212         MD = MDNode::getMostGenericFPMath(MD, IMD);
 213         break;
 214       }
 215     }
 216     I->setMetadata(Kind, MD);
 217   }
 218   return I;
 219 }
 220
 221 /// \returns The type that all of the values in \p VL have or null if there
 222 /// are different types.
 223 static Type* getSameType(ArrayRef<Value *> VL) {
 224   Type *Ty = VL[0]->getType();
 225   for (int i = 1, e = VL.size(); i < e; i++)
 226     if (VL[i]->getType() != Ty)
 227       return nullptr;
 228
 229   return Ty;
 230 }
 231
 232 /// \returns True if the ExtractElement instructions in VL can be vectorized
 233 /// to use the original vector.
 234 static bool CanReuseExtract(ArrayRef<Value *> VL) {
 235   assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
 236   // Check if all of the extracts come from the same vector and from the
 237   // correct offset.
 238   Value *VL0 = VL[0];
 239   ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
 240   Value *Vec = E0->getOperand(0);
 241
 242   // We have to extract from the same vector type.
 243   unsigned NElts = Vec->getType()->getVectorNumElements();
 244
 245   if (NElts != VL.size())
 246     return false;
 247
 248   // Check that all of the indices extract from the correct offset.
 249   ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
 250   if (!CI || CI->getZExtValue())
 251     return false;
 252
 253   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
 254     ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
 255     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
 256
 257     if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
 258       return false;
 259   }
 260
 261   return true;
 262 }
 263
 264 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
 265                                            SmallVectorImpl<Value *> &Left,
 266                                            SmallVectorImpl<Value *> &Right) {
 267
 268   SmallVector<Value *, 16> OrigLeft, OrigRight;
 269
 270   bool AllSameOpcodeLeft = true;
 271   bool AllSameOpcodeRight = true;
 272   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
 273     Instruction *I = cast<Instruction>(VL[i]);
 274     Value *V0 = I->getOperand(0);
 275     Value *V1 = I->getOperand(1);
 276
 277     OrigLeft.push_back(V0);
 278     OrigRight.push_back(V1);
 279
 280     Instruction *I0 = dyn_cast<Instruction>(V0);
 281     Instruction *I1 = dyn_cast<Instruction>(V1);
 282
 283     // Check whether all operands on one side have the same opcode. In this case
 284     // we want to preserve the original order and not make things worse by
 285     // reordering.
 286     AllSameOpcodeLeft = I0;
 287     AllSameOpcodeRight = I1;
 288
 289     if (i && AllSameOpcodeLeft) {
 290       if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
 291         if(P0->getOpcode() != I0->getOpcode())
 292           AllSameOpcodeLeft = false;
 293       } else
 294         AllSameOpcodeLeft = false;
 295     }
 296     if (i && AllSameOpcodeRight) {
 297       if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
 298         if(P1->getOpcode() != I1->getOpcode())
 299           AllSameOpcodeRight = false;
 300       } else
 301         AllSameOpcodeRight = false;
 302     }
 303
 304     // Sort two opcodes. In the code below we try to preserve the ability to use
 305     // broadcast of values instead of individual inserts.
 306     // vl1 = load
 307     // vl2 = phi
 308     // vr1 = load
 309     // vr2 = vr2
 310     //    = vl1 x vr1
 311     //    = vl2 x vr2
 312     // If we just sorted according to opcode we would leave the first line in
 313     // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
 314     //    = vl1 x vr1
 315     //    = vr2 x vl2
 316     // Because vr2 and vr1 are from the same load we loose the opportunity of a
 317     // broadcast for the packed right side in the backend: we have [vr1, vl2]
 318     // instead of [vr1, vr2=vr1].
 319     if (I0 && I1) {
 320        if(!i && I0->getOpcode() > I1->getOpcode()) {
 321          Left.push_back(I1);
 322          Right.push_back(I0);
 323        } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
 324          // Try not to destroy a broad cast for no apparent benefit.
 325          Left.push_back(I1);
 326          Right.push_back(I0);
 327        } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
 328          // Try preserve broadcasts.
 329          Left.push_back(I1);
 330          Right.push_back(I0);
 331        } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
 332          // Try preserve broadcasts.
 333          Left.push_back(I1);
 334          Right.push_back(I0);
 335        } else {
 336          Left.push_back(I0);
 337          Right.push_back(I1);
 338        }
 339        continue;
 340     }
 341     // One opcode, put the instruction on the right.
 342     if (I0) {
 343       Left.push_back(V1);
 344       Right.push_back(I0);
 345       continue;
 346     }
 347     Left.push_back(V0);
 348     Right.push_back(V1);
 349   }
 350
 351   bool LeftBroadcast = isSplat(Left);
 352   bool RightBroadcast = isSplat(Right);
 353
 354   // Don't reorder if the operands where good to begin with.
 355   if (!(LeftBroadcast || RightBroadcast) &&
 356       (AllSameOpcodeRight || AllSameOpcodeLeft)) {
 357     Left = OrigLeft;
 358     Right = OrigRight;
 359   }
 360 }
 361
 362 /// \returns True if in-tree use also needs extract. This refers to
 363 /// possible scalar operand in vectorized instruction.
 364 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 365                                     TargetLibraryInfo *TLI) {
 366
 367   unsigned Opcode = UserInst->getOpcode();
 368   switch (Opcode) {
 369   case Instruction::Load: {
 370     LoadInst *LI = cast<LoadInst>(UserInst);
 371     return (LI->getPointerOperand() == Scalar);
 372   }
 373   case Instruction::Store: {
 374     StoreInst *SI = cast<StoreInst>(UserInst);
 375     return (SI->getPointerOperand() == Scalar);
 376   }
 377   case Instruction::Call: {
 378     CallInst *CI = cast<CallInst>(UserInst);
 379     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
 380     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
 381       return (CI->getArgOperand(1) == Scalar);
 382     }
 383   }
 384   default:
 385     return false;
 386   }
 387 }
 388
 389 /// Bottom Up SLP Vectorizer.
 390 class BoUpSLP {
 391 public:
 392   typedef SmallVector<Value *, 8> ValueList;
 393   typedef SmallVector<Instruction *, 16> InstrList;
 394   typedef SmallPtrSet<Value *, 16> ValueSet;
 395   typedef SmallVector<StoreInst *, 8> StoreList;
 396
 397   BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
 398           TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
 399           LoopInfo *Li, DominatorTree *Dt)
 400       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
 401         F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
 402         Builder(Se->getContext()) {}
 403
 404   /// \brief Vectorize the tree that starts with the elements in \p VL.
 405   /// Returns the vectorized root.
 406   Value *vectorizeTree();
 407
 408   /// \returns the cost incurred by unwanted spills and fills, caused by
 409   /// holding live values over call sites.
 410   int getSpillCost();
 411
 412   /// \returns the vectorization cost of the subtree that starts at \p VL.
 413   /// A negative number means that this is profitable.
 414   int getTreeCost();
 415
 416   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
 417   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
 418   void buildTree(ArrayRef<Value *> Roots,
 419                  ArrayRef<Value *> UserIgnoreLst = None);
 420
 421   /// Clear the internal data structures that are created by 'buildTree'.
 422   void deleteTree() {
 423     VectorizableTree.clear();
 424     ScalarToTreeEntry.clear();
 425     MustGather.clear();
 426     ExternalUses.clear();
 427     NumLoadsWantToKeepOrder = 0;
 428     NumLoadsWantToChangeOrder = 0;
 429     for (auto &Iter : BlocksSchedules) {
 430       BlockScheduling *BS = Iter.second.get();
 431       BS->clear();
 432     }
 433   }
 434
 435   /// \returns true if the memory operations A and B are consecutive.
 436   bool isConsecutiveAccess(Value *A, Value *B);
 437
 438   /// \brief Perform LICM and CSE on the newly generated gather sequences.
 439   void optimizeGatherSequence();
 440
 441   /// \returns true if it is benefitial to reverse the vector order.
 442   bool shouldReorder() const {
 443     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
 444   }
 445
 446 private:
 447   struct TreeEntry;
 448
 449   /// \returns the cost of the vectorizable entry.
 450   int getEntryCost(TreeEntry *E);
 451
 452   /// This is the recursive part of buildTree.
 453   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 454
 455   /// Vectorize a single entry in the tree.
 456   Value *vectorizeTree(TreeEntry *E);
 457
 458   /// Vectorize a single entry in the tree, starting in \p VL.
 459   Value *vectorizeTree(ArrayRef<Value *> VL);
 460
 461   /// \returns the pointer to the vectorized value if \p VL is already
 462   /// vectorized, or NULL. They may happen in cycles.
 463   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
 464
 465   /// \brief Take the pointer operand from the Load/Store instruction.
 466   /// \returns NULL if this is not a valid Load/Store instruction.
 467   static Value *getPointerOperand(Value *I);
 468
 469   /// \brief Take the address space operand from the Load/Store instruction.
 470   /// \returns -1 if this is not a valid Load/Store instruction.
 471   static unsigned getAddressSpaceOperand(Value *I);
 472
 473   /// \returns the scalarization cost for this type. Scalarization in this
 474   /// context means the creation of vectors from a group of scalars.
 475   int getGatherCost(Type *Ty);
 476
 477   /// \returns the scalarization cost for this list of values. Assuming that
 478   /// this subtree gets vectorized, we may need to extract the values from the
 479   /// roots. This method calculates the cost of extracting the values.
 480   int getGatherCost(ArrayRef<Value *> VL);
 481
 482   /// \brief Set the Builder insert point to one after the last instruction in
 483   /// the bundle
 484   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
 485
 486   /// \returns a vector from a collection of scalars in \p VL.
 487   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 488
 489   /// \returns whether the VectorizableTree is fully vectoriable and will
 490   /// be beneficial even the tree height is tiny.
 491   bool isFullyVectorizableTinyTree();
 492
 493   struct TreeEntry {
 494     TreeEntry() : Scalars(), VectorizedValue(nullptr),
 495     NeedToGather(0) {}
 496
 497     /// \returns true if the scalars in VL are equal to this entry.
 498     bool isSame(ArrayRef<Value *> VL) const {
 499       assert(VL.size() == Scalars.size() && "Invalid size");
 500       return std::equal(VL.begin(), VL.end(), Scalars.begin());
 501     }
 502
 503     /// A vector of scalars.
 504     ValueList Scalars;
 505
 506     /// The Scalars are vectorized into this value. It is initialized to Null.
 507     Value *VectorizedValue;
 508
 509     /// Do we need to gather this sequence ?
 510     bool NeedToGather;
 511   };
 512
 513   /// Create a new VectorizableTree entry.
 514   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
 515     VectorizableTree.push_back(TreeEntry());
 516     int idx = VectorizableTree.size() - 1;
 517     TreeEntry *Last = &VectorizableTree[idx];
 518     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
 519     Last->NeedToGather = !Vectorized;
 520     if (Vectorized) {
 521       for (int i = 0, e = VL.size(); i != e; ++i) {
 522         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
 523         ScalarToTreeEntry[VL[i]] = idx;
 524       }
 525     } else {
 526       MustGather.insert(VL.begin(), VL.end());
 527     }
 528     return Last;
 529   }
 530
 531   /// -- Vectorization State --
 532   /// Holds all of the tree entries.
 533   std::vector<TreeEntry> VectorizableTree;
 534
 535   /// Maps a specific scalar to its tree entry.
 536   SmallDenseMap<Value*, int> ScalarToTreeEntry;
 537
 538   /// A list of scalars that we found that we need to keep as scalars.
 539   ValueSet MustGather;
 540
 541   /// This POD struct describes one external user in the vectorized tree.
 542   struct ExternalUser {
 543     ExternalUser (Value *S, llvm::User *U, int L) :
 544       Scalar(S), User(U), Lane(L){};
 545     // Which scalar in our function.
 546     Value *Scalar;
 547     // Which user that uses the scalar.
 548     llvm::User *User;
 549     // Which lane does the scalar belong to.
 550     int Lane;
 551   };
 552   typedef SmallVector<ExternalUser, 16> UserList;
 553
 554   /// A list of values that need to extracted out of the tree.
 555   /// This list holds pairs of (Internal Scalar : External User).
 556   UserList ExternalUses;
 557
 558   /// Holds all of the instructions that we gathered.
 559   SetVector<Instruction *> GatherSeq;
 560   /// A list of blocks that we are going to CSE.
 561   SetVector<BasicBlock *> CSEBlocks;
 562
 563   /// Contains all scheduling relevant data for an instruction.
 564   /// A ScheduleData either represents a single instruction or a member of an
 565   /// instruction bundle (= a group of instructions which is combined into a
 566   /// vector instruction).
 567   struct ScheduleData {
 568
 569     // The initial value for the dependency counters. It means that the
 570     // dependencies are not calculated yet.
 571     enum { InvalidDeps = -1 };
 572
 573     ScheduleData()
 574         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
 575           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
 576           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
 577           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
 578
 579     void init(int BlockSchedulingRegionID) {
 580       FirstInBundle = this;
 581       NextInBundle = nullptr;
 582       NextLoadStore = nullptr;
 583       IsScheduled = false;
 584       SchedulingRegionID = BlockSchedulingRegionID;
 585       UnscheduledDepsInBundle = UnscheduledDeps;
 586       clearDependencies();
 587     }
 588
 589     /// Returns true if the dependency information has been calculated.
 590     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
 591
 592     /// Returns true for single instructions and for bundle representatives
 593     /// (= the head of a bundle).
 594     bool isSchedulingEntity() const { return FirstInBundle == this; }
 595
 596     /// Returns true if it represents an instruction bundle and not only a
 597     /// single instruction.
 598     bool isPartOfBundle() const {
 599       return NextInBundle != nullptr || FirstInBundle != this;
 600     }
 601
 602     /// Returns true if it is ready for scheduling, i.e. it has no more
 603     /// unscheduled depending instructions/bundles.
 604     bool isReady() const {
 605       assert(isSchedulingEntity() &&
 606              "can't consider non-scheduling entity for ready list");
 607       return UnscheduledDepsInBundle == 0 && !IsScheduled;
 608     }
 609
 610     /// Modifies the number of unscheduled dependencies, also updating it for
 611     /// the whole bundle.
 612     int incrementUnscheduledDeps(int Incr) {
 613       UnscheduledDeps += Incr;
 614       return FirstInBundle->UnscheduledDepsInBundle += Incr;
 615     }
 616
 617     /// Sets the number of unscheduled dependencies to the number of
 618     /// dependencies.
 619     void resetUnscheduledDeps() {
 620       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
 621     }
 622
 623     /// Clears all dependency information.
 624     void clearDependencies() {
 625       Dependencies = InvalidDeps;
 626       resetUnscheduledDeps();
 627       MemoryDependencies.clear();
 628     }
 629
 630     void dump(raw_ostream &os) const {
 631       if (!isSchedulingEntity()) {
 632         os << "/ " << *Inst;
 633       } else if (NextInBundle) {
 634         os << '[' << *Inst;
 635         ScheduleData *SD = NextInBundle;
 636         while (SD) {
 637           os << ';' << *SD->Inst;
 638           SD = SD->NextInBundle;
 639         }
 640         os << ']';
 641       } else {
 642         os << *Inst;
 643       }
 644     }
 645
 646     Instruction *Inst;
 647
 648     /// Points to the head in an instruction bundle (and always to this for
 649     /// single instructions).
 650     ScheduleData *FirstInBundle;
 651
 652     /// Single linked list of all instructions in a bundle. Null if it is a
 653     /// single instruction.
 654     ScheduleData *NextInBundle;
 655
 656     /// Single linked list of all memory instructions (e.g. load, store, call)
 657     /// in the block - until the end of the scheduling region.
 658     ScheduleData *NextLoadStore;
 659
 660     /// The dependent memory instructions.
 661     /// This list is derived on demand in calculateDependencies().
 662     SmallVector<ScheduleData *, 4> MemoryDependencies;
 663
 664     /// This ScheduleData is in the current scheduling region if this matches
 665     /// the current SchedulingRegionID of BlockScheduling.
 666     int SchedulingRegionID;
 667
 668     /// Used for getting a "good" final ordering of instructions.
 669     int SchedulingPriority;
 670
 671     /// The number of dependencies. Constitutes of the number of users of the
 672     /// instruction plus the number of dependent memory instructions (if any).
 673     /// This value is calculated on demand.
 674     /// If InvalidDeps, the number of dependencies is not calculated yet.
 675     ///
 676     int Dependencies;
 677
 678     /// The number of dependencies minus the number of dependencies of scheduled
 679     /// instructions. As soon as this is zero, the instruction/bundle gets ready
 680     /// for scheduling.
 681     /// Note that this is negative as long as Dependencies is not calculated.
 682     int UnscheduledDeps;
 683
 684     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
 685     /// single instructions.
 686     int UnscheduledDepsInBundle;
 687
 688     /// True if this instruction is scheduled (or considered as scheduled in the
 689     /// dry-run).
 690     bool IsScheduled;
 691   };
 692
 693 #ifndef NDEBUG
 694   friend raw_ostream &operator<<(raw_ostream &os,
 695                                  const BoUpSLP::ScheduleData &SD);
 696 #endif
 697
 698   /// Contains all scheduling data for a basic block.
 699   ///
 700   struct BlockScheduling {
 701
 702     BlockScheduling(BasicBlock *BB)
 703         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
 704           ScheduleStart(nullptr), ScheduleEnd(nullptr),
 705           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
 706           // Make sure that the initial SchedulingRegionID is greater than the
 707           // initial SchedulingRegionID in ScheduleData (which is 0).
 708           SchedulingRegionID(1) {}
 709
 710     void clear() {
 711       ReadyInsts.clear();
 712       ScheduleStart = nullptr;
 713       ScheduleEnd = nullptr;
 714       FirstLoadStoreInRegion = nullptr;
 715       LastLoadStoreInRegion = nullptr;
 716
 717       // Make a new scheduling region, i.e. all existing ScheduleData is not
 718       // in the new region yet.
 719       ++SchedulingRegionID;
 720     }
 721
 722     ScheduleData *getScheduleData(Value *V) {
 723       ScheduleData *SD = ScheduleDataMap[V];
 724       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
 725         return SD;
 726       return nullptr;
 727     }
 728
 729     bool isInSchedulingRegion(ScheduleData *SD) {
 730       return SD->SchedulingRegionID == SchedulingRegionID;
 731     }
 732
 733     /// Marks an instruction as scheduled and puts all dependent ready
 734     /// instructions into the ready-list.
 735     template <typename ReadyListType>
 736     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
 737       SD->IsScheduled = true;
 738       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
 739
 740       ScheduleData *BundleMember = SD;
 741       while (BundleMember) {
 742         // Handle the def-use chain dependencies.
 743         for (Use &U : BundleMember->Inst->operands()) {
 744           ScheduleData *OpDef = getScheduleData(U.get());
 745           if (OpDef && OpDef->hasValidDependencies() &&
 746               OpDef->incrementUnscheduledDeps(-1) == 0) {
 747             // There are no more unscheduled dependencies after decrementing,
 748             // so we can put the dependent instruction into the ready list.
 749             ScheduleData *DepBundle = OpDef->FirstInBundle;
 750             assert(!DepBundle->IsScheduled &&
 751                    "already scheduled bundle gets ready");
 752             ReadyList.insert(DepBundle);
 753             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
 754           }
 755         }
 756         // Handle the memory dependencies.
 757         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
 758           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
 759             // There are no more unscheduled dependencies after decrementing,
 760             // so we can put the dependent instruction into the ready list.
 761             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
 762             assert(!DepBundle->IsScheduled &&
 763                    "already scheduled bundle gets ready");
 764             ReadyList.insert(DepBundle);
 765             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
 766           }
 767         }
 768         BundleMember = BundleMember->NextInBundle;
 769       }
 770     }
 771
 772     /// Put all instructions into the ReadyList which are ready for scheduling.
 773     template <typename ReadyListType>
 774     void initialFillReadyList(ReadyListType &ReadyList) {
 775       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
 776         ScheduleData *SD = getScheduleData(I);
 777         if (SD->isSchedulingEntity() && SD->isReady()) {
 778           ReadyList.insert(SD);
 779           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
 780         }
 781       }
 782     }
 783
 784     /// Checks if a bundle of instructions can be scheduled, i.e. has no
 785     /// cyclic dependencies. This is only a dry-run, no instructions are
 786     /// actually moved at this stage.
 787     bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);
 788
 789     /// Un-bundles a group of instructions.
 790     void cancelScheduling(ArrayRef<Value *> VL);
 791
 792     /// Extends the scheduling region so that V is inside the region.
 793     void extendSchedulingRegion(Value *V);
 794
 795     /// Initialize the ScheduleData structures for new instructions in the
 796     /// scheduling region.
 797     void initScheduleData(Instruction *FromI, Instruction *ToI,
 798                           ScheduleData *PrevLoadStore,
 799                           ScheduleData *NextLoadStore);
 800
 801     /// Updates the dependency information of a bundle and of all instructions/
 802     /// bundles which depend on the original bundle.
 803     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
 804                                AliasAnalysis *AA);
 805
 806     /// Sets all instruction in the scheduling region to un-scheduled.
 807     void resetSchedule();
 808
 809     BasicBlock *BB;
 810
 811     /// Simple memory allocation for ScheduleData.
 812     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 813
 814     /// The size of a ScheduleData array in ScheduleDataChunks.
 815     int ChunkSize;
 816
 817     /// The allocator position in the current chunk, which is the last entry
 818     /// of ScheduleDataChunks.
 819     int ChunkPos;
 820
 821     /// Attaches ScheduleData to Instruction.
 822     /// Note that the mapping survives during all vectorization iterations, i.e.
 823     /// ScheduleData structures are recycled.
 824     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
 825
 826     struct ReadyList : SmallVector<ScheduleData *, 8> {
 827       void insert(ScheduleData *SD) { push_back(SD); }
 828     };
 829
 830     /// The ready-list for scheduling (only used for the dry-run).
 831     ReadyList ReadyInsts;
 832
 833     /// The first instruction of the scheduling region.
 834     Instruction *ScheduleStart;
 835
 836     /// The first instruction _after_ the scheduling region.
 837     Instruction *ScheduleEnd;
 838
 839     /// The first memory accessing instruction in the scheduling region
 840     /// (can be null).
 841     ScheduleData *FirstLoadStoreInRegion;
 842
 843     /// The last memory accessing instruction in the scheduling region
 844     /// (can be null).
 845     ScheduleData *LastLoadStoreInRegion;
 846
 847     /// The ID of the scheduling region. For a new vectorization iteration this
 848     /// is incremented which "removes" all ScheduleData from the region.
 849     int SchedulingRegionID;
 850   };
 851
 852   /// Attaches the BlockScheduling structures to basic blocks.
 853   DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 854
 855   /// Performs the "real" scheduling. Done before vectorization is actually
 856   /// performed in a basic block.
 857   void scheduleBlock(BlockScheduling *BS);
 858
 859   /// List of users to ignore during scheduling and that don't need extracting.
 860   ArrayRef<Value *> UserIgnoreList;
 861
 862   // Number of load-bundles, which contain consecutive loads.
 863   int NumLoadsWantToKeepOrder;
 864
 865   // Number of load-bundles of size 2, which are consecutive loads if reversed.
 866   int NumLoadsWantToChangeOrder;
 867
 868   // Analysis and block reference.
 869   Function *F;
 870   ScalarEvolution *SE;
 871   const DataLayout *DL;
 872   TargetTransformInfo *TTI;
 873   TargetLibraryInfo *TLI;
 874   AliasAnalysis *AA;
 875   LoopInfo *LI;
 876   DominatorTree *DT;
 877   /// Instruction builder to construct the vectorized tree.
 878   IRBuilder<> Builder;
 879 };
 880
 881 #ifndef NDEBUG
 882 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
 883   SD.dump(os);
 884   return os;
 885 }
 886 #endif
 887
 888 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 889                         ArrayRef<Value *> UserIgnoreLst) {
 890   deleteTree();
 891   UserIgnoreList = UserIgnoreLst;
 892   if (!getSameType(Roots))
 893     return;
 894   buildTree_rec(Roots, 0);
 895
 896   // Collect the values that we need to extract from the tree.
 897   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
 898     TreeEntry *Entry = &VectorizableTree[EIdx];
 899
 900     // For each lane:
 901     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
 902       Value *Scalar = Entry->Scalars[Lane];
 903
 904       // No need to handle users of gathered values.
 905       if (Entry->NeedToGather)
 906         continue;
 907
 908       for (User *U : Scalar->users()) {
 909         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 910
 911         Instruction *UserInst = dyn_cast<Instruction>(U);
 912         if (!UserInst)
 913           continue;
 914
 915         // Skip in-tree scalars that become vectors
 916         if (ScalarToTreeEntry.count(U)) {
 917           int Idx = ScalarToTreeEntry[U];
 918           TreeEntry *UseEntry = &VectorizableTree[Idx];
 919           Value *UseScalar = UseEntry->Scalars[0];
 920           // Some in-tree scalars will remain as scalar in vectorized
 921           // instructions. If that is the case, the one in Lane 0 will
 922           // be used.
 923           if (UseScalar != U ||
 924               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
 925             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
 926                          << ".\n");
 927             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
 928             continue;
 929           }
 930         }
 931
 932         // Ignore users in the user ignore list.
 933         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
 934             UserIgnoreList.end())
 935           continue;
 936
 937         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
 938               Lane << " from " << *Scalar << ".\n");
 939         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
 940       }
 941     }
 942   }
 943 }
 944
 945
 946 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 947   bool SameTy = getSameType(VL); (void)SameTy;
 948   bool isAltShuffle = false;
 949   assert(SameTy && "Invalid types!");
 950
 951   if (Depth == RecursionMaxDepth) {
 952     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
 953     newTreeEntry(VL, false);
 954     return;
 955   }
 956
 957   // Don't handle vectors.
 958   if (VL[0]->getType()->isVectorTy()) {
 959     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
 960     newTreeEntry(VL, false);
 961     return;
 962   }
 963
 964   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
 965     if (SI->getValueOperand()->getType()->isVectorTy()) {
 966       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
 967       newTreeEntry(VL, false);
 968       return;
 969     }
 970   unsigned Opcode = getSameOpcode(VL);
 971
 972   // Check that this shuffle vector refers to the alternate
 973   // sequence of opcodes.
 974   if (Opcode == Instruction::ShuffleVector) {
 975     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 976     unsigned Op = I0->getOpcode();
 977     if (Op != Instruction::ShuffleVector)
 978       isAltShuffle = true;
 979   }
 980
 981   // If all of the operands are identical or constant we have a simple solution.
 982   if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
 983     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
 984     newTreeEntry(VL, false);
 985     return;
 986   }
 987
 988   // We now know that this is a vector of instructions of the same type from
 989   // the same block.
 990
 991   // Check if this is a duplicate of another entry.
 992   if (ScalarToTreeEntry.count(VL[0])) {
 993     int Idx = ScalarToTreeEntry[VL[0]];
 994     TreeEntry *E = &VectorizableTree[Idx];
 995     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
 996       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
 997       if (E->Scalars[i] != VL[i]) {
 998         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
 999         newTreeEntry(VL, false);
1000         return;
1001       }
1002     }
1003     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1004     return;
1005   }
1006
1007   // Check that none of the instructions in the bundle are already in the tree.
1008   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1009     if (ScalarToTreeEntry.count(VL[i])) {
1010       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1011             ") is already in tree.\n");
1012       newTreeEntry(VL, false);
1013       return;
1014     }
1015   }
1016
1017   // If any of the scalars appears in the table OR it is marked as a value that
1018   // needs to stat scalar then we need to gather the scalars.
1019   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1020     if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
1021       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
1022       newTreeEntry(VL, false);
1023       return;
1024     }
1025   }
1026
1027   // Check that all of the users of the scalars that we want to vectorize are
1028   // schedulable.
1029   Instruction *VL0 = cast<Instruction>(VL[0]);
1030   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1031
1032   if (!DT->isReachableFromEntry(BB)) {
1033     // Don't go into unreachable blocks. They may contain instructions with
1034     // dependency cycles which confuse the final scheduling.
1035     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1036     newTreeEntry(VL, false);
1037     return;
1038   }
1039
1040   // Check that every instructions appears once in this bundle.
1041   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1042     for (unsigned j = i+1; j < e; ++j)
1043       if (VL[i] == VL[j]) {
1044         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1045         newTreeEntry(VL, false);
1046         return;
1047       }
1048
1049   auto &BSRef = BlocksSchedules[BB];
1050   if (!BSRef) {
1051     BSRef = llvm::make_unique<BlockScheduling>(BB);
1052   }
1053   BlockScheduling &BS = *BSRef.get();
1054
1055   if (!BS.tryScheduleBundle(VL, AA)) {
1056     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1057     BS.cancelScheduling(VL);
1058     newTreeEntry(VL, false);
1059     return;
1060   }
1061   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1062
1063   switch (Opcode) {
1064     case Instruction::PHI: {
1065       PHINode *PH = dyn_cast<PHINode>(VL0);
1066
1067       // Check for terminator values (e.g. invoke).
1068       for (unsigned j = 0; j < VL.size(); ++j)
1069         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1070           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1071               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1072           if (Term) {
1073             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1074             BS.cancelScheduling(VL);
1075             newTreeEntry(VL, false);
1076             return;
1077           }
1078         }
1079
1080       newTreeEntry(VL, true);
1081       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1082
1083       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1084         ValueList Operands;
1085         // Prepare the operand vector.
1086         for (unsigned j = 0; j < VL.size(); ++j)
1087           Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1088               PH->getIncomingBlock(i)));
1089
1090         buildTree_rec(Operands, Depth + 1);
1091       }
1092       return;
1093     }
1094     case Instruction::ExtractElement: {
1095       bool Reuse = CanReuseExtract(VL);
1096       if (Reuse) {
1097         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1098       } else {
1099         BS.cancelScheduling(VL);
1100       }
1101       newTreeEntry(VL, Reuse);
1102       return;
1103     }
1104     case Instruction::Load: {
1105       // Check if the loads are consecutive or of we need to swizzle them.
1106       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1107         LoadInst *L = cast<LoadInst>(VL[i]);
1108         if (!L->isSimple()) {
1109           BS.cancelScheduling(VL);
1110           newTreeEntry(VL, false);
1111           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1112           return;
1113         }
1114         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1115           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
1116             ++NumLoadsWantToChangeOrder;
1117           }
1118           BS.cancelScheduling(VL);
1119           newTreeEntry(VL, false);
1120           DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1121           return;
1122         }
1123       }
1124       ++NumLoadsWantToKeepOrder;
1125       newTreeEntry(VL, true);
1126       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1127       return;
1128     }
1129     case Instruction::ZExt:
1130     case Instruction::SExt:
1131     case Instruction::FPToUI:
1132     case Instruction::FPToSI:
1133     case Instruction::FPExt:
1134     case Instruction::PtrToInt:
1135     case Instruction::IntToPtr:
1136     case Instruction::SIToFP:
1137     case Instruction::UIToFP:
1138     case Instruction::Trunc:
1139     case Instruction::FPTrunc:
1140     case Instruction::BitCast: {
1141       Type *SrcTy = VL0->getOperand(0)->getType();
1142       for (unsigned i = 0; i < VL.size(); ++i) {
1143         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1144         if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
1145           BS.cancelScheduling(VL);
1146           newTreeEntry(VL, false);
1147           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1148           return;
1149         }
1150       }
1151       newTreeEntry(VL, true);
1152       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1153
1154       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1155         ValueList Operands;
1156         // Prepare the operand vector.
1157         for (unsigned j = 0; j < VL.size(); ++j)
1158           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1159
1160         buildTree_rec(Operands, Depth+1);
1161       }
1162       return;
1163     }
1164     case Instruction::ICmp:
1165     case Instruction::FCmp: {
1166       // Check that all of the compares have the same predicate.
1167       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
1168       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1169       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1170         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1171         if (Cmp->getPredicate() != P0 ||
1172             Cmp->getOperand(0)->getType() != ComparedTy) {
1173           BS.cancelScheduling(VL);
1174           newTreeEntry(VL, false);
1175           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1176           return;
1177         }
1178       }
1179
1180       newTreeEntry(VL, true);
1181       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1182
1183       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1184         ValueList Operands;
1185         // Prepare the operand vector.
1186         for (unsigned j = 0; j < VL.size(); ++j)
1187           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1188
1189         buildTree_rec(Operands, Depth+1);
1190       }
1191       return;
1192     }
1193     case Instruction::Select:
1194     case Instruction::Add:
1195     case Instruction::FAdd:
1196     case Instruction::Sub:
1197     case Instruction::FSub:
1198     case Instruction::Mul:
1199     case Instruction::FMul:
1200     case Instruction::UDiv:
1201     case Instruction::SDiv:
1202     case Instruction::FDiv:
1203     case Instruction::URem:
1204     case Instruction::SRem:
1205     case Instruction::FRem:
1206     case Instruction::Shl:
1207     case Instruction::LShr:
1208     case Instruction::AShr:
1209     case Instruction::And:
1210     case Instruction::Or:
1211     case Instruction::Xor: {
1212       newTreeEntry(VL, true);
1213       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1214
1215       // Sort operands of the instructions so that each side is more likely to
1216       // have the same opcode.
1217       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1218         ValueList Left, Right;
1219         reorderInputsAccordingToOpcode(VL, Left, Right);
1220         buildTree_rec(Left, Depth + 1);
1221         buildTree_rec(Right, Depth + 1);
1222         return;
1223       }
1224
1225       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1226         ValueList Operands;
1227         // Prepare the operand vector.
1228         for (unsigned j = 0; j < VL.size(); ++j)
1229           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1230
1231         buildTree_rec(Operands, Depth+1);
1232       }
1233       return;
1234     }
1235     case Instruction::GetElementPtr: {
1236       // We don't combine GEPs with complicated (nested) indexing.
1237       for (unsigned j = 0; j < VL.size(); ++j) {
1238         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1239           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1240           BS.cancelScheduling(VL);
1241           newTreeEntry(VL, false);
1242           return;
1243         }
1244       }
1245
1246       // We can't combine several GEPs into one vector if they operate on
1247       // different types.
1248       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1249       for (unsigned j = 0; j < VL.size(); ++j) {
1250         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1251         if (Ty0 != CurTy) {
1252           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1253           BS.cancelScheduling(VL);
1254           newTreeEntry(VL, false);
1255           return;
1256         }
1257       }
1258
1259       // We don't combine GEPs with non-constant indexes.
1260       for (unsigned j = 0; j < VL.size(); ++j) {
1261         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1262         if (!isa<ConstantInt>(Op)) {
1263           DEBUG(
1264               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1265           BS.cancelScheduling(VL);
1266           newTreeEntry(VL, false);
1267           return;
1268         }
1269       }
1270
1271       newTreeEntry(VL, true);
1272       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1273       for (unsigned i = 0, e = 2; i < e; ++i) {
1274         ValueList Operands;
1275         // Prepare the operand vector.
1276         for (unsigned j = 0; j < VL.size(); ++j)
1277           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1278
1279         buildTree_rec(Operands, Depth + 1);
1280       }
1281       return;
1282     }
1283     case Instruction::Store: {
1284       // Check if the stores are consecutive or of we need to swizzle them.
1285       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1286         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1287           BS.cancelScheduling(VL);
1288           newTreeEntry(VL, false);
1289           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1290           return;
1291         }
1292
1293       newTreeEntry(VL, true);
1294       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1295
1296       ValueList Operands;
1297       for (unsigned j = 0; j < VL.size(); ++j)
1298         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1299
1300       buildTree_rec(Operands, Depth + 1);
1301       return;
1302     }
1303     case Instruction::Call: {
1304       // Check if the calls are all to the same vectorizable intrinsic.
1305       CallInst *CI = cast<CallInst>(VL[0]);
1306       // Check if this is an Intrinsic call or something that can be
1307       // represented by an intrinsic call
1308       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1309       if (!isTriviallyVectorizable(ID)) {
1310         BS.cancelScheduling(VL);
1311         newTreeEntry(VL, false);
1312         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1313         return;
1314       }
1315       Function *Int = CI->getCalledFunction();
1316       Value *A1I = nullptr;
1317       if (hasVectorInstrinsicScalarOpd(ID, 1))
1318         A1I = CI->getArgOperand(1);
1319       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1320         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1321         if (!CI2 || CI2->getCalledFunction() != Int ||
1322             getIntrinsicIDForCall(CI2, TLI) != ID) {
1323           BS.cancelScheduling(VL);
1324           newTreeEntry(VL, false);
1325           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1326                        << "\n");
1327           return;
1328         }
1329         // ctlz,cttz and powi are special intrinsics whose second argument
1330         // should be same in order for them to be vectorized.
1331         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1332           Value *A1J = CI2->getArgOperand(1);
1333           if (A1I != A1J) {
1334             BS.cancelScheduling(VL);
1335             newTreeEntry(VL, false);
1336             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1337                          << " argument "<< A1I<<"!=" << A1J
1338                          << "\n");
1339             return;
1340           }
1341         }
1342       }
1343
1344       newTreeEntry(VL, true);
1345       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1346         ValueList Operands;
1347         // Prepare the operand vector.
1348         for (unsigned j = 0; j < VL.size(); ++j) {
1349           CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1350           Operands.push_back(CI2->getArgOperand(i));
1351         }
1352         buildTree_rec(Operands, Depth + 1);
1353       }
1354       return;
1355     }
1356     case Instruction::ShuffleVector: {
1357       // If this is not an alternate sequence of opcode like add-sub
1358       // then do not vectorize this instruction.
1359       if (!isAltShuffle) {
1360         BS.cancelScheduling(VL);
1361         newTreeEntry(VL, false);
1362         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1363         return;
1364       }
1365       newTreeEntry(VL, true);
1366       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1367       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1368         ValueList Operands;
1369         // Prepare the operand vector.
1370         for (unsigned j = 0; j < VL.size(); ++j)
1371           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1372
1373         buildTree_rec(Operands, Depth + 1);
1374       }
1375       return;
1376     }
1377     default:
1378       BS.cancelScheduling(VL);
1379       newTreeEntry(VL, false);
1380       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1381       return;
1382   }
1383 }
1384
1385 int BoUpSLP::getEntryCost(TreeEntry *E) {
1386   ArrayRef<Value*> VL = E->Scalars;
1387
1388   Type *ScalarTy = VL[0]->getType();
1389   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1390     ScalarTy = SI->getValueOperand()->getType();
1391   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1392
1393   if (E->NeedToGather) {
1394     if (allConstant(VL))
1395       return 0;
1396     if (isSplat(VL)) {
1397       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1398     }
1399     return getGatherCost(E->Scalars);
1400   }
1401   unsigned Opcode = getSameOpcode(VL);
1402   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1403   Instruction *VL0 = cast<Instruction>(VL[0]);
1404   switch (Opcode) {
1405     case Instruction::PHI: {
1406       return 0;
1407     }
1408     case Instruction::ExtractElement: {
1409       if (CanReuseExtract(VL)) {
1410         int DeadCost = 0;
1411         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1412           ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1413           if (E->hasOneUse())
1414             // Take credit for instruction that will become dead.
1415             DeadCost +=
1416                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1417         }
1418         return -DeadCost;
1419       }
1420       return getGatherCost(VecTy);
1421     }
1422     case Instruction::ZExt:
1423     case Instruction::SExt:
1424     case Instruction::FPToUI:
1425     case Instruction::FPToSI:
1426     case Instruction::FPExt:
1427     case Instruction::PtrToInt:
1428     case Instruction::IntToPtr:
1429     case Instruction::SIToFP:
1430     case Instruction::UIToFP:
1431     case Instruction::Trunc:
1432     case Instruction::FPTrunc:
1433     case Instruction::BitCast: {
1434       Type *SrcTy = VL0->getOperand(0)->getType();
1435
1436       // Calculate the cost of this instruction.
1437       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1438                                                          VL0->getType(), SrcTy);
1439
1440       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1441       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1442       return VecCost - ScalarCost;
1443     }
1444     case Instruction::FCmp:
1445     case Instruction::ICmp:
1446     case Instruction::Select:
1447     case Instruction::Add:
1448     case Instruction::FAdd:
1449     case Instruction::Sub:
1450     case Instruction::FSub:
1451     case Instruction::Mul:
1452     case Instruction::FMul:
1453     case Instruction::UDiv:
1454     case Instruction::SDiv:
1455     case Instruction::FDiv:
1456     case Instruction::URem:
1457     case Instruction::SRem:
1458     case Instruction::FRem:
1459     case Instruction::Shl:
1460     case Instruction::LShr:
1461     case Instruction::AShr:
1462     case Instruction::And:
1463     case Instruction::Or:
1464     case Instruction::Xor: {
1465       // Calculate the cost of this instruction.
1466       int ScalarCost = 0;
1467       int VecCost = 0;
1468       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1469           Opcode == Instruction::Select) {
1470         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1471         ScalarCost = VecTy->getNumElements() *
1472         TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1473         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1474       } else {
1475         // Certain instructions can be cheaper to vectorize if they have a
1476         // constant second vector operand.
1477         TargetTransformInfo::OperandValueKind Op1VK =
1478             TargetTransformInfo::OK_AnyValue;
1479         TargetTransformInfo::OperandValueKind Op2VK =
1480             TargetTransformInfo::OK_UniformConstantValue;
1481         TargetTransformInfo::OperandValueProperties Op1VP =
1482             TargetTransformInfo::OP_None;
1483         TargetTransformInfo::OperandValueProperties Op2VP =
1484             TargetTransformInfo::OP_None;
1485
1486         // If all operands are exactly the same ConstantInt then set the
1487         // operand kind to OK_UniformConstantValue.
1488         // If instead not all operands are constants, then set the operand kind
1489         // to OK_AnyValue. If all operands are constants but not the same,
1490         // then set the operand kind to OK_NonUniformConstantValue.
1491         ConstantInt *CInt = nullptr;
1492         for (unsigned i = 0; i < VL.size(); ++i) {
1493           const Instruction *I = cast<Instruction>(VL[i]);
1494           if (!isa<ConstantInt>(I->getOperand(1))) {
1495             Op2VK = TargetTransformInfo::OK_AnyValue;
1496             break;
1497           }
1498           if (i == 0) {
1499             CInt = cast<ConstantInt>(I->getOperand(1));
1500             continue;
1501           }
1502           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1503               CInt != cast<ConstantInt>(I->getOperand(1)))
1504             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1505         }
1506         // FIXME: Currently cost of model modification for division by
1507         // power of 2 is handled only for X86. Add support for other targets.
1508         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1509             CInt->getValue().isPowerOf2())
1510           Op2VP = TargetTransformInfo::OP_PowerOf2;
1511
1512         ScalarCost = VecTy->getNumElements() *
1513                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1514                                                  Op1VP, Op2VP);
1515         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1516                                               Op1VP, Op2VP);
1517       }
1518       return VecCost - ScalarCost;
1519     }
1520     case Instruction::GetElementPtr: {
1521       TargetTransformInfo::OperandValueKind Op1VK =
1522           TargetTransformInfo::OK_AnyValue;
1523       TargetTransformInfo::OperandValueKind Op2VK =
1524           TargetTransformInfo::OK_UniformConstantValue;
1525
1526       int ScalarCost =
1527           VecTy->getNumElements() *
1528           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1529       int VecCost =
1530           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1531
1532       return VecCost - ScalarCost;
1533     }
1534     case Instruction::Load: {
1535       // Cost of wide load - cost of scalar loads.
1536       int ScalarLdCost = VecTy->getNumElements() *
1537       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1538       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1539       return VecLdCost - ScalarLdCost;
1540     }
1541     case Instruction::Store: {
1542       // We know that we can merge the stores. Calculate the cost.
1543       int ScalarStCost = VecTy->getNumElements() *
1544       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1545       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1546       return VecStCost - ScalarStCost;
1547     }
1548     case Instruction::Call: {
1549       CallInst *CI = cast<CallInst>(VL0);
1550       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1551
1552       // Calculate the cost of the scalar and vector calls.
1553       SmallVector<Type*, 4> ScalarTys, VecTys;
1554       for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1555         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1556         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1557                                          VecTy->getNumElements()));
1558       }
1559
1560       int ScalarCallCost = VecTy->getNumElements() *
1561           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1562
1563       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1564
1565       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1566             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1567             << " for " << *CI << "\n");
1568
1569       return VecCallCost - ScalarCallCost;
1570     }
1571     case Instruction::ShuffleVector: {
1572       TargetTransformInfo::OperandValueKind Op1VK =
1573           TargetTransformInfo::OK_AnyValue;
1574       TargetTransformInfo::OperandValueKind Op2VK =
1575           TargetTransformInfo::OK_AnyValue;
1576       int ScalarCost = 0;
1577       int VecCost = 0;
1578       for (unsigned i = 0; i < VL.size(); ++i) {
1579         Instruction *I = cast<Instruction>(VL[i]);
1580         if (!I)
1581           break;
1582         ScalarCost +=
1583             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1584       }
1585       // VecCost is equal to sum of the cost of creating 2 vectors
1586       // and the cost of creating shuffle.
1587       Instruction *I0 = cast<Instruction>(VL[0]);
1588       VecCost =
1589           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1590       Instruction *I1 = cast<Instruction>(VL[1]);
1591       VecCost +=
1592           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1593       VecCost +=
1594           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1595       return VecCost - ScalarCost;
1596     }
1597     default:
1598       llvm_unreachable("Unknown instruction");
1599   }
1600 }
1601
1602 bool BoUpSLP::isFullyVectorizableTinyTree() {
1603   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1604         VectorizableTree.size() << " is fully vectorizable .\n");
1605
1606   // We only handle trees of height 2.
1607   if (VectorizableTree.size() != 2)
1608     return false;
1609
1610   // Handle splat stores.
1611   if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1612     return true;
1613
1614   // Gathering cost would be too much for tiny trees.
1615   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1616     return false;
1617
1618   return true;
1619 }
1620
1621 int BoUpSLP::getSpillCost() {
1622   // Walk from the bottom of the tree to the top, tracking which values are
1623   // live. When we see a call instruction that is not part of our tree,
1624   // query TTI to see if there is a cost to keeping values live over it
1625   // (for example, if spills and fills are required).
1626   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1627   int Cost = 0;
1628
1629   SmallPtrSet<Instruction*, 4> LiveValues;
1630   Instruction *PrevInst = nullptr;
1631
1632   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1633     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1634     if (!Inst)
1635       continue;
1636
1637     if (!PrevInst) {
1638       PrevInst = Inst;
1639       continue;
1640     }
1641
1642     DEBUG(
1643       dbgs() << "SLP: #LV: " << LiveValues.size();
1644       for (auto *X : LiveValues)
1645         dbgs() << " " << X->getName();
1646       dbgs() << ", Looking at ";
1647       Inst->dump();
1648       );
1649
1650     // Update LiveValues.
1651     LiveValues.erase(PrevInst);
1652     for (auto &J : PrevInst->operands()) {
1653       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1654         LiveValues.insert(cast<Instruction>(&*J));
1655     }
1656
1657     // Now find the sequence of instructions between PrevInst and Inst.
1658     BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1659     --PrevInstIt;
1660     while (InstIt != PrevInstIt) {
1661       if (PrevInstIt == PrevInst->getParent()->rend()) {
1662         PrevInstIt = Inst->getParent()->rbegin();
1663         continue;
1664       }
1665
1666       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1667         SmallVector<Type*, 4> V;
1668         for (auto *II : LiveValues)
1669           V.push_back(VectorType::get(II->getType(), BundleWidth));
1670         Cost += TTI->getCostOfKeepingLiveOverCall(V);
1671       }
1672
1673       ++PrevInstIt;
1674     }
1675
1676     PrevInst = Inst;
1677   }
1678
1679   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1680   return Cost;
1681 }
1682
1683 int BoUpSLP::getTreeCost() {
1684   int Cost = 0;
1685   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1686         VectorizableTree.size() << ".\n");
1687
1688   // We only vectorize tiny trees if it is fully vectorizable.
1689   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1690     if (!VectorizableTree.size()) {
1691       assert(!ExternalUses.size() && "We should not have any external users");
1692     }
1693     return INT_MAX;
1694   }
1695
1696   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1697
1698   for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1699     int C = getEntryCost(&VectorizableTree[i]);
1700     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1701           << *VectorizableTree[i].Scalars[0] << " .\n");
1702     Cost += C;
1703   }
1704
1705   SmallSet<Value *, 16> ExtractCostCalculated;
1706   int ExtractCost = 0;
1707   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1708        I != E; ++I) {
1709     // We only add extract cost once for the same scalar.
1710     if (!ExtractCostCalculated.insert(I->Scalar))
1711       continue;
1712
1713     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1714     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1715                                            I->Lane);
1716   }
1717
1718   Cost += getSpillCost();
1719
1720   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1721   return  Cost + ExtractCost;
1722 }
1723
1724 int BoUpSLP::getGatherCost(Type *Ty) {
1725   int Cost = 0;
1726   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1727     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1728   return Cost;
1729 }
1730
1731 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1732   // Find the type of the operands in VL.
1733   Type *ScalarTy = VL[0]->getType();
1734   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1735     ScalarTy = SI->getValueOperand()->getType();
1736   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1737   // Find the cost of inserting/extracting values from the vector.
1738   return getGatherCost(VecTy);
1739 }
1740
1741 Value *BoUpSLP::getPointerOperand(Value *I) {
1742   if (LoadInst *LI = dyn_cast<LoadInst>(I))
1743     return LI->getPointerOperand();
1744   if (StoreInst *SI = dyn_cast<StoreInst>(I))
1745     return SI->getPointerOperand();
1746   return nullptr;
1747 }
1748
1749 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1750   if (LoadInst *L = dyn_cast<LoadInst>(I))
1751     return L->getPointerAddressSpace();
1752   if (StoreInst *S = dyn_cast<StoreInst>(I))
1753     return S->getPointerAddressSpace();
1754   return -1;
1755 }
1756
1757 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
1758   Value *PtrA = getPointerOperand(A);
1759   Value *PtrB = getPointerOperand(B);
1760   unsigned ASA = getAddressSpaceOperand(A);
1761   unsigned ASB = getAddressSpaceOperand(B);
1762
1763   // Check that the address spaces match and that the pointers are valid.
1764   if (!PtrA || !PtrB || (ASA != ASB))
1765     return false;
1766
1767   // Make sure that A and B are different pointers of the same type.
1768   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1769     return false;
1770
1771   unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
1772   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1773   APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
1774
1775   APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1776   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
1777   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
1778
1779   APInt OffsetDelta = OffsetB - OffsetA;
1780
1781   // Check if they are based on the same pointer. That makes the offsets
1782   // sufficient.
1783   if (PtrA == PtrB)
1784     return OffsetDelta == Size;
1785
1786   // Compute the necessary base pointer delta to have the necessary final delta
1787   // equal to the size.
1788   APInt BaseDelta = Size - OffsetDelta;
1789
1790   // Otherwise compute the distance with SCEV between the base pointers.
1791   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1792   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1793   const SCEV *C = SE->getConstant(BaseDelta);
1794   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1795   return X == PtrSCEVB;
1796 }
1797
1798 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
1799   Instruction *VL0 = cast<Instruction>(VL[0]);
1800   BasicBlock::iterator NextInst = VL0;
1801   ++NextInst;
1802   Builder.SetInsertPoint(VL0->getParent(), NextInst);
1803   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
1804 }
1805
1806 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
1807   Value *Vec = UndefValue::get(Ty);
1808   // Generate the 'InsertElement' instruction.
1809   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
1810     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
1811     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
1812       GatherSeq.insert(Insrt);
1813       CSEBlocks.insert(Insrt->getParent());
1814
1815       // Add to our 'need-to-extract' list.
1816       if (ScalarToTreeEntry.count(VL[i])) {
1817         int Idx = ScalarToTreeEntry[VL[i]];
1818         TreeEntry *E = &VectorizableTree[Idx];
1819         // Find which lane we need to extract.
1820         int FoundLane = -1;
1821         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
1822           // Is this the lane of the scalar that we are looking for ?
1823           if (E->Scalars[Lane] == VL[i]) {
1824             FoundLane = Lane;
1825             break;
1826           }
1827         }
1828         assert(FoundLane >= 0 && "Could not find the correct lane");
1829         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
1830       }
1831     }
1832   }
1833
1834   return Vec;
1835 }
1836
1837 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
1838   SmallDenseMap<Value*, int>::const_iterator Entry
1839     = ScalarToTreeEntry.find(VL[0]);
1840   if (Entry != ScalarToTreeEntry.end()) {
1841     int Idx = Entry->second;
1842     const TreeEntry *En = &VectorizableTree[Idx];
1843     if (En->isSame(VL) && En->VectorizedValue)
1844       return En->VectorizedValue;
1845   }
1846   return nullptr;
1847 }
1848
1849 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
1850   if (ScalarToTreeEntry.count(VL[0])) {
1851     int Idx = ScalarToTreeEntry[VL[0]];
1852     TreeEntry *E = &VectorizableTree[Idx];
1853     if (E->isSame(VL))
1854       return vectorizeTree(E);
1855   }
1856
1857   Type *ScalarTy = VL[0]->getType();
1858   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1859     ScalarTy = SI->getValueOperand()->getType();
1860   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1861
1862   return Gather(VL, VecTy);
1863 }
1864
1865 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1866   IRBuilder<>::InsertPointGuard Guard(Builder);
1867
1868   if (E->VectorizedValue) {
1869     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
1870     return E->VectorizedValue;
1871   }
1872
1873   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
1874   Type *ScalarTy = VL0->getType();
1875   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
1876     ScalarTy = SI->getValueOperand()->getType();
1877   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
1878
1879   if (E->NeedToGather) {
1880     setInsertPointAfterBundle(E->Scalars);
1881     return Gather(E->Scalars, VecTy);
1882   }
1883
1884   unsigned Opcode = getSameOpcode(E->Scalars);
1885
1886   switch (Opcode) {
1887     case Instruction::PHI: {
1888       PHINode *PH = dyn_cast<PHINode>(VL0);
1889       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
1890       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1891       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
1892       E->VectorizedValue = NewPhi;
1893
1894       // PHINodes may have multiple entries from the same block. We want to
1895       // visit every block once.
1896       SmallSet<BasicBlock*, 4> VisitedBBs;
1897
1898       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1899         ValueList Operands;
1900         BasicBlock *IBB = PH->getIncomingBlock(i);
1901
1902         if (!VisitedBBs.insert(IBB)) {
1903           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
1904           continue;
1905         }
1906
1907         // Prepare the operand vector.
1908         for (unsigned j = 0; j < E->Scalars.size(); ++j)
1909           Operands.push_back(cast<PHINode>(E->Scalars[j])->
1910                              getIncomingValueForBlock(IBB));
1911
1912         Builder.SetInsertPoint(IBB->getTerminator());
1913         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1914         Value *Vec = vectorizeTree(Operands);
1915         NewPhi->addIncoming(Vec, IBB);
1916       }
1917
1918       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
1919              "Invalid number of incoming values");
1920       return NewPhi;
1921     }
1922
1923     case Instruction::ExtractElement: {
1924       if (CanReuseExtract(E->Scalars)) {
1925         Value *V = VL0->getOperand(0);
1926         E->VectorizedValue = V;
1927         return V;
1928       }
1929       return Gather(E->Scalars, VecTy);
1930     }
1931     case Instruction::ZExt:
1932     case Instruction::SExt:
1933     case Instruction::FPToUI:
1934     case Instruction::FPToSI:
1935     case Instruction::FPExt:
1936     case Instruction::PtrToInt:
1937     case Instruction::IntToPtr:
1938     case Instruction::SIToFP:
1939     case Instruction::UIToFP:
1940     case Instruction::Trunc:
1941     case Instruction::FPTrunc:
1942     case Instruction::BitCast: {
1943       ValueList INVL;
1944       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
1945         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
1946
1947       setInsertPointAfterBundle(E->Scalars);
1948
1949       Value *InVec = vectorizeTree(INVL);
1950
1951       if (Value *V = alreadyVectorized(E->Scalars))
1952         return V;
1953
1954       CastInst *CI = dyn_cast<CastInst>(VL0);
1955       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
1956       E->VectorizedValue = V;
1957       ++NumVectorInstructions;
1958       return V;
1959     }
1960     case Instruction::FCmp:
1961     case Instruction::ICmp: {
1962       ValueList LHSV, RHSV;
1963       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
1964         LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
1965         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
1966       }
1967
1968       setInsertPointAfterBundle(E->Scalars);
1969
1970       Value *L = vectorizeTree(LHSV);
1971       Value *R = vectorizeTree(RHSV);
1972
1973       if (Value *V = alreadyVectorized(E->Scalars))
1974         return V;
1975
1976       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
1977       Value *V;
1978       if (Opcode == Instruction::FCmp)
1979         V = Builder.CreateFCmp(P0, L, R);
1980       else
1981         V = Builder.CreateICmp(P0, L, R);
1982
1983       E->VectorizedValue = V;
1984       ++NumVectorInstructions;
1985       return V;
1986     }
1987     case Instruction::Select: {
1988       ValueList TrueVec, FalseVec, CondVec;
1989       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
1990         CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
1991         TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
1992         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
1993       }
1994
1995       setInsertPointAfterBundle(E->Scalars);
1996
1997       Value *Cond = vectorizeTree(CondVec);
1998       Value *True = vectorizeTree(TrueVec);
1999       Value *False = vectorizeTree(FalseVec);
2000
2001       if (Value *V = alreadyVectorized(E->Scalars))
2002         return V;
2003
2004       Value *V = Builder.CreateSelect(Cond, True, False);
2005       E->VectorizedValue = V;
2006       ++NumVectorInstructions;
2007       return V;
2008     }
2009     case Instruction::Add:
2010     case Instruction::FAdd:
2011     case Instruction::Sub:
2012     case Instruction::FSub:
2013     case Instruction::Mul:
2014     case Instruction::FMul:
2015     case Instruction::UDiv:
2016     case Instruction::SDiv:
2017     case Instruction::FDiv:
2018     case Instruction::URem:
2019     case Instruction::SRem:
2020     case Instruction::FRem:
2021     case Instruction::Shl:
2022     case Instruction::LShr:
2023     case Instruction::AShr:
2024     case Instruction::And:
2025     case Instruction::Or:
2026     case Instruction::Xor: {
2027       ValueList LHSVL, RHSVL;
2028       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2029         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2030       else
2031         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2032           LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2033           RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2034         }
2035
2036       setInsertPointAfterBundle(E->Scalars);
2037
2038       Value *LHS = vectorizeTree(LHSVL);
2039       Value *RHS = vectorizeTree(RHSVL);
2040
2041       if (LHS == RHS && isa<Instruction>(LHS)) {
2042         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2043       }
2044
2045       if (Value *V = alreadyVectorized(E->Scalars))
2046         return V;
2047
2048       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2049       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2050       E->VectorizedValue = V;
2051       propagateIRFlags(E->VectorizedValue, E->Scalars);
2052       ++NumVectorInstructions;
2053
2054       if (Instruction *I = dyn_cast<Instruction>(V))
2055         return propagateMetadata(I, E->Scalars);
2056
2057       return V;
2058     }
2059     case Instruction::Load: {
2060       // Loads are inserted at the head of the tree because we don't want to
2061       // sink them all the way down past store instructions.
2062       setInsertPointAfterBundle(E->Scalars);
2063
2064       LoadInst *LI = cast<LoadInst>(VL0);
2065       Type *ScalarLoadTy = LI->getType();
2066       unsigned AS = LI->getPointerAddressSpace();
2067
2068       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2069                                             VecTy->getPointerTo(AS));
2070
2071       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2072       // ExternalUses list to make sure that an extract will be generated in the
2073       // future.
2074       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2075         ExternalUses.push_back(
2076             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2077
2078       unsigned Alignment = LI->getAlignment();
2079       LI = Builder.CreateLoad(VecPtr);
2080       if (!Alignment)
2081         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2082       LI->setAlignment(Alignment);
2083       E->VectorizedValue = LI;
2084       ++NumVectorInstructions;
2085       return propagateMetadata(LI, E->Scalars);
2086     }
2087     case Instruction::Store: {
2088       StoreInst *SI = cast<StoreInst>(VL0);
2089       unsigned Alignment = SI->getAlignment();
2090       unsigned AS = SI->getPointerAddressSpace();
2091
2092       ValueList ValueOp;
2093       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2094         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
2095
2096       setInsertPointAfterBundle(E->Scalars);
2097
2098       Value *VecValue = vectorizeTree(ValueOp);
2099       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2100                                             VecTy->getPointerTo(AS));
2101       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2102
2103       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2104       // ExternalUses list to make sure that an extract will be generated in the
2105       // future.
2106       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2107         ExternalUses.push_back(
2108             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2109
2110       if (!Alignment)
2111         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2112       S->setAlignment(Alignment);
2113       E->VectorizedValue = S;
2114       ++NumVectorInstructions;
2115       return propagateMetadata(S, E->Scalars);
2116     }
2117     case Instruction::GetElementPtr: {
2118       setInsertPointAfterBundle(E->Scalars);
2119
2120       ValueList Op0VL;
2121       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2122         Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
2123
2124       Value *Op0 = vectorizeTree(Op0VL);
2125
2126       std::vector<Value *> OpVecs;
2127       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2128            ++j) {
2129         ValueList OpVL;
2130         for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2131           OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
2132
2133         Value *OpVec = vectorizeTree(OpVL);
2134         OpVecs.push_back(OpVec);
2135       }
2136
2137       Value *V = Builder.CreateGEP(Op0, OpVecs);
2138       E->VectorizedValue = V;
2139       ++NumVectorInstructions;
2140
2141       if (Instruction *I = dyn_cast<Instruction>(V))
2142         return propagateMetadata(I, E->Scalars);
2143
2144       return V;
2145     }
2146     case Instruction::Call: {
2147       CallInst *CI = cast<CallInst>(VL0);
2148       setInsertPointAfterBundle(E->Scalars);
2149       Function *FI;
2150       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2151       Value *ScalarArg = nullptr;
2152       if (CI && (FI = CI->getCalledFunction())) {
2153         IID = (Intrinsic::ID) FI->getIntrinsicID();
2154       }
2155       std::vector<Value *> OpVecs;
2156       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2157         ValueList OpVL;
2158         // ctlz,cttz and powi are special intrinsics whose second argument is
2159         // a scalar. This argument should not be vectorized.
2160         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2161           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2162           ScalarArg = CEI->getArgOperand(j);
2163           OpVecs.push_back(CEI->getArgOperand(j));
2164           continue;
2165         }
2166         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2167           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
2168           OpVL.push_back(CEI->getArgOperand(j));
2169         }
2170
2171         Value *OpVec = vectorizeTree(OpVL);
2172         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2173         OpVecs.push_back(OpVec);
2174       }
2175
2176       Module *M = F->getParent();
2177       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2178       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2179       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2180       Value *V = Builder.CreateCall(CF, OpVecs);
2181
2182       // The scalar argument uses an in-tree scalar so we add the new vectorized
2183       // call to ExternalUses list to make sure that an extract will be
2184       // generated in the future.
2185       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2186         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2187
2188       E->VectorizedValue = V;
2189       ++NumVectorInstructions;
2190       return V;
2191     }
2192     case Instruction::ShuffleVector: {
2193       ValueList LHSVL, RHSVL;
2194       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2195         LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2196         RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2197       }
2198       setInsertPointAfterBundle(E->Scalars);
2199
2200       Value *LHS = vectorizeTree(LHSVL);
2201       Value *RHS = vectorizeTree(RHSVL);
2202
2203       if (Value *V = alreadyVectorized(E->Scalars))
2204         return V;
2205
2206       // Create a vector of LHS op1 RHS
2207       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2208       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2209
2210       // Create a vector of LHS op2 RHS
2211       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2212       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2213       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2214
2215       // Create shuffle to take alternate operations from the vector.
2216       // Also, gather up odd and even scalar ops to propagate IR flags to
2217       // each vector operation.
2218       ValueList OddScalars, EvenScalars;
2219       unsigned e = E->Scalars.size();
2220       SmallVector<Constant *, 8> Mask(e);
2221       for (unsigned i = 0; i < e; ++i) {
2222         if (i & 1) {
2223           Mask[i] = Builder.getInt32(e + i);
2224           OddScalars.push_back(E->Scalars[i]);
2225         } else {
2226           Mask[i] = Builder.getInt32(i);
2227           EvenScalars.push_back(E->Scalars[i]);
2228         }
2229       }
2230
2231       Value *ShuffleMask = ConstantVector::get(Mask);
2232       propagateIRFlags(V0, EvenScalars);
2233       propagateIRFlags(V1, OddScalars);
2234
2235       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2236       E->VectorizedValue = V;
2237       ++NumVectorInstructions;
2238       if (Instruction *I = dyn_cast<Instruction>(V))
2239         return propagateMetadata(I, E->Scalars);
2240
2241       return V;
2242     }
2243     default:
2244     llvm_unreachable("unknown inst");
2245   }
2246   return nullptr;
2247 }
2248
2249 Value *BoUpSLP::vectorizeTree() {
2250
2251   // All blocks must be scheduled before any instructions are inserted.
2252   for (auto &BSIter : BlocksSchedules) {
2253     scheduleBlock(BSIter.second.get());
2254   }
2255
2256   Builder.SetInsertPoint(F->getEntryBlock().begin());
2257   vectorizeTree(&VectorizableTree[0]);
2258
2259   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2260
2261   // Extract all of the elements with the external uses.
2262   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2263        it != e; ++it) {
2264     Value *Scalar = it->Scalar;
2265     llvm::User *User = it->User;
2266
2267     // Skip users that we already RAUW. This happens when one instruction
2268     // has multiple uses of the same value.
2269     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2270         Scalar->user_end())
2271       continue;
2272     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2273
2274     int Idx = ScalarToTreeEntry[Scalar];
2275     TreeEntry *E = &VectorizableTree[Idx];
2276     assert(!E->NeedToGather && "Extracting from a gather list");
2277
2278     Value *Vec = E->VectorizedValue;
2279     assert(Vec && "Can't find vectorizable value");
2280
2281     Value *Lane = Builder.getInt32(it->Lane);
2282     // Generate extracts for out-of-tree users.
2283     // Find the insertion point for the extractelement lane.
2284     if (isa<Instruction>(Vec)){
2285       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2286         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2287           if (PH->getIncomingValue(i) == Scalar) {
2288             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2289             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2290             CSEBlocks.insert(PH->getIncomingBlock(i));
2291             PH->setOperand(i, Ex);
2292           }
2293         }
2294       } else {
2295         Builder.SetInsertPoint(cast<Instruction>(User));
2296         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2297         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2298         User->replaceUsesOfWith(Scalar, Ex);
2299      }
2300     } else {
2301       Builder.SetInsertPoint(F->getEntryBlock().begin());
2302       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2303       CSEBlocks.insert(&F->getEntryBlock());
2304       User->replaceUsesOfWith(Scalar, Ex);
2305     }
2306
2307     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2308   }
2309
2310   // For each vectorized value:
2311   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2312     TreeEntry *Entry = &VectorizableTree[EIdx];
2313
2314     // For each lane:
2315     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2316       Value *Scalar = Entry->Scalars[Lane];
2317       // No need to handle users of gathered values.
2318       if (Entry->NeedToGather)
2319         continue;
2320
2321       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2322
2323       Type *Ty = Scalar->getType();
2324       if (!Ty->isVoidTy()) {
2325 #ifndef NDEBUG
2326         for (User *U : Scalar->users()) {
2327           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2328
2329           assert((ScalarToTreeEntry.count(U) ||
2330                   // It is legal to replace users in the ignorelist by undef.
2331                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2332                    UserIgnoreList.end())) &&
2333                  "Replacing out-of-tree value with undef");
2334         }
2335 #endif
2336         Value *Undef = UndefValue::get(Ty);
2337         Scalar->replaceAllUsesWith(Undef);
2338       }
2339       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2340       cast<Instruction>(Scalar)->eraseFromParent();
2341     }
2342   }
2343
2344   Builder.ClearInsertionPoint();
2345
2346   return VectorizableTree[0].VectorizedValue;
2347 }
2348
2349 void BoUpSLP::optimizeGatherSequence() {
2350   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2351         << " gather sequences instructions.\n");
2352   // LICM InsertElementInst sequences.
2353   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2354        e = GatherSeq.end(); it != e; ++it) {
2355     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2356
2357     if (!Insert)
2358       continue;
2359
2360     // Check if this block is inside a loop.
2361     Loop *L = LI->getLoopFor(Insert->getParent());
2362     if (!L)
2363       continue;
2364
2365     // Check if it has a preheader.
2366     BasicBlock *PreHeader = L->getLoopPreheader();
2367     if (!PreHeader)
2368       continue;
2369
2370     // If the vector or the element that we insert into it are
2371     // instructions that are defined in this basic block then we can't
2372     // hoist this instruction.
2373     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2374     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2375     if (CurrVec && L->contains(CurrVec))
2376       continue;
2377     if (NewElem && L->contains(NewElem))
2378       continue;
2379
2380     // We can hoist this instruction. Move it to the pre-header.
2381     Insert->moveBefore(PreHeader->getTerminator());
2382   }
2383
2384   // Make a list of all reachable blocks in our CSE queue.
2385   SmallVector<const DomTreeNode *, 8> CSEWorkList;
2386   CSEWorkList.reserve(CSEBlocks.size());
2387   for (BasicBlock *BB : CSEBlocks)
2388     if (DomTreeNode *N = DT->getNode(BB)) {
2389       assert(DT->isReachableFromEntry(N));
2390       CSEWorkList.push_back(N);
2391     }
2392
2393   // Sort blocks by domination. This ensures we visit a block after all blocks
2394   // dominating it are visited.
2395   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2396                    [this](const DomTreeNode *A, const DomTreeNode *B) {
2397     return DT->properlyDominates(A, B);
2398   });
2399
2400   // Perform O(N^2) search over the gather sequences and merge identical
2401   // instructions. TODO: We can further optimize this scan if we split the
2402   // instructions into different buckets based on the insert lane.
2403   SmallVector<Instruction *, 16> Visited;
2404   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2405     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2406            "Worklist not sorted properly!");
2407     BasicBlock *BB = (*I)->getBlock();
2408     // For all instructions in blocks containing gather sequences:
2409     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2410       Instruction *In = it++;
2411       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2412         continue;
2413
2414       // Check if we can replace this instruction with any of the
2415       // visited instructions.
2416       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2417                                                     ve = Visited.end();
2418            v != ve; ++v) {
2419         if (In->isIdenticalTo(*v) &&
2420             DT->dominates((*v)->getParent(), In->getParent())) {
2421           In->replaceAllUsesWith(*v);
2422           In->eraseFromParent();
2423           In = nullptr;
2424           break;
2425         }
2426       }
2427       if (In) {
2428         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2429         Visited.push_back(In);
2430       }
2431     }
2432   }
2433   CSEBlocks.clear();
2434   GatherSeq.clear();
2435 }
2436
2437 // Groups the instructions to a bundle (which is then a single scheduling entity)
2438 // and schedules instructions until the bundle gets ready.
2439 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2440                                                  AliasAnalysis *AA) {
2441   if (isa<PHINode>(VL[0]))
2442     return true;
2443
2444   // Initialize the instruction bundle.
2445   Instruction *OldScheduleEnd = ScheduleEnd;
2446   ScheduleData *PrevInBundle = nullptr;
2447   ScheduleData *Bundle = nullptr;
2448   bool ReSchedule = false;
2449   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
2450   for (Value *V : VL) {
2451     extendSchedulingRegion(V);
2452     ScheduleData *BundleMember = getScheduleData(V);
2453     assert(BundleMember &&
2454            "no ScheduleData for bundle member (maybe not in same basic block)");
2455     if (BundleMember->IsScheduled) {
2456       // A bundle member was scheduled as single instruction before and now
2457       // needs to be scheduled as part of the bundle. We just get rid of the
2458       // existing schedule.
2459       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
2460                    << " was already scheduled\n");
2461       ReSchedule = true;
2462     }
2463     assert(BundleMember->isSchedulingEntity() &&
2464            "bundle member already part of other bundle");
2465     if (PrevInBundle) {
2466       PrevInBundle->NextInBundle = BundleMember;
2467     } else {
2468       Bundle = BundleMember;
2469     }
2470     BundleMember->UnscheduledDepsInBundle = 0;
2471     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2472
2473     // Group the instructions to a bundle.
2474     BundleMember->FirstInBundle = Bundle;
2475     PrevInBundle = BundleMember;
2476   }
2477   if (ScheduleEnd != OldScheduleEnd) {
2478     // The scheduling region got new instructions at the lower end (or it is a
2479     // new region for the first bundle). This makes it necessary to
2480     // recalculate all dependencies.
2481     // It is seldom that this needs to be done a second time after adding the
2482     // initial bundle to the region.
2483     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2484       ScheduleData *SD = getScheduleData(I);
2485       SD->clearDependencies();
2486     }
2487     ReSchedule = true;
2488   }
2489   if (ReSchedule) {
2490     resetSchedule();
2491     initialFillReadyList(ReadyInsts);
2492   }
2493
2494   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2495                << BB->getName() << "\n");
2496
2497   calculateDependencies(Bundle, true, AA);
2498
2499   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2500   // means that there are no cyclic dependencies and we can schedule it.
2501   // Note that's important that we don't "schedule" the bundle yet (see
2502   // cancelScheduling).
2503   while (!Bundle->isReady() && !ReadyInsts.empty()) {
2504
2505     ScheduleData *pickedSD = ReadyInsts.back();
2506     ReadyInsts.pop_back();
2507
2508     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2509       schedule(pickedSD, ReadyInsts);
2510     }
2511   }
2512   return Bundle->isReady();
2513 }
2514
2515 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2516   if (isa<PHINode>(VL[0]))
2517     return;
2518
2519   ScheduleData *Bundle = getScheduleData(VL[0]);
2520   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
2521   assert(!Bundle->IsScheduled &&
2522          "Can't cancel bundle which is already scheduled");
2523   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2524          "tried to unbundle something which is not a bundle");
2525
2526   // Un-bundle: make single instructions out of the bundle.
2527   ScheduleData *BundleMember = Bundle;
2528   while (BundleMember) {
2529     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2530     BundleMember->FirstInBundle = BundleMember;
2531     ScheduleData *Next = BundleMember->NextInBundle;
2532     BundleMember->NextInBundle = nullptr;
2533     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2534     if (BundleMember->UnscheduledDepsInBundle == 0) {
2535       ReadyInsts.insert(BundleMember);
2536     }
2537     BundleMember = Next;
2538   }
2539 }
2540
2541 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2542   if (getScheduleData(V))
2543     return;
2544   Instruction *I = dyn_cast<Instruction>(V);
2545   assert(I && "bundle member must be an instruction");
2546   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2547   if (!ScheduleStart) {
2548     // It's the first instruction in the new region.
2549     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2550     ScheduleStart = I;
2551     ScheduleEnd = I->getNextNode();
2552     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2553     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
2554     return;
2555   }
2556   // Search up and down at the same time, because we don't know if the new
2557   // instruction is above or below the existing scheduling region.
2558   BasicBlock::reverse_iterator UpIter(ScheduleStart);
2559   BasicBlock::reverse_iterator UpperEnd = BB->rend();
2560   BasicBlock::iterator DownIter(ScheduleEnd);
2561   BasicBlock::iterator LowerEnd = BB->end();
2562   for (;;) {
2563     if (UpIter != UpperEnd) {
2564       if (&*UpIter == I) {
2565         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2566         ScheduleStart = I;
2567         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
2568         return;
2569       }
2570       UpIter++;
2571     }
2572     if (DownIter != LowerEnd) {
2573       if (&*DownIter == I) {
2574         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2575                          nullptr);
2576         ScheduleEnd = I->getNextNode();
2577         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2578         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
2579         return;
2580       }
2581       DownIter++;
2582     }
2583     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2584            "instruction not found in block");
2585   }
2586 }
2587
2588 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2589                                                 Instruction *ToI,
2590                                                 ScheduleData *PrevLoadStore,
2591                                                 ScheduleData *NextLoadStore) {
2592   ScheduleData *CurrentLoadStore = PrevLoadStore;
2593   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2594     ScheduleData *SD = ScheduleDataMap[I];
2595     if (!SD) {
2596       // Allocate a new ScheduleData for the instruction.
2597       if (ChunkPos >= ChunkSize) {
2598         ScheduleDataChunks.push_back(
2599             llvm::make_unique<ScheduleData[]>(ChunkSize));
2600         ChunkPos = 0;
2601       }
2602       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2603       ScheduleDataMap[I] = SD;
2604       SD->Inst = I;
2605     }
2606     assert(!isInSchedulingRegion(SD) &&
2607            "new ScheduleData already in scheduling region");
2608     SD->init(SchedulingRegionID);
2609
2610     if (I->mayReadOrWriteMemory()) {
2611       // Update the linked list of memory accessing instructions.
2612       if (CurrentLoadStore) {
2613         CurrentLoadStore->NextLoadStore = SD;
2614       } else {
2615         FirstLoadStoreInRegion = SD;
2616       }
2617       CurrentLoadStore = SD;
2618     }
2619   }
2620   if (NextLoadStore) {
2621     if (CurrentLoadStore)
2622       CurrentLoadStore->NextLoadStore = NextLoadStore;
2623   } else {
2624     LastLoadStoreInRegion = CurrentLoadStore;
2625   }
2626 }
2627
2628 /// \returns the AA location that is being access by the instruction.
2629 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
2630   if (StoreInst *SI = dyn_cast<StoreInst>(I))
2631     return AA->getLocation(SI);
2632   if (LoadInst *LI = dyn_cast<LoadInst>(I))
2633     return AA->getLocation(LI);
2634   return AliasAnalysis::Location();
2635 }
2636
2637 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2638                                                      bool InsertInReadyList,
2639                                                      AliasAnalysis *AA) {
2640   assert(SD->isSchedulingEntity());
2641
2642   SmallVector<ScheduleData *, 10> WorkList;
2643   WorkList.push_back(SD);
2644
2645   while (!WorkList.empty()) {
2646     ScheduleData *SD = WorkList.back();
2647     WorkList.pop_back();
2648
2649     ScheduleData *BundleMember = SD;
2650     while (BundleMember) {
2651       assert(isInSchedulingRegion(BundleMember));
2652       if (!BundleMember->hasValidDependencies()) {
2653
2654         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
2655         BundleMember->Dependencies = 0;
2656         BundleMember->resetUnscheduledDeps();
2657
2658         // Handle def-use chain dependencies.
2659         for (User *U : BundleMember->Inst->users()) {
2660           if (isa<Instruction>(U)) {
2661             ScheduleData *UseSD = getScheduleData(U);
2662             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2663               BundleMember->Dependencies++;
2664               ScheduleData *DestBundle = UseSD->FirstInBundle;
2665               if (!DestBundle->IsScheduled) {
2666                 BundleMember->incrementUnscheduledDeps(1);
2667               }
2668               if (!DestBundle->hasValidDependencies()) {
2669                 WorkList.push_back(DestBundle);
2670               }
2671             }
2672           } else {
2673             // I'm not sure if this can ever happen. But we need to be safe.
2674             // This lets the instruction/bundle never be scheduled and eventally
2675             // disable vectorization.
2676             BundleMember->Dependencies++;
2677             BundleMember->incrementUnscheduledDeps(1);
2678           }
2679         }
2680
2681         // Handle the memory dependencies.
2682         ScheduleData *DepDest = BundleMember->NextLoadStore;
2683         if (DepDest) {
2684           AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
2685           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2686
2687           while (DepDest) {
2688             assert(isInSchedulingRegion(DepDest));
2689             if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
2690               AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
2691               if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
2692                 DepDest->MemoryDependencies.push_back(BundleMember);
2693                 BundleMember->Dependencies++;
2694                 ScheduleData *DestBundle = DepDest->FirstInBundle;
2695                 if (!DestBundle->IsScheduled) {
2696                   BundleMember->incrementUnscheduledDeps(1);
2697                 }
2698                 if (!DestBundle->hasValidDependencies()) {
2699                   WorkList.push_back(DestBundle);
2700                 }
2701               }
2702             }
2703             DepDest = DepDest->NextLoadStore;
2704           }
2705         }
2706       }
2707       BundleMember = BundleMember->NextInBundle;
2708     }
2709     if (InsertInReadyList && SD->isReady()) {
2710       ReadyInsts.push_back(SD);
2711       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
2712     }
2713   }
2714 }
2715
2716 void BoUpSLP::BlockScheduling::resetSchedule() {
2717   assert(ScheduleStart &&
2718          "tried to reset schedule on block which has not been scheduled");
2719   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2720     ScheduleData *SD = getScheduleData(I);
2721     assert(isInSchedulingRegion(SD));
2722     SD->IsScheduled = false;
2723     SD->resetUnscheduledDeps();
2724   }
2725   ReadyInsts.clear();
2726 }
2727
2728 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2729
2730   if (!BS->ScheduleStart)
2731     return;
2732
2733   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2734
2735   BS->resetSchedule();
2736
2737   // For the real scheduling we use a more sophisticated ready-list: it is
2738   // sorted by the original instruction location. This lets the final schedule
2739   // be as  close as possible to the original instruction order.
2740   struct ScheduleDataCompare {
2741     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2742       return SD2->SchedulingPriority < SD1->SchedulingPriority;
2743     }
2744   };
2745   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2746
2747   // Ensure that all depencency data is updated and fill the ready-list with
2748   // initial instructions.
2749   int Idx = 0;
2750   int NumToSchedule = 0;
2751   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
2752        I = I->getNextNode()) {
2753     ScheduleData *SD = BS->getScheduleData(I);
2754     assert(
2755         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
2756         "scheduler and vectorizer have different opinion on what is a bundle");
2757     SD->FirstInBundle->SchedulingPriority = Idx++;
2758     if (SD->isSchedulingEntity()) {
2759       BS->calculateDependencies(SD, false, AA);
2760       NumToSchedule++;
2761     }
2762   }
2763   BS->initialFillReadyList(ReadyInsts);
2764
2765   Instruction *LastScheduledInst = BS->ScheduleEnd;
2766
2767   // Do the "real" scheduling.
2768   while (!ReadyInsts.empty()) {
2769     ScheduleData *picked = *ReadyInsts.begin();
2770     ReadyInsts.erase(ReadyInsts.begin());
2771
2772     // Move the scheduled instruction(s) to their dedicated places, if not
2773     // there yet.
2774     ScheduleData *BundleMember = picked;
2775     while (BundleMember) {
2776       Instruction *pickedInst = BundleMember->Inst;
2777       if (LastScheduledInst->getNextNode() != pickedInst) {
2778         BS->BB->getInstList().remove(pickedInst);
2779         BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
2780       }
2781       LastScheduledInst = pickedInst;
2782       BundleMember = BundleMember->NextInBundle;
2783     }
2784
2785     BS->schedule(picked, ReadyInsts);
2786     NumToSchedule--;
2787   }
2788   assert(NumToSchedule == 0 && "could not schedule all instructions");
2789
2790   // Avoid duplicate scheduling of the block.
2791   BS->ScheduleStart = nullptr;
2792 }
2793
2794 /// The SLPVectorizer Pass.
2795 struct SLPVectorizer : public FunctionPass {
2796   typedef SmallVector<StoreInst *, 8> StoreList;
2797   typedef MapVector<Value *, StoreList> StoreListMap;
2798
2799   /// Pass identification, replacement for typeid
2800   static char ID;
2801
2802   explicit SLPVectorizer() : FunctionPass(ID) {
2803     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
2804   }
2805
2806   ScalarEvolution *SE;
2807   const DataLayout *DL;
2808   TargetTransformInfo *TTI;
2809   TargetLibraryInfo *TLI;
2810   AliasAnalysis *AA;
2811   LoopInfo *LI;
2812   DominatorTree *DT;
2813
2814   bool runOnFunction(Function &F) override {
2815     if (skipOptnoneFunction(F))
2816       return false;
2817
2818     SE = &getAnalysis<ScalarEvolution>();
2819     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
2820     DL = DLP ? &DLP->getDataLayout() : nullptr;
2821     TTI = &getAnalysis<TargetTransformInfo>();
2822     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
2823     AA = &getAnalysis<AliasAnalysis>();
2824     LI = &getAnalysis<LoopInfo>();
2825     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2826
2827     StoreRefs.clear();
2828     bool Changed = false;
2829
2830     // If the target claims to have no vector registers don't attempt
2831     // vectorization.
2832     if (!TTI->getNumberOfRegisters(true))
2833       return false;
2834
2835     // Must have DataLayout. We can't require it because some tests run w/o
2836     // triple.
2837     if (!DL)
2838       return false;
2839
2840     // Don't vectorize when the attribute NoImplicitFloat is used.
2841     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
2842       return false;
2843
2844     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
2845
2846     // Use the bottom up slp vectorizer to construct chains that start with
2847     // store instructions.
2848     BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
2849
2850     // Scan the blocks in the function in post order.
2851     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
2852          e = po_end(&F.getEntryBlock()); it != e; ++it) {
2853       BasicBlock *BB = *it;
2854       // Vectorize trees that end at stores.
2855       if (unsigned count = collectStores(BB, R)) {
2856         (void)count;
2857         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
2858         Changed |= vectorizeStoreChains(R);
2859       }
2860
2861       // Vectorize trees that end at reductions.
2862       Changed |= vectorizeChainsInBlock(BB, R);
2863     }
2864
2865     if (Changed) {
2866       R.optimizeGatherSequence();
2867       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
2868       DEBUG(verifyFunction(F));
2869     }
2870     return Changed;
2871   }
2872
2873   void getAnalysisUsage(AnalysisUsage &AU) const override {
2874     FunctionPass::getAnalysisUsage(AU);
2875     AU.addRequired<ScalarEvolution>();
2876     AU.addRequired<AliasAnalysis>();
2877     AU.addRequired<TargetTransformInfo>();
2878     AU.addRequired<LoopInfo>();
2879     AU.addRequired<DominatorTreeWrapperPass>();
2880     AU.addPreserved<LoopInfo>();
2881     AU.addPreserved<DominatorTreeWrapperPass>();
2882     AU.setPreservesCFG();
2883   }
2884
2885 private:
2886
2887   /// \brief Collect memory references and sort them according to their base
2888   /// object. We sort the stores to their base objects to reduce the cost of the
2889   /// quadratic search on the stores. TODO: We can further reduce this cost
2890   /// if we flush the chain creation every time we run into a memory barrier.
2891   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
2892
2893   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
2894   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
2895
2896   /// \brief Try to vectorize a list of operands.
2897   /// \@param BuildVector A list of users to ignore for the purpose of
2898   ///                     scheduling and that don't need extracting.
2899   /// \returns true if a value was vectorized.
2900   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2901                           ArrayRef<Value *> BuildVector = None,
2902                           bool allowReorder = false);
2903
2904   /// \brief Try to vectorize a chain that may start at the operands of \V;
2905   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
2906
2907   /// \brief Vectorize the stores that were collected in StoreRefs.
2908   bool vectorizeStoreChains(BoUpSLP &R);
2909
2910   /// \brief Scan the basic block and look for patterns that are likely to start
2911   /// a vectorization chain.
2912   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
2913
2914   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
2915                            BoUpSLP &R);
2916
2917   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
2918                        BoUpSLP &R);
2919 private:
2920   StoreListMap StoreRefs;
2921 };
2922
2923 /// \brief Check that the Values in the slice in VL array are still existent in
2924 /// the WeakVH array.
2925 /// Vectorization of part of the VL array may cause later values in the VL array
2926 /// to become invalid. We track when this has happened in the WeakVH array.
2927 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
2928                                SmallVectorImpl<WeakVH> &VH,
2929                                unsigned SliceBegin,
2930                                unsigned SliceSize) {
2931   for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
2932     if (VH[i] != VL[i])
2933       return true;
2934
2935   return false;
2936 }
2937
2938 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
2939                                           int CostThreshold, BoUpSLP &R) {
2940   unsigned ChainLen = Chain.size();
2941   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
2942         << "\n");
2943   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
2944   unsigned Sz = DL->getTypeSizeInBits(StoreTy);
2945   unsigned VF = MinVecRegSize / Sz;
2946
2947   if (!isPowerOf2_32(Sz) || VF < 2)
2948     return false;
2949
2950   // Keep track of values that were deleted by vectorizing in the loop below.
2951   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
2952
2953   bool Changed = false;
2954   // Look for profitable vectorizable trees at all offsets, starting at zero.
2955   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
2956     if (i + VF > e)
2957       break;
2958
2959     // Check that a previous iteration of this loop did not delete the Value.
2960     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
2961       continue;
2962
2963     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
2964           << "\n");
2965     ArrayRef<Value *> Operands = Chain.slice(i, VF);
2966
2967     R.buildTree(Operands);
2968
2969     int Cost = R.getTreeCost();
2970
2971     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
2972     if (Cost < CostThreshold) {
2973       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
2974       R.vectorizeTree();
2975
2976       // Move to the next bundle.
2977       i += VF - 1;
2978       Changed = true;
2979     }
2980   }
2981
2982   return Changed;
2983 }
2984
2985 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
2986                                     int costThreshold, BoUpSLP &R) {
2987   SetVector<Value *> Heads, Tails;
2988   SmallDenseMap<Value *, Value *> ConsecutiveChain;
2989
2990   // We may run into multiple chains that merge into a single chain. We mark the
2991   // stores that we vectorized so that we don't visit the same store twice.
2992   BoUpSLP::ValueSet VectorizedStores;
2993   bool Changed = false;
2994
2995   // Do a quadratic search on all of the given stores and find
2996   // all of the pairs of stores that follow each other.
2997   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
2998     for (unsigned j = 0; j < e; ++j) {
2999       if (i == j)
3000         continue;
3001
3002       if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
3003         Tails.insert(Stores[j]);
3004         Heads.insert(Stores[i]);
3005         ConsecutiveChain[Stores[i]] = Stores[j];
3006       }
3007     }
3008   }
3009
3010   // For stores that start but don't end a link in the chain:
3011   for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
3012        it != e; ++it) {
3013     if (Tails.count(*it))
3014       continue;
3015
3016     // We found a store instr that starts a chain. Now follow the chain and try
3017     // to vectorize it.
3018     BoUpSLP::ValueList Operands;
3019     Value *I = *it;
3020     // Collect the chain into a list.
3021     while (Tails.count(I) || Heads.count(I)) {
3022       if (VectorizedStores.count(I))
3023         break;
3024       Operands.push_back(I);
3025       // Move to the next value in the chain.
3026       I = ConsecutiveChain[I];
3027     }
3028
3029     bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
3030
3031     // Mark the vectorized stores so that we don't vectorize them again.
3032     if (Vectorized)
3033       VectorizedStores.insert(Operands.begin(), Operands.end());
3034     Changed |= Vectorized;
3035   }
3036
3037   return Changed;
3038 }
3039
3040
3041 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3042   unsigned count = 0;
3043   StoreRefs.clear();
3044   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3045     StoreInst *SI = dyn_cast<StoreInst>(it);
3046     if (!SI)
3047       continue;
3048
3049     // Don't touch volatile stores.
3050     if (!SI->isSimple())
3051       continue;
3052
3053     // Check that the pointer points to scalars.
3054     Type *Ty = SI->getValueOperand()->getType();
3055     if (Ty->isAggregateType() || Ty->isVectorTy())
3056       continue;
3057
3058     // Find the base pointer.
3059     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3060
3061     // Save the store locations.
3062     StoreRefs[Ptr].push_back(SI);
3063     count++;
3064   }
3065   return count;
3066 }
3067
3068 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3069   if (!A || !B)
3070     return false;
3071   Value *VL[] = { A, B };
3072   return tryToVectorizeList(VL, R, None, true);
3073 }
3074
3075 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3076                                        ArrayRef<Value *> BuildVector,
3077                                        bool allowReorder) {
3078   if (VL.size() < 2)
3079     return false;
3080
3081   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3082
3083   // Check that all of the parts are scalar instructions of the same type.
3084   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3085   if (!I0)
3086     return false;
3087
3088   unsigned Opcode0 = I0->getOpcode();
3089
3090   Type *Ty0 = I0->getType();
3091   unsigned Sz = DL->getTypeSizeInBits(Ty0);
3092   unsigned VF = MinVecRegSize / Sz;
3093
3094   for (int i = 0, e = VL.size(); i < e; ++i) {
3095     Type *Ty = VL[i]->getType();
3096     if (Ty->isAggregateType() || Ty->isVectorTy())
3097       return false;
3098     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3099     if (!Inst || Inst->getOpcode() != Opcode0)
3100       return false;
3101   }
3102
3103   bool Changed = false;
3104
3105   // Keep track of values that were deleted by vectorizing in the loop below.
3106   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3107
3108   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3109     unsigned OpsWidth = 0;
3110
3111     if (i + VF > e)
3112       OpsWidth = e - i;
3113     else
3114       OpsWidth = VF;
3115
3116     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3117       break;
3118
3119     // Check that a previous iteration of this loop did not delete the Value.
3120     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3121       continue;
3122
3123     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3124                  << "\n");
3125     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3126
3127     ArrayRef<Value *> BuildVectorSlice;
3128     if (!BuildVector.empty())
3129       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3130
3131     R.buildTree(Ops, BuildVectorSlice);
3132     // TODO: check if we can allow reordering also for other cases than
3133     // tryToVectorizePair()
3134     if (allowReorder && R.shouldReorder()) {
3135       assert(Ops.size() == 2);
3136       assert(BuildVectorSlice.empty());
3137       Value *ReorderedOps[] = { Ops[1], Ops[0] };
3138       R.buildTree(ReorderedOps, None);
3139     }
3140     int Cost = R.getTreeCost();
3141
3142     if (Cost < -SLPCostThreshold) {
3143       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3144       Value *VectorizedRoot = R.vectorizeTree();
3145
3146       // Reconstruct the build vector by extracting the vectorized root. This
3147       // way we handle the case where some elements of the vector are undefined.
3148       //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3149       if (!BuildVectorSlice.empty()) {
3150         // The insert point is the last build vector instruction. The vectorized
3151         // root will precede it. This guarantees that we get an instruction. The
3152         // vectorized tree could have been constant folded.
3153         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3154         unsigned VecIdx = 0;
3155         for (auto &V : BuildVectorSlice) {
3156           IRBuilder<true, NoFolder> Builder(
3157               ++BasicBlock::iterator(InsertAfter));
3158           InsertElementInst *IE = cast<InsertElementInst>(V);
3159           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3160               VectorizedRoot, Builder.getInt32(VecIdx++)));
3161           IE->setOperand(1, Extract);
3162           IE->removeFromParent();
3163           IE->insertAfter(Extract);
3164           InsertAfter = IE;
3165         }
3166       }
3167       // Move to the next bundle.
3168       i += VF - 1;
3169       Changed = true;
3170     }
3171   }
3172
3173   return Changed;
3174 }
3175
3176 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3177   if (!V)
3178     return false;
3179
3180   // Try to vectorize V.
3181   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3182     return true;
3183
3184   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3185   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3186   // Try to skip B.
3187   if (B && B->hasOneUse()) {
3188     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3189     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3190     if (tryToVectorizePair(A, B0, R)) {
3191       return true;
3192     }
3193     if (tryToVectorizePair(A, B1, R)) {
3194       return true;
3195     }
3196   }
3197
3198   // Try to skip A.
3199   if (A && A->hasOneUse()) {
3200     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3201     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3202     if (tryToVectorizePair(A0, B, R)) {
3203       return true;
3204     }
3205     if (tryToVectorizePair(A1, B, R)) {
3206       return true;
3207     }
3208   }
3209   return 0;
3210 }
3211
3212 /// \brief Generate a shuffle mask to be used in a reduction tree.
3213 ///
3214 /// \param VecLen The length of the vector to be reduced.
3215 /// \param NumEltsToRdx The number of elements that should be reduced in the
3216 ///        vector.
3217 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3218 ///        reduction. A pairwise reduction will generate a mask of
3219 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
3220 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3221 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3222 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3223                                    bool IsPairwise, bool IsLeft,
3224                                    IRBuilder<> &Builder) {
3225   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3226
3227   SmallVector<Constant *, 32> ShuffleMask(
3228       VecLen, UndefValue::get(Builder.getInt32Ty()));
3229
3230   if (IsPairwise)
3231     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3232     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3233       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3234   else
3235     // Move the upper half of the vector to the lower half.
3236     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3237       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3238
3239   return ConstantVector::get(ShuffleMask);
3240 }
3241
3242
3243 /// Model horizontal reductions.
3244 ///
3245 /// A horizontal reduction is a tree of reduction operations (currently add and
3246 /// fadd) that has operations that can be put into a vector as its leaf.
3247 /// For example, this tree:
3248 ///
3249 /// mul mul mul mul
3250 ///  \  /    \  /
3251 ///   +       +
3252 ///    \     /
3253 ///       +
3254 /// This tree has "mul" as its reduced values and "+" as its reduction
3255 /// operations. A reduction might be feeding into a store or a binary operation
3256 /// feeding a phi.
3257 ///    ...
3258 ///    \  /
3259 ///     +
3260 ///     |
3261 ///  phi +=
3262 ///
3263 ///  Or:
3264 ///    ...
3265 ///    \  /
3266 ///     +
3267 ///     |
3268 ///   *p =
3269 ///
3270 class HorizontalReduction {
3271   SmallVector<Value *, 16> ReductionOps;
3272   SmallVector<Value *, 32> ReducedVals;
3273
3274   BinaryOperator *ReductionRoot;
3275   PHINode *ReductionPHI;
3276
3277   /// The opcode of the reduction.
3278   unsigned ReductionOpcode;
3279   /// The opcode of the values we perform a reduction on.
3280   unsigned ReducedValueOpcode;
3281   /// The width of one full horizontal reduction operation.
3282   unsigned ReduxWidth;
3283   /// Should we model this reduction as a pairwise reduction tree or a tree that
3284   /// splits the vector in halves and adds those halves.
3285   bool IsPairwiseReduction;
3286
3287 public:
3288   HorizontalReduction()
3289     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3290     ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3291
3292   /// \brief Try to find a reduction tree.
3293   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
3294                                  const DataLayout *DL) {
3295     assert((!Phi ||
3296             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3297            "Thi phi needs to use the binary operator");
3298
3299     // We could have a initial reductions that is not an add.
3300     //  r *= v1 + v2 + v3 + v4
3301     // In such a case start looking for a tree rooted in the first '+'.
3302     if (Phi) {
3303       if (B->getOperand(0) == Phi) {
3304         Phi = nullptr;
3305         B = dyn_cast<BinaryOperator>(B->getOperand(1));
3306       } else if (B->getOperand(1) == Phi) {
3307         Phi = nullptr;
3308         B = dyn_cast<BinaryOperator>(B->getOperand(0));
3309       }
3310     }
3311
3312     if (!B)
3313       return false;
3314
3315     Type *Ty = B->getType();
3316     if (Ty->isVectorTy())
3317       return false;
3318
3319     ReductionOpcode = B->getOpcode();
3320     ReducedValueOpcode = 0;
3321     ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
3322     ReductionRoot = B;
3323     ReductionPHI = Phi;
3324
3325     if (ReduxWidth < 4)
3326       return false;
3327
3328     // We currently only support adds.
3329     if (ReductionOpcode != Instruction::Add &&
3330         ReductionOpcode != Instruction::FAdd)
3331       return false;
3332
3333     // Post order traverse the reduction tree starting at B. We only handle true
3334     // trees containing only binary operators.
3335     SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3336     Stack.push_back(std::make_pair(B, 0));
3337     while (!Stack.empty()) {
3338       BinaryOperator *TreeN = Stack.back().first;
3339       unsigned EdgeToVist = Stack.back().second++;
3340       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3341
3342       // Only handle trees in the current basic block.
3343       if (TreeN->getParent() != B->getParent())
3344         return false;
3345
3346       // Each tree node needs to have one user except for the ultimate
3347       // reduction.
3348       if (!TreeN->hasOneUse() && TreeN != B)
3349         return false;
3350
3351       // Postorder vist.
3352       if (EdgeToVist == 2 || IsReducedValue) {
3353         if (IsReducedValue) {
3354           // Make sure that the opcodes of the operations that we are going to
3355           // reduce match.
3356           if (!ReducedValueOpcode)
3357             ReducedValueOpcode = TreeN->getOpcode();
3358           else if (ReducedValueOpcode != TreeN->getOpcode())
3359             return false;
3360           ReducedVals.push_back(TreeN);
3361         } else {
3362           // We need to be able to reassociate the adds.
3363           if (!TreeN->isAssociative())
3364             return false;
3365           ReductionOps.push_back(TreeN);
3366         }
3367         // Retract.
3368         Stack.pop_back();
3369         continue;
3370       }
3371
3372       // Visit left or right.
3373       Value *NextV = TreeN->getOperand(EdgeToVist);
3374       BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3375       if (Next)
3376         Stack.push_back(std::make_pair(Next, 0));
3377       else if (NextV != Phi)
3378         return false;
3379     }
3380     return true;
3381   }
3382
3383   /// \brief Attempt to vectorize the tree found by
3384   /// matchAssociativeReduction.
3385   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3386     if (ReducedVals.empty())
3387       return false;
3388
3389     unsigned NumReducedVals = ReducedVals.size();
3390     if (NumReducedVals < ReduxWidth)
3391       return false;
3392
3393     Value *VectorizedTree = nullptr;
3394     IRBuilder<> Builder(ReductionRoot);
3395     FastMathFlags Unsafe;
3396     Unsafe.setUnsafeAlgebra();
3397     Builder.SetFastMathFlags(Unsafe);
3398     unsigned i = 0;
3399
3400     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3401       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3402
3403       // Estimate cost.
3404       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3405       if (Cost >= -SLPCostThreshold)
3406         break;
3407
3408       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3409                    << ". (HorRdx)\n");
3410
3411       // Vectorize a tree.
3412       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3413       Value *VectorizedRoot = V.vectorizeTree();
3414
3415       // Emit a reduction.
3416       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3417       if (VectorizedTree) {
3418         Builder.SetCurrentDebugLocation(Loc);
3419         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3420                                      ReducedSubTree, "bin.rdx");
3421       } else
3422         VectorizedTree = ReducedSubTree;
3423     }
3424
3425     if (VectorizedTree) {
3426       // Finish the reduction.
3427       for (; i < NumReducedVals; ++i) {
3428         Builder.SetCurrentDebugLocation(
3429           cast<Instruction>(ReducedVals[i])->getDebugLoc());
3430         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3431                                      ReducedVals[i]);
3432       }
3433       // Update users.
3434       if (ReductionPHI) {
3435         assert(ReductionRoot && "Need a reduction operation");
3436         ReductionRoot->setOperand(0, VectorizedTree);
3437         ReductionRoot->setOperand(1, ReductionPHI);
3438       } else
3439         ReductionRoot->replaceAllUsesWith(VectorizedTree);
3440     }
3441     return VectorizedTree != nullptr;
3442   }
3443
3444 private:
3445
3446   /// \brief Calcuate the cost of a reduction.
3447   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3448     Type *ScalarTy = FirstReducedVal->getType();
3449     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3450
3451     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3452     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3453
3454     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3455     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3456
3457     int ScalarReduxCost =
3458         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3459
3460     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3461                  << " for reduction that starts with " << *FirstReducedVal
3462                  << " (It is a "
3463                  << (IsPairwiseReduction ? "pairwise" : "splitting")
3464                  << " reduction)\n");
3465
3466     return VecReduxCost - ScalarReduxCost;
3467   }
3468
3469   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3470                             Value *R, const Twine &Name = "") {
3471     if (Opcode == Instruction::FAdd)
3472       return Builder.CreateFAdd(L, R, Name);
3473     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3474   }
3475
3476   /// \brief Emit a horizontal reduction of the vectorized value.
3477   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3478     assert(VectorizedValue && "Need to have a vectorized tree node");
3479     Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
3480     assert(isPowerOf2_32(ReduxWidth) &&
3481            "We only handle power-of-two reductions for now");
3482
3483     Value *TmpVec = ValToReduce;
3484     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3485       if (IsPairwiseReduction) {
3486         Value *LeftMask =
3487           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3488         Value *RightMask =
3489           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3490
3491         Value *LeftShuf = Builder.CreateShuffleVector(
3492           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3493         Value *RightShuf = Builder.CreateShuffleVector(
3494           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3495           "rdx.shuf.r");
3496         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3497                              "bin.rdx");
3498       } else {
3499         Value *UpperHalf =
3500           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3501         Value *Shuf = Builder.CreateShuffleVector(
3502           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3503         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3504       }
3505     }
3506
3507     // The result is in the first element of the vector.
3508     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3509   }
3510 };
3511
3512 /// \brief Recognize construction of vectors like
3513 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
3514 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
3515 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
3516 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
3517 ///
3518 /// Returns true if it matches
3519 ///
3520 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3521                             SmallVectorImpl<Value *> &BuildVector,
3522                             SmallVectorImpl<Value *> &BuildVectorOpds) {
3523   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3524     return false;
3525
3526   InsertElementInst *IE = FirstInsertElem;
3527   while (true) {
3528     BuildVector.push_back(IE);
3529     BuildVectorOpds.push_back(IE->getOperand(1));
3530
3531     if (IE->use_empty())
3532       return false;
3533
3534     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3535     if (!NextUse)
3536       return true;
3537
3538     // If this isn't the final use, make sure the next insertelement is the only
3539     // use. It's OK if the final constructed vector is used multiple times
3540     if (!IE->hasOneUse())
3541       return false;
3542
3543     IE = NextUse;
3544   }
3545
3546   return false;
3547 }
3548
3549 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3550   return V->getType() < V2->getType();
3551 }
3552
3553 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3554   bool Changed = false;
3555   SmallVector<Value *, 4> Incoming;
3556   SmallSet<Value *, 16> VisitedInstrs;
3557
3558   bool HaveVectorizedPhiNodes = true;
3559   while (HaveVectorizedPhiNodes) {
3560     HaveVectorizedPhiNodes = false;
3561
3562     // Collect the incoming values from the PHIs.
3563     Incoming.clear();
3564     for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3565          ++instr) {
3566       PHINode *P = dyn_cast<PHINode>(instr);
3567       if (!P)
3568         break;
3569
3570       if (!VisitedInstrs.count(P))
3571         Incoming.push_back(P);
3572     }
3573
3574     // Sort by type.
3575     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3576
3577     // Try to vectorize elements base on their type.
3578     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3579                                            E = Incoming.end();
3580          IncIt != E;) {
3581
3582       // Look for the next elements with the same type.
3583       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3584       while (SameTypeIt != E &&
3585              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3586         VisitedInstrs.insert(*SameTypeIt);
3587         ++SameTypeIt;
3588       }
3589
3590       // Try to vectorize them.
3591       unsigned NumElts = (SameTypeIt - IncIt);
3592       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3593       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3594         // Success start over because instructions might have been changed.
3595         HaveVectorizedPhiNodes = true;
3596         Changed = true;
3597         break;
3598       }
3599
3600       // Start over at the next instruction of a different type (or the end).
3601       IncIt = SameTypeIt;
3602     }
3603   }
3604
3605   VisitedInstrs.clear();
3606
3607   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3608     // We may go through BB multiple times so skip the one we have checked.
3609     if (!VisitedInstrs.insert(it))
3610       continue;
3611
3612     if (isa<DbgInfoIntrinsic>(it))
3613       continue;
3614
3615     // Try to vectorize reductions that use PHINodes.
3616     if (PHINode *P = dyn_cast<PHINode>(it)) {
3617       // Check that the PHI is a reduction PHI.
3618       if (P->getNumIncomingValues() != 2)
3619         return Changed;
3620       Value *Rdx =
3621           (P->getIncomingBlock(0) == BB
3622                ? (P->getIncomingValue(0))
3623                : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3624                                                : nullptr));
3625       // Check if this is a Binary Operator.
3626       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3627       if (!BI)
3628         continue;
3629
3630       // Try to match and vectorize a horizontal reduction.
3631       HorizontalReduction HorRdx;
3632       if (ShouldVectorizeHor &&
3633           HorRdx.matchAssociativeReduction(P, BI, DL) &&
3634           HorRdx.tryToReduce(R, TTI)) {
3635         Changed = true;
3636         it = BB->begin();
3637         e = BB->end();
3638         continue;
3639       }
3640
3641      Value *Inst = BI->getOperand(0);
3642       if (Inst == P)
3643         Inst = BI->getOperand(1);
3644
3645       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3646         // We would like to start over since some instructions are deleted
3647         // and the iterator may become invalid value.
3648         Changed = true;
3649         it = BB->begin();
3650         e = BB->end();
3651         continue;
3652       }
3653
3654       continue;
3655     }
3656
3657     // Try to vectorize horizontal reductions feeding into a store.
3658     if (ShouldStartVectorizeHorAtStore)
3659       if (StoreInst *SI = dyn_cast<StoreInst>(it))
3660         if (BinaryOperator *BinOp =
3661                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3662           HorizontalReduction HorRdx;
3663           if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
3664                 HorRdx.tryToReduce(R, TTI)) ||
3665                tryToVectorize(BinOp, R))) {
3666             Changed = true;
3667             it = BB->begin();
3668             e = BB->end();
3669             continue;
3670           }
3671         }
3672
3673     // Try to vectorize trees that start at compare instructions.
3674     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3675       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3676         Changed = true;
3677         // We would like to start over since some instructions are deleted
3678         // and the iterator may become invalid value.
3679         it = BB->begin();
3680         e = BB->end();
3681         continue;
3682       }
3683
3684       for (int i = 0; i < 2; ++i) {
3685         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3686           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3687             Changed = true;
3688             // We would like to start over since some instructions are deleted
3689             // and the iterator may become invalid value.
3690             it = BB->begin();
3691             e = BB->end();
3692           }
3693         }
3694       }
3695       continue;
3696     }
3697
3698     // Try to vectorize trees that start at insertelement instructions.
3699     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3700       SmallVector<Value *, 16> BuildVector;
3701       SmallVector<Value *, 16> BuildVectorOpds;
3702       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3703         continue;
3704
3705       // Vectorize starting with the build vector operands ignoring the
3706       // BuildVector instructions for the purpose of scheduling and user
3707       // extraction.
3708       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3709         Changed = true;
3710         it = BB->begin();
3711         e = BB->end();
3712       }
3713
3714       continue;
3715     }
3716   }
3717
3718   return Changed;
3719 }
3720
3721 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
3722   bool Changed = false;
3723   // Attempt to sort and vectorize each of the store-groups.
3724   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
3725        it != e; ++it) {
3726     if (it->second.size() < 2)
3727       continue;
3728
3729     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
3730           << it->second.size() << ".\n");
3731
3732     // Process the stores in chunks of 16.
3733     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
3734       unsigned Len = std::min<unsigned>(CE - CI, 16);
3735       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
3736                                  -SLPCostThreshold, R);
3737     }
3738   }
3739   return Changed;
3740 }
3741
3742 } // end anonymous namespace
3743
3744 char SLPVectorizer::ID = 0;
3745 static const char lv_name[] = "SLP Vectorizer";
3746 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
3747 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
3748 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
3749 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3750 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
3751 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
3752
3753 namespace llvm {
3754 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
3755 }