From 369cc938d261de3295eb70d0738f54ef1a82806c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 7 Jul 2013 06:57:07 +0000
Subject: [PATCH] SLPVectorizer: Implement DCE as part of vectorization.

This is a complete re-write if the bottom-up vectorization class.
Before this commit we scanned the instruction tree 3 times. First in search of merge points for the trees. Second, for estimating the cost. And finally for vectorization.
There was a lot of code duplication and adding the DCE exposed bugs. The new design is simpler and DCE was a part of the design.
In this implementation we build the tree once. After that we estimate the cost by scanning the different entries in the constructed tree (in any order). The vectorization phase also works on the built tree.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185774 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    | 2114 +++++++++--------
 .../SLPVectorizer/X86/crash_7zip.ll           |   38 +
 .../SLPVectorizer/X86/crash_bullet.ll         |   38 +
 .../SLPVectorizer/X86/crash_bullet2.ll        |   38 +
 .../SLPVectorizer/X86/crash_dequeue.ll        |   40 +
 .../SLPVectorizer/X86/crash_flop7.ll          |   46 +
 .../SLPVectorizer/X86/crash_lame.ll           |   24 +
 .../SLPVectorizer/X86/crash_lencod.ll         |   66 +
 .../SLPVectorizer/X86/crash_lencod2.ll        |   23 +
 .../SLPVectorizer/X86/crash_mandeltext.ll     |   53 +
 .../Transforms/SLPVectorizer/X86/crash_rc4.ll |   28 +
 .../SLPVectorizer/X86/crash_sim4b1.ll         |  113 +
 .../SLPVectorizer/X86/crash_smallpt.ll        |   65 +
 .../SLPVectorizer/X86/crash_smallpt2.ll       |   46 +
 test/Transforms/SLPVectorizer/X86/diamond.ll  |    6 +-
 .../SLPVectorizer/X86/long_chains.ll          |   13 +-
 test/Transforms/SLPVectorizer/X86/saxpy.ll    |   16 +
 17 files changed, 1717 insertions(+), 1050 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_7zip.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_bullet.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_bullet2.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_flop7.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lame.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lencod.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_lencod2.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_rc4.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
 create mode 100644 test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 85c01bdeff9..d2e7450d248 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -59,7 +59,7 @@ static const unsigned RecursionMaxDepth = 12;
 class BuilderLocGuard {
 public:
   BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {}
-  ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); }
+  ~BuilderLocGuard() { if (Loc) Builder.SetInsertPoint(Loc); }
 
 private:
   // Prevent copying.
@@ -91,6 +91,7 @@ struct BlockNumbering {
   }
 
   int getIndex(Instruction *I) {
+    assert(I->getParent() == BB && "Invalid instruction");
     if (!Valid)
       numberInstructions();
     assert(InstrIdx.count(I) && "Unknown instruction");
@@ -117,72 +118,169 @@ private:
   std::vector<Instruction *> InstrVec;
 };
 
-class FuncSLP {
+/// \returns the parent basic block if all of the instructions in \p VL
+/// are in the same block or null otherwise.
+static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return 0;
+  BasicBlock *BB = I0->getParent();
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I)
+      return 0;
+
+    if (BB != I->getParent())
+      return 0;
+  }
+  return BB;
+}
+
+/// \returns True if all of the values in \p VL are constants.
+static bool allConstant(ArrayRef<Value *> VL) {
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    if (!isa<Constant>(VL[i]))
+      return false;
+  return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    if (VL[i] != VL[0])
+      return false;
+  return true;
+}
+
+/// \returns The opcode if all of the Instructions in \p VL have the same
+/// opcode, or zero.
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return 0;
+  unsigned Opcode = I0->getOpcode();
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || Opcode != I->getOpcode())
+      return 0;
+  }
+  return Opcode;
+}
+
+/// \returns The type that all of the values in \p VL have or null if there
+/// are different types.
+static Type* getSameType(ArrayRef<Value *> VL) {
+  Type *Ty = VL[0]->getType();
+  for (int i = 1, e = VL.size(); i < e; i++)
+    if (VL[0]->getType() != Ty)
+      return 0;
+
+  return Ty;
+}
+
+/// \returns True if the ExtractElement instructions in VL can be vectorized
+/// to use the original vector.
+static bool CanReuseExtract(ArrayRef<Value *> VL) {
+  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+  // Check if all of the extracts come from the same vector and from the
+  // correct offset.
+  Value *VL0 = VL[0];
+  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+  Value *Vec = E0->getOperand(0);
+
+  // We have to extract from the same vector type.
+  unsigned NElts = Vec->getType()->getVectorNumElements();
+
+  if (NElts != VL.size())
+    return false;
+
+  // Check that all of the indices extract from the correct offset.
+  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
+  if (!CI || CI->getZExtValue())
+    return false;
+
+  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+
+    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+      return false;
+  }
+
+  return true;
+}
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
   typedef SmallVector<Value *, 8> ValueList;
   typedef SmallVector<Instruction *, 16> InstrList;
   typedef SmallPtrSet<Value *, 16> ValueSet;
   typedef SmallVector<StoreInst *, 8> StoreList;
 
-public:
-  static const int MAX_COST = INT_MIN;
-
-  FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
-          TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, 
+  BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+          TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt) :
     F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
     Builder(Se->getContext()) {
-    for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
-      BasicBlock *BB = it;
-      BlocksNumbers[BB] = BlockNumbering(BB);
+      // Setup the block numbering utility for all of the blocks in the
+      // function.
+      for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+        BasicBlock *BB = it;
+        BlocksNumbers[BB] = BlockNumbering(BB);
+      }
     }
-  }
-
-  /// \brief Take the pointer operand from the Load/Store instruction.
-  /// \returns NULL if this is not a valid Load/Store instruction.
-  static Value *getPointerOperand(Value *I);
-
-  /// \brief Take the address space operand from the Load/Store instruction.
-  /// \returns -1 if this is not a valid Load/Store instruction.
-  static unsigned getAddressSpaceOperand(Value *I);
-
-  /// \returns true if the memory operations A and B are consecutive.
-  bool isConsecutiveAccess(Value *A, Value *B);
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
-  /// \returns the vectorized value.
-  Value *vectorizeTree(ArrayRef<Value *> VL);
+  void vectorizeTree();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost(ArrayRef<Value *> VL);
+  int getTreeCost();
+
+  /// Construct a vectorizable tree that starts at \p Roots.
+  void buildTree(ArrayRef<Value *> Roots);
+
+  /// Clear the internal data structures that are created by 'buildTree'.
+  void deleteTree() {
+    VectorizableTree.clear();
+    ScalarToTreeEntry.clear();
+    MustGather.clear();
+    MemBarrierIgnoreList.clear();
+  }
 
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
   /// roots. This method calculates the cost of extracting the values.
   int getGatherCost(ArrayRef<Value *> VL);
 
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+  /// \returns true if the memory operations A and B are consecutive.
+  bool isConsecutiveAccess(Value *A, Value *B);
+
+  /// \brief Perform LICM and CSE on the newly generated gather sequences.
+  void optimizeGatherSequence();
+private:
+  struct TreeEntry;
 
-  /// \brief Vectorize a group of scalars into a vector tree.
-  /// \returns the vectorized value.
-  Value *vectorizeArith(ArrayRef<Value *> Operands);
+  /// \returns the cost of the vectorizable entry.
+  int getEntryCost(TreeEntry *E);
 
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+  /// This is the recursive part of buildTree.
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+  /// Vectorizer a single entry in the tree.
+  Value *vectorizeTree(TreeEntry *E);
+
+  /// Vectorizer a single entry in the tree, starting in \p VL.
+  Value *vectorizeTree(ArrayRef<Value *> VL);
 
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL);
+  /// \brief Take the pointer operand from the Load/Store instruction.
+  /// \returns NULL if this is not a valid Load/Store instruction.
+  static Value *getPointerOperand(Value *I);
 
-  ///  \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+  /// \brief Take the address space operand from the Load/Store instruction.
+  /// \returns -1 if this is not a valid Load/Store instruction.
+  static unsigned getAddressSpaceOperand(Value *I);
 
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
@@ -211,57 +309,65 @@ public:
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 
-  /// \brief Perform LICM and CSE on the newly generated gather sequences.
-  void optimizeGatherSequence();
+  struct TreeEntry {
+    TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
+    NeedToGather(0) {}
 
-  bool needToGatherAny(ArrayRef<Value *> VL) {
-    for (int i = 0, e = VL.size(); i < e; ++i)
-      if (MustGather.count(VL[i]))
-        return true;
-    return false;
-  }
+    /// \returns true if the scalars in VL are equal to this entry.
+    bool isSame(ArrayRef<Value *> VL) {
+      assert(VL.size() == Scalars.size() && "Invalid size");
+      for (int i = 0, e = VL.size(); i != e; ++i)
+        if (VL[i] != Scalars[i])
+          return false;
+      return true;
+    }
+
+    /// A vector of scalars.
+    ValueList Scalars;
+
+    /// The Scalars are vectorized into this value. It is initialized to Null.
+    Value *VectorizedValue;
+
+    /// The index in the basic block of the last scalar.
+    int LastScalarIndex;
+
+    /// Do we need to gather this sequence ?
+    bool NeedToGather;
+  };
 
-  void forgetNumbering() {
-    for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it)
-      BlocksNumbers[it].forget();
+  /// Create a new VectorizableTree entry.
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+    VectorizableTree.push_back(TreeEntry());
+    int idx = VectorizableTree.size() - 1;
+    TreeEntry *Last = &VectorizableTree[idx];
+    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+    Last->NeedToGather = !Vectorized;
+    if (Vectorized) {
+      Last->LastScalarIndex = getLastIndex(VL);
+      for (int i = 0, e = VL.size(); i != e; ++i) {
+        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+        ScalarToTreeEntry[VL[i]] = idx;
+      }
+    } else {
+      Last->LastScalarIndex = 0;
+      MustGather.insert(VL.begin(), VL.end());
+    }
+    return Last;
   }
 
   /// -- Vectorization State --
+  /// Holds all of the tree entries.
+  std::vector<TreeEntry> VectorizableTree;
 
-  /// Maps values in the tree to the vector lanes that uses them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value *, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  ValueSet MemBarrierIgnoreList;
-
-  /// Maps between the first scalar to the vector. This map must be reset
-  /// between runs.
-  DenseMap<Value *, Value *> VectorizedValues;
+  /// Maps a specific scalar to its tree entry.
+  SmallDenseMap<Value*, int> ScalarToTreeEntry;
 
-  /// Contains values that must be gathered because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
+  /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
-  /// Contains PHINodes that are being processed. We use this data structure
-  /// to stop cycles in the graph.
-  ValueSet VisitedPHIs;
-
-  /// Contains a list of values that are used outside the current tree, the
-  /// first element in the bundle and the insertion point for extracts. This
-  /// set must be reset between runs.
-  struct UseInfo{
-    UseInfo(Instruction *VL0, int I) :
-      Leader(VL0), LastIndex(I) {}
-    UseInfo() : Leader(0), LastIndex(0) {}
-    /// The first element in the bundle.
-    Instruction *Leader;
-    /// The insertion index.
-    int LastIndex;
-  };
-  MapVector<Instruction*, UseInfo> MultiUserVals;
-  SetVector<Instruction*> ExtractedLane;
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This map must be reset between runs of getCost.
+  ValueSet MemBarrierIgnoreList;
 
   /// Holds all of the instructions that we gathered.
   SetVector<Instruction *> GatherSeq;
@@ -281,817 +387,619 @@ public:
   IRBuilder<> Builder;
 };
 
-int FuncSLP::getGatherCost(Type *Ty) {
-  int Cost = 0;
-  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-  return Cost;
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+  deleteTree();
+  buildTree_rec(Roots, 0);
 }
 
-int FuncSLP::getGatherCost(ArrayRef<Value *> VL) {
-  // Find the type of the operands in VL.
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-  // Find the cost of inserting/extracting values from the vector.
-  return getGatherCost(VecTy);
-}
 
-AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return AA->getLocation(SI);
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return AA->getLocation(LI);
-  return AliasAnalysis::Location();
-}
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  bool SameTy = getSameType(VL); (void)SameTy;
+  assert(SameTy && "Invalid types!");
 
-Value *FuncSLP::getPointerOperand(Value *I) {
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return LI->getPointerOperand();
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return SI->getPointerOperand();
-  return 0;
-}
+  if (Depth == RecursionMaxDepth) {
+    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+    newTreeEntry(VL, false);
+    return;
+  }
 
-unsigned FuncSLP::getAddressSpaceOperand(Value *I) {
-  if (LoadInst *L = dyn_cast<LoadInst>(I))
-    return L->getPointerAddressSpace();
-  if (StoreInst *S = dyn_cast<StoreInst>(I))
-    return S->getPointerAddressSpace();
-  return -1;
-}
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy()) {
+    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+    newTreeEntry(VL, false);
+    return;
+  }
 
-bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) {
-  Value *PtrA = getPointerOperand(A);
-  Value *PtrB = getPointerOperand(B);
-  unsigned ASA = getAddressSpaceOperand(A);
-  unsigned ASB = getAddressSpaceOperand(B);
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy()) {
+      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+      newTreeEntry(VL, false);
+      return;
+    }
 
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB))
-    return false;
+  // If all of the operands are identical or constant we have a simple solution.
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+      !getSameOpcode(VL)) {
+    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+    newTreeEntry(VL, false);
+    return;
+  }
 
-  // Check that A and B are of the same type.
-  if (PtrA->getType() != PtrB->getType())
-    return false;
+  // We now know that this is a vector of instructions of the same type from
+  // the same block.
+
+  // Check if this is a duplicate of another entry.
+  if (ScalarToTreeEntry.count(VL[0])) {
+    int Idx = ScalarToTreeEntry[VL[0]];
+    TreeEntry *E = &VectorizableTree[Idx];
+    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+      if (E->Scalars[i] != VL[i]) {
+        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
+    }
+    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+    return;
+  }
 
-  // Calculate the distance.
-  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
-  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
-  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+  // Check that none of the instructions in the bundle are already in the tree.
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    if (ScalarToTreeEntry.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+            ") is already in tree.\n");
+      newTreeEntry(VL, false);
+      return;
+    }
+  }
 
-  // Non constant distance.
-  if (!ConstOffSCEV)
-    return false;
+  // If any of the scalars appears in the table OR it is marked as a value that
+  // needs to stat scalar then we need to gather the scalars.
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+      newTreeEntry(VL, false);
+      return;
+    }
+  }
 
-  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
-  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-  // The Instructions are connsecutive if the size of the first load/store is
-  // the same as the offset.
-  int64_t Sz = DL->getTypeStoreSize(Ty);
-  return ((-Offset) == Sz);
-}
+  // Check that all of the users of the scalars that we want to vectorize are
+  // schedulable.
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  int MyLastIndex = getLastIndex(VL);
+  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
 
-Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
-  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
-  BasicBlock::iterator I = Src, E = Dst;
-  /// Scan all of the instruction from SRC to DST and check if
-  /// the source may alias.
-  for (++I; I != E; ++I) {
-    // Ignore store instructions that are marked as 'ignore'.
-    if (MemBarrierIgnoreList.count(I))
-      continue;
-    if (Src->mayWriteToMemory()) /* Write */ {
-      if (!I->mayReadOrWriteMemory())
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *Scalar = cast<Instruction>(VL[i]);
+    DEBUG(dbgs() << "SLP: Checking users of  " << *Scalar << ". \n");
+    for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
+         U != UE; ++U) {
+      DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
+      Instruction *User = dyn_cast<Instruction>(*U);
+      if (!User) {
+        DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
+        newTreeEntry(VL, false);
+        return;
+      }
+
+      // We don't care if the user is in a different basic block.
+      BasicBlock *UserBlock = User->getParent();
+      if (UserBlock != BB) {
+        DEBUG(dbgs() << "SLP: User from a different basic block "
+              << *User << ". \n");
         continue;
-    } else /* Read */ {
-      if (!I->mayWriteToMemory())
+      }
+
+      // If this is a PHINode within this basic block then we can place the
+      // extract wherever we want.
+      if (isa<PHINode>(*User)) {
+        DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n");
         continue;
-    }
-    AliasAnalysis::Location A = getLocation(&*I);
-    AliasAnalysis::Location B = getLocation(Src);
+      }
 
-    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
-      return I;
-  }
-  return 0;
-}
+      // Check if this is a safe in-tree user.
+      if (ScalarToTreeEntry.count(User)) {
+        int Idx = ScalarToTreeEntry[User];
+        int VecLocation = VectorizableTree[Idx].LastScalarIndex;
+        if (VecLocation <= MyLastIndex) {
+          DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
+          newTreeEntry(VL, false);
+          return;
+        }
+        DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" <<
+              VecLocation << " vector value (" << *Scalar << ") at #"
+              << MyLastIndex << ".\n");
+        continue;
+      }
 
-static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
-  BasicBlock *BB = 0;
-  for (int i = 0, e = VL.size(); i < e; i++) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I)
-      return 0;
+      // Make sure that we can schedule this unknown user.
+      BlockNumbering &BN = BlocksNumbers[BB];
+      int UserIndex = BN.getIndex(User);
+      if (UserIndex < MyLastIndex) {
 
-    if (!BB) {
-      BB = I->getParent();
-      continue;
+        DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
+              << *User << ". \n");
+        newTreeEntry(VL, false);
+        return;
+      }
     }
-
-    if (BB != I->getParent())
-      return 0;
   }
-  return BB;
-}
 
-static bool allConstant(ArrayRef<Value *> VL) {
+  // Check that every instructions appears once in this bundle.
   for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    if (!isa<Constant>(VL[i]))
-      return false;
-  return true;
-}
+    for (unsigned j = i+1; j < e; ++j)
+      if (VL[i] == VL[j]) {
+        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
 
-static bool isSplat(ArrayRef<Value *> VL) {
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    if (VL[i] != VL[0])
-      return false;
-  return true;
-}
-
-static unsigned getSameOpcode(ArrayRef<Value *> VL) {
-  unsigned Opcode = 0;
-  for (int i = 0, e = VL.size(); i < e; i++) {
-    if (Instruction *I = dyn_cast<Instruction>(VL[i])) {
-      if (!Opcode) {
-        Opcode = I->getOpcode();
-        continue;
+  // Check that instructions in this bundle don't reference other instructions.
+  // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
+         U != UE; ++U) {
+      for (unsigned j = 0; j < e; ++j) {
+        if (i != j && *U == VL[j]) {
+          DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n");
+          newTreeEntry(VL, false);
+          return;
+        }
       }
-      if (Opcode != I->getOpcode())
-        return 0;
     }
   }
-  return Opcode;
-}
-
-static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
-                            VectorType *VecTy) {
-  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
-  // Check if all of the extracts come from the same vector and from the
-  // correct offset.
-  Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
-  Value *Vec = E0->getOperand(0);
 
-  // We have to extract from the same vector type.
-  if (Vec->getType() != VecTy)
-    return false;
+  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
-    return false;
+  unsigned Opcode = getSameOpcode(VL);
 
-  for (unsigned i = 1, e = VF; i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
-    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    Instruction *Last = getLastInstruction(VL);
 
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
-      return false;
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      if (VL[i] == Last)
+        continue;
+      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
+              << "\n because of " << *Barrier << ".  Gathering.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
+    }
   }
 
-  return true;
-}
+  switch (Opcode) {
+    case Instruction::PHI: {
+      PHINode *PH = dyn_cast<PHINode>(VL0);
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
+    case Instruction::ExtractElement: {
+      bool Reuse = CanReuseExtract(VL);
+      if (Reuse) {
+        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+      }
+      newTreeEntry(VL, Reuse);
+      return;
+    }
+    case Instruction::Load: {
+      // Check if the loads are consecutive or of we need to swizzle them.
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
+          return;
+        }
 
-void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  if (Depth == RecursionMaxDepth)
-    return MustGather.insert(VL.begin(), VL.end());
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+      return;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      Type *SrcTy = VL0->getOperand(0)->getType();
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+        if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+          return;
+        }
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
-  // Don't handle vectors.
-  if (VL[0]->getType()->isVectorTy())
-    return;
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
 
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    if (SI->getValueOperand()->getType()->isVectorTy())
+        buildTree_rec(Operands, Depth+1);
+      }
       return;
+    }
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Check that all of the compares have the same predicate.
+      CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+        CmpInst *Cmp = cast<CmpInst>(VL[i]);
+        if (Cmp->getPredicate() != P0) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+          return;
+        }
+      }
 
-  // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL))
-    return MustGather.insert(VL.begin(), VL.end());
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
-  // Stop the scan at unknown IR.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  assert(VL0 && "Invalid instruction");
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
 
-  // Mark instructions with multiple users.
-  int LastIndex = getLastIndex(VL);
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    if (PHINode *PN = dyn_cast<PHINode>(VL[i])) {
-      unsigned NumUses = 0;
-      // Check that PHINodes have only one external (non-self) use.
-      for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
-           U != UE; ++U) {
-        // Don't count self uses.
-        if (*U == PN)
-          continue;
-        NumUses++;
+        buildTree_rec(Operands, Depth+1);
       }
-      if (NumUses > 1) {
-        DEBUG(dbgs() << "SLP: Adding PHI to MultiUserVals "
-              "because it has " << NumUses << " users:" << *PN << " \n");
-        UseInfo UI(VL0, 0);
-        MultiUserVals[PN] = UI;
-      }
-      continue;
-    }
-
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // Remember to check if all of the users of this instruction are vectorized
-    // within our tree. At depth zero we have no local users, only external
-    // users that we don't care about.
-    if (Depth && I && I->getNumUses() > 1) {
-      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
-            "because it has " << I->getNumUses() << " users:" << *I << " \n");
-      UseInfo UI(VL0, LastIndex);
-      MultiUserVals[I] = UI;
+      return;
     }
-  }
-
-  // Check that the instruction is only used within one lane.
-  for (int i = 0, e = VL.size(); i < e; ++i) {
-    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) {
-      DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n");
-      return MustGather.insert(VL.begin(), VL.end());
+    case Instruction::Select:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth+1);
+      }
+      return;
     }
-    // Make this instruction as 'seen' and remember the lane.
-    LaneMap[VL[i]] = i;
-  }
-
-  unsigned Opcode = getSameOpcode(VL);
-  if (!Opcode)
-    return MustGather.insert(VL.begin(), VL.end());
-
-  switch (Opcode) {
-  case Instruction::PHI: {
-    PHINode *PH = dyn_cast<PHINode>(VL0);
+    case Instruction::Store: {
+      // Check if the stores are consecutive or of we need to swizzle them.
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Non consecutive store.\n");
+          return;
+        }
 
-    // Stop self cycles.
-    if (VisitedPHIs.count(PH))
-        return;
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
-    VisitedPHIs.insert(PH);
-    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
       ValueList Operands;
-      // Prepare the operand vector.
       for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
 
-      getTreeUses_rec(Operands, Depth + 1);
-    }
-    return;
-  }
-  case Instruction::ExtractElement: {
-    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
-    // No need to follow ExtractElements that are going to be optimized away.
-    if (CanReuseExtract(VL, VL.size(), VecTy))
+      // We can ignore these values because we are sinking them down.
+      MemBarrierIgnoreList.insert(VL.begin(), VL.end());
+      buildTree_rec(Operands, Depth + 1);
       return;
-    // Fall through.
-  }
-  case Instruction::Load:
-    return;
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast:
-  case Instruction::Select:
-  case Instruction::ICmp:
-  case Instruction::FCmp:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      getTreeUses_rec(Operands, Depth + 1);
     }
-    return;
-  }
-  case Instruction::Store: {
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j)
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-    getTreeUses_rec(Operands, Depth + 1);
-    return;
-  }
-  default:
-    return MustGather.insert(VL.begin(), VL.end());
+    default:
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+      return;
   }
 }
 
-int FuncSLP::getLastIndex(ArrayRef<Value *> VL) {
-  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
-  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
-  BlockNumbering &BN = BlocksNumbers[BB];
-
-  int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
-  for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
-  return MaxIdx;
-}
-
-Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) {
-  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
-  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
-  BlockNumbering &BN = BlocksNumbers[BB];
-
-  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
-  return BN.getInstruction(MaxIdx);
-}
-
-Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) {
-  BlockNumbering &BN = BlocksNumbers[BB];
-  return BN.getInstruction(Index);
-}
-
-int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
-  BasicBlock *BB = getSameBlock(VL);
-  assert(BB && "All instructions must come from the same block");
-  BlockNumbering &BN = BlocksNumbers[BB];
-
-  // Find the first user of the values.
-  int FirstUser = BN.getIndex(BB->getTerminator());
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
-         U != UE; ++U) {
-      Instruction *Instr = dyn_cast<Instruction>(*U);
-
-      if (!Instr || Instr->getParent() != BB)
-        continue;
-
-      FirstUser = std::min(FirstUser, BN.getIndex(Instr));
-    }
-  }
-  return FirstUser;
-}
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+  ArrayRef<Value*> VL = E->Scalars;
 
-int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
   Type *ScalarTy = VL[0]->getType();
-
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     ScalarTy = SI->getValueOperand()->getType();
-
-  /// Don't mess with vectors.
-  if (ScalarTy->isVectorTy())
-    return FuncSLP::MAX_COST;
-
-  if (allConstant(VL))
-    return 0;
-
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
 
-  if (isSplat(VL))
-    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-
-  int GatherCost = getGatherCost(VecTy);
-  if (Depth == RecursionMaxDepth || needToGatherAny(VL))
-    return GatherCost;
-
-  BasicBlock *BB = getSameBlock(VL);
-  unsigned Opcode = getSameOpcode(VL);
-  assert(Opcode && BB && "Invalid Instruction Value");
-
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    int MaxIdx = getLastIndex(VL);
-    Instruction *Last = getInstructionForIndex(MaxIdx, BB);
-
-    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-      if (VL[i] == Last)
-        continue;
-      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
-                     << "\n because of " << *Barrier << "\n");
-        return MAX_COST;
-      }
+  if (E->NeedToGather) {
+    if (allConstant(VL))
+      return 0;
+    if (isSplat(VL)) {
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
     }
+    return getGatherCost(E->Scalars);
   }
 
-  // Calculate the extract cost.
-  unsigned ExternalUserExtractCost = 0;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    if (ExtractedLane.count(cast<Instruction>(VL[i])))
-      ExternalUserExtractCost +=
-        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-
+  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
+         "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
+  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
-  case Instruction::PHI: {
-    PHINode *PH = dyn_cast<PHINode>(VL0);
-
-    // Stop self cycles.
-    if (VisitedPHIs.count(PH))
-        return 0;
-
-    VisitedPHIs.insert(PH);
-    int TotalCost = 0;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {      
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
-
-      int Cost = getTreeCost_rec(Operands, Depth + 1);
-      if (Cost == MAX_COST)
-        return MAX_COST;
-      TotalCost += TotalCost;
-    }
-
-    if (TotalCost > GatherCost) {
-      MustGather.insert(VL.begin(), VL.end());
-      return GatherCost;
-    }
-
-    return TotalCost + ExternalUserExtractCost;
-  }
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
+    case Instruction::PHI: {
       return 0;
-    return getGatherCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList Operands;
-    Type *SrcTy = VL0->getOperand(0)->getType();
-    // Prepare the operand vector.
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      // Check that the casted type is the same for all users.
-      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
-        return getGatherCost(VecTy);
     }
-
-    int Cost = getTreeCost_rec(Operands, Depth + 1);
-    if (Cost == MAX_COST)
-      return MAX_COST;
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    Cost += (VecCost - ScalarCost);
-
-    if (Cost > GatherCost) {
-      MustGather.insert(VL.begin(), VL.end());
-      return GatherCost;
+    case Instruction::ExtractElement: {
+      if (CanReuseExtract(VL))
+        return 0;
+      return getGatherCost(VecTy);
     }
-
-    return Cost + ExternalUserExtractCost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return getGatherCost(VecTy);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      Type *SrcTy = VL0->getOperand(0)->getType();
+
+      // Calculate the cost of this instruction.
+      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+                                                         VL0->getType(), SrcTy);
+
+      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+      return VecCost - ScalarCost;
     }
-    // Fall through.
-  }
-  case Instruction::Select:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    int TotalCost = 0;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      int Cost = getTreeCost_rec(Operands, Depth + 1);
-      if (Cost == MAX_COST)
-        return MAX_COST;
-      TotalCost += Cost;
+    case Instruction::FCmp:
+    case Instruction::ICmp:
+    case Instruction::Select:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Calculate the cost of this instruction.
+      int ScalarCost = 0;
+      int VecCost = 0;
+      if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
+          Opcode == Instruction::Select) {
+        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+        ScalarCost = VecTy->getNumElements() *
+        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+        VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+      } else {
+        ScalarCost = VecTy->getNumElements() *
+        TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+      }
+      return VecCost - ScalarCost;
     }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = 0;
-    int VecCost = 0;
-    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
-        Opcode == Instruction::Select) {
-      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-      ScalarCost =
-          VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    } else {
-      ScalarCost = VecTy->getNumElements() *
-                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    case Instruction::Load: {
+      // Cost of wide load - cost of scalar loads.
+      int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+      return VecLdCost - ScalarLdCost;
     }
-    TotalCost += (VecCost - ScalarCost);
-
-    if (TotalCost > GatherCost) {
-      MustGather.insert(VL.begin(), VL.end());
-      return GatherCost;
+    case Instruction::Store: {
+      // We know that we can merge the stores. Calculate the cost.
+      int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+      return VecStCost - ScalarStCost;
     }
-
-    return TotalCost + ExternalUserExtractCost;
+    default:
+      llvm_unreachable("Unknown instruction");
   }
-  case Instruction::Load: {
-    // If we are scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
-        return getGatherCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int TotalCost = VecLdCost - ScalarLdCost;
-
-    if (TotalCost > GatherCost) {
-      MustGather.insert(VL.begin(), VL.end());
-      return GatherCost;
-    }
+}
 
-    return TotalCost + ExternalUserExtractCost;
+int BoUpSLP::getTreeCost() {
+  int Cost = 0;
+  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
+        VectorizableTree.size() << ".\n");
+
+  for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
+    int C = getEntryCost(&VectorizableTree[i]);
+    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
+          << *VectorizableTree[i].Scalars[0] << " .\n");
+    Cost += C;
   }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
+  DEBUG(dbgs() << "SLP: Total Cost " << Cost << ".\n");
+  return  Cost;
+}
 
-    int Cost = getTreeCost_rec(Operands, Depth + 1);
-    if (Cost == MAX_COST)
-      return MAX_COST;
+int BoUpSLP::getGatherCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
 
-    int TotalCost = StoreCost + Cost;
-    return TotalCost + ExternalUserExtractCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getGatherCost(VecTy);
-  }
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
+  // Find the type of the operands in VL.
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  // Find the cost of inserting/extracting values from the vector.
+  return getGatherCost(VecTy);
 }
 
-int FuncSLP::getTreeCost(ArrayRef<Value *> VL) {
-  // Get rid of the list of stores that were removed, and from the
-  // lists of instructions with multiple users.
-  MemBarrierIgnoreList.clear();
-  LaneMap.clear();
-  MultiUserVals.clear();
-  ExtractedLane.clear();
-  MustGather.clear();
-  VisitedPHIs.clear();
-
-  if (!getSameBlock(VL))
-    return MAX_COST;
-
-  // Find the location of the last root.
-  int LastRootIndex = getLastIndex(VL);
-  int FirstUserIndex = getFirstUserIndex(VL);
-
-  // Don't vectorize if there are users of the tree roots inside the tree
-  // itself.
-  if (LastRootIndex > FirstUserIndex)
-    return MAX_COST;
-
-  // Scan the tree and find which value is used by which lane, and which values
-  // must be scalarized.
-  getTreeUses_rec(VL, 0);
-
-  // Check that instructions with multiple users can be vectorized. Mark
-  // unsafe instructions.
-  for (MapVector<Instruction *, UseInfo>::iterator UI = MultiUserVals.begin(),
-       e = MultiUserVals.end(); UI != e; ++UI) {
-    Instruction *Scalar = UI->first;
-
-    if (MustGather.count(Scalar))
-      continue;
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
 
-    assert(LaneMap.count(Scalar) && "Unknown scalar");
-    int ScalarLane = LaneMap[Scalar];
-
-    bool ExternalUse = false;
-    // Check that all of the users of this instr are within the tree.
-    for (Value::use_iterator Usr = Scalar->use_begin(),
-         UE = Scalar->use_end(); Usr != UE; ++Usr) {
-      // If this user is within the tree, make sure it is from the same lane.
-      // Notice that we have both in-tree and out-of-tree users.
-      if (LaneMap.count(*Usr)) {
-        if (LaneMap[*Usr] != ScalarLane) {
-          DEBUG(dbgs() << "SLP: Adding to MustExtract "
-                "because of an out-of-lane usage.\n");
-          MustGather.insert(Scalar);
-          break;
-        }
-        continue;
-      }
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return 0;
+}
 
-      // We have an out-of-tree user. Check if we can place an 'extract'.
-      Instruction *User = cast<Instruction>(*Usr);
-      // We care about the order only if the user is in the same block.
-      if (User->getParent() == Scalar->getParent()) {
-        int LastLoc = UI->second.LastIndex;
-        BlockNumbering &BN = BlocksNumbers[User->getParent()];
-        int UserIdx = BN.getIndex(User);
-        if (UserIdx <= LastLoc) {
-          DEBUG(dbgs() << "SLP: Adding to MustExtract because of an external "
-                "user that we can't schedule.\n");
-          MustGather.insert(Scalar);
-          break;
-        }
-      }
-      // We have an external user.
-      ExternalUse = true;
-    }
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I))
+    return S->getPointerAddressSpace();
+  return -1;
+}
 
-    if (ExternalUse) {
-      // Items that are left in MultiUserVals are to be extracted.
-      // ExtractLane is used for the lookup.
-      ExtractedLane.insert(Scalar);
-    }
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
 
-  }
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB))
+    return false;
 
-  // Now calculate the cost of vectorizing the tree.
-  return getTreeCost_rec(VL, 0);
-}
-bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
-  unsigned ChainLen = Chain.size();
-  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
-               << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType())
+    return false;
 
-  if (!isPowerOf2_32(Sz) || VF < 2)
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non constant distance.
+  if (!ConstOffSCEV)
     return false;
 
-  bool Changed = false;
-  // Look for profitable vectorizable trees at all offsets, starting at zero.
-  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
-    if (i + VF > e)
-      break;
-    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
-                 << "\n");
-    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The Instructions are connsecutive if the size of the first load/store is
+  // the same as the offset.
+  int64_t Sz = DL->getTypeStoreSize(Ty);
+  return ((-Offset) == Sz);
+}
 
-    int Cost = getTreeCost(Operands);
-    if (Cost == FuncSLP::MAX_COST)
+Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instruction from SRC to DST and check if
+  /// the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I))
       continue;
-    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
-    if (Cost < CostThreshold) {
-      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
-      vectorizeTree(Operands);
-
-      // Remove the scalar stores.
-      for (int j = 0, e = VF; j < e; ++j)
-        cast<Instruction>(Operands[j])->eraseFromParent();
-
-      // Move to the next bundle.
-      i += VF - 1;
-      Changed = true;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory())
+        continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory())
+        continue;
     }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
   }
+  return 0;
+}
 
-  if (Changed || ChainLen > VF)
-    return Changed;
+int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
 
-  // Handle short chains. This helps us catch types such as <3 x float> that
-  // are smaller than vector size.
-  int Cost = getTreeCost(Chain);
-  if (Cost == FuncSLP::MAX_COST)
-    return false;
-  if (Cost < CostThreshold) {
-    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
-                 << " for size = " << ChainLen << "\n");
-    vectorizeTree(Chain);
+  int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  return MaxIdx;
+}
 
-    // Remove all of the scalar stores.
-    for (int i = 0, e = Chain.size(); i < e; ++i)
-      cast<Instruction>(Chain[i])->eraseFromParent();
+Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
 
-    return true;
-  }
+  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  Instruction *I = BN.getInstruction(MaxIdx);
+  assert(I && "bad location");
+  return I;
+}
 
-  return false;
+Instruction *BoUpSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) {
+  BlockNumbering &BN = BlocksNumbers[BB];
+  return BN.getInstruction(Index);
 }
 
-bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
-  SetVector<Value *> Heads, Tails;
-  SmallDenseMap<Value *, Value *> ConsecutiveChain;
+int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
+  BasicBlock *BB = getSameBlock(VL);
+  assert(BB && "All instructions must come from the same block");
+  BlockNumbering &BN = BlocksNumbers[BB];
 
-  // We may run into multiple chains that merge into a single chain. We mark the
-  // stores that we vectorized so that we don't visit the same store twice.
-  ValueSet VectorizedStores;
-  bool Changed = false;
+  // Find the first user of the values.
+  int FirstUser = BN.getIndex(BB->getTerminator());
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
+         U != UE; ++U) {
+      Instruction *Instr = dyn_cast<Instruction>(*U);
 
-  // Do a quadratic search on all of the given stores and find
-  // all of the pairs of loads that follow each other.
-  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
-    for (unsigned j = 0; j < e; ++j) {
-      if (i == j)
+      if (!Instr || Instr->getParent() != BB)
         continue;
 
-      if (isConsecutiveAccess(Stores[i], Stores[j])) {
-        Tails.insert(Stores[j]);
-        Heads.insert(Stores[i]);
-        ConsecutiveChain[Stores[i]] = Stores[j];
-      }
-    }
-
-  // For stores that start but don't end a link in the chain:
-  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
-       it != e; ++it) {
-    if (Tails.count(*it))
-      continue;
-
-    // We found a store instr that starts a chain. Now follow the chain and try
-    // to vectorize it.
-    ValueList Operands;
-    Value *I = *it;
-    // Collect the chain into a list.
-    while (Tails.count(I) || Heads.count(I)) {
-      if (VectorizedStores.count(I))
-        break;
-      Operands.push_back(I);
-      // Move to the next value in the chain.
-      I = ConsecutiveChain[I];
+      FirstUser = std::min(FirstUser, BN.getIndex(Instr));
     }
-
-    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
-
-    // Mark the vectorized stores so that we don't vectorize them again.
-    if (Vectorized)
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-    Changed |= Vectorized;
   }
-
-  return Changed;
+  return FirstUser;
 }
 
-Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   Value *Vec = UndefValue::get(Ty);
   // Generate the 'InsertElement' instruction.
   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
@@ -1103,282 +1011,292 @@ Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   return Vec;
 }
 
-Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) {
-  BuilderLocGuard Guard(Builder);
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+  if (ScalarToTreeEntry.count(VL[0])) {
+    int Idx = ScalarToTreeEntry[VL[0]];
+    TreeEntry *E = &VectorizableTree[Idx];
+    if (E->isSame(VL))
+      return vectorizeTree(E);
+  }
 
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
 
-  if (needToGatherAny(VL))
-    return Gather(VL, VecTy);
+  return Gather(VL, VecTy);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+  BuilderLocGuard Guard(Builder);
 
-  if (VectorizedValues.count(VL[0])) {
-    DEBUG(dbgs() << "SLP: Diamond merged at depth.\n");
-    return VectorizedValues[VL[0]];
+  if (E->VectorizedValue) {
+    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+    return E->VectorizedValue;
   }
 
-  Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+  Type *ScalarTy = E->Scalars[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
 
-  switch (Opcode) {
-  case Instruction::PHI: {
-    PHINode *PH = dyn_cast<PHINode>(VL0);
-    Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
-    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
-    VectorizedValues[VL0] = NewPhi;
+  if (E->NeedToGather) {
+    return Gather(E->Scalars, VecTy);
+  }
 
-    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
-      ValueList Operands;
-      BasicBlock *IBB = PH->getIncomingBlock(i);
+  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+  unsigned Opcode = VL0->getOpcode();
+  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
 
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(IBB));
+  switch (Opcode) {
+    case Instruction::PHI: {
+      PHINode *PH = dyn_cast<PHINode>(VL0);
+      Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
+      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+      E->VectorizedValue = NewPhi;
+
+      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+        ValueList Operands;
+        BasicBlock *IBB = PH->getIncomingBlock(i);
+
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < E->Scalars.size(); ++j)
+          Operands.push_back(cast<PHINode>(E->Scalars[j])->
+                             getIncomingValueForBlock(IBB));
+
+        Builder.SetInsertPoint(IBB->getTerminator());
+        Value *Vec = vectorizeTree(Operands);
+        NewPhi->addIncoming(Vec, IBB);
+      }
 
-      Builder.SetInsertPoint(IBB->getTerminator());
-      Value *Vec = vectorizeTree_rec(Operands);
-      NewPhi->addIncoming(Vec, IBB);
+      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+             "Invalid number of incoming values");
+      return NewPhi;
     }
 
-    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
-           "Invalid number of incoming values");
-    return NewPhi;
-  }
-
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return VL0->getOperand(0);
-    return Gather(VL, VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList INVL;
-    for (int i = 0, e = VL.size(); i < e; ++i)
-      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *InVec = vectorizeTree_rec(INVL);
-    CastInst *CI = dyn_cast<CastInst>(VL0);
-    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return Gather(VL, VecTy);
+    case Instruction::ExtractElement: {
+      if (CanReuseExtract(E->Scalars)) {
+        Value *V = VL0->getOperand(0);
+        E->VectorizedValue = V;
+        return V;
+      }
+      return Gather(E->Scalars, VecTy);
     }
-
-    ValueList LHSV, RHSV;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      ValueList INVL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      Value *InVec = vectorizeTree(INVL);
+      CastInst *CI = dyn_cast<CastInst>(VL0);
+      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+      E->VectorizedValue = V;
+      return V;
     }
+    case Instruction::FCmp:
+    case Instruction::ICmp: {
+      ValueList LHSV, RHSV;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      }
 
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *L = vectorizeTree_rec(LHSV);
-    Value *R = vectorizeTree_rec(RHSV);
-    Value *V;
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      Value *L = vectorizeTree(LHSV);
+      Value *R = vectorizeTree(RHSV);
+      Value *V;
 
-    if (Opcode == Instruction::FCmp)
-      V = Builder.CreateFCmp(P0, L, R);
-    else
-      V = Builder.CreateICmp(P0, L, R);
+      CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+      if (Opcode == Instruction::FCmp)
+        V = Builder.CreateFCmp(P0, L, R);
+      else
+        V = Builder.CreateICmp(P0, L, R);
 
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Select: {
-    ValueList TrueVec, FalseVec, CondVec;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
-      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
+      E->VectorizedValue = V;
+      return V;
     }
+    case Instruction::Select: {
+      ValueList TrueVec, FalseVec, CondVec;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+        FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+      }
 
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *True = vectorizeTree_rec(TrueVec);
-    Value *False = vectorizeTree_rec(FalseVec);
-    Value *Cond = vectorizeTree_rec(CondVec);
-    Value *V = Builder.CreateSelect(Cond, True, False);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0, e = VL.size(); i < e; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      Value *Cond = vectorizeTree(CondVec);
+      Value *True = vectorizeTree(TrueVec);
+      Value *False = vectorizeTree(FalseVec);
+      Value *V = Builder.CreateSelect(Cond, True, False);
+      E->VectorizedValue = V;
+      return V;
     }
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      ValueList LHSVL, RHSVL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      }
 
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *LHS = vectorizeTree_rec(LHSVL);
-    Value *RHS = vectorizeTree_rec(RHSVL);
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
 
-    if (LHS == RHS) {
-      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
-    }
+      if (LHS == RHS && isa<Instruction>(LHS)) {
+        assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+      }
 
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Load: {
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VL.size(); i < e; ++i)
-      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
-        return Gather(VL, VecTy);
-
-    // Loads are inserted at the head of the tree because we don't want to
-    // sink them all the way down past store instructions.
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    LoadInst *LI = cast<LoadInst>(VL0);
-    Value *VecPtr =
-        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
-    unsigned Alignment = LI->getAlignment();
-    LI = Builder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-
-    VectorizedValues[VL0] = LI;
-    return LI;
+      BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+      E->VectorizedValue = V;
+      return V;
+    }
+    case Instruction::Load: {
+      // Loads are inserted at the head of the tree because we don't want to
+      // sink them all the way down past store instructions.
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      LoadInst *LI = cast<LoadInst>(VL0);
+      Value *VecPtr =
+      Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+      unsigned Alignment = LI->getAlignment();
+      LI = Builder.CreateLoad(VecPtr);
+      LI->setAlignment(Alignment);
+      E->VectorizedValue = LI;
+      return LI;
+    }
+    case Instruction::Store: {
+      StoreInst *SI = cast<StoreInst>(VL0);
+      unsigned Alignment = SI->getAlignment();
+
+      ValueList ValueOp;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+      Value *VecValue = vectorizeTree(ValueOp);
+      Value *VecPtr =
+      Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+      StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+      S->setAlignment(Alignment);
+      E->VectorizedValue = S;
+      return S;
+    }
+    default:
+    llvm_unreachable("unknown inst");
   }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
+  return 0;
+}
 
-    ValueList ValueOp;
-    for (int i = 0, e = VL.size(); i < e; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+void BoUpSLP::vectorizeTree() {
+  vectorizeTree(&VectorizableTree[0]);
 
-    Value *VecValue = vectorizeTree_rec(ValueOp);
+  // For each vectorized value:
+  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+    TreeEntry *Entry = &VectorizableTree[EIdx];
 
-    Builder.SetInsertPoint(getLastInstruction(VL));
-    Value *VecPtr =
-        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-    return 0;
-  }
-  default:
-    return Gather(VL, VecTy);
-  }
-}
+    // For each lane:
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
 
-Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
-  Builder.SetInsertPoint(getLastInstruction(VL));
-  Value *V = vectorizeTree_rec(VL);
+      // No need to handle users of gathered values.
+      if (Entry->NeedToGather)
+        continue;
 
-  DEBUG(dbgs() << "SLP: Placing 'extracts'\n");
-  for (SetVector<Instruction*>::iterator it = ExtractedLane.begin(), e =
-       ExtractedLane.end(); it != e; ++it) {
-    Instruction *Scalar = *it;
-    DEBUG(dbgs() << "SLP: Looking at " << *Scalar);
+      Value *Vec = Entry->VectorizedValue;
+      assert(Vec && "Can't find vectorizable value");
 
-    if (!Scalar)
-      continue;
+      SmallVector<User*, 16> Users(Scalar->use_begin(), Scalar->use_end());
 
-    Instruction *Loc = 0;
+      for (SmallVector<User*, 16>::iterator User = Users.begin(),
+           UE = Users.end(); User != UE; ++User) {
+        DEBUG(dbgs() << "SLP: \tupdating user  " << **User << ".\n");
 
-    assert(MultiUserVals.count(Scalar) && "Can't find the lane to extract");
-    Instruction *Leader = MultiUserVals[Scalar].Leader;
+        bool Gathered = MustGather.count(*User);
 
-    // This value is gathered so we don't need to extract from anywhere.
-    if (!VectorizedValues.count(Leader))
-      continue;
+        // Skip in-tree scalars that become vectors.
+        if (ScalarToTreeEntry.count(*User) && !Gathered) {
+          DEBUG(dbgs() << "SLP: \tUser will be removed soon:" <<
+                **User << ".\n");
+          int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+          assert(!VectorizableTree[Idx].NeedToGather && "bad state ?");
+          continue;
+        }
 
-    Value *Vec = VectorizedValues[Leader];
-    if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
-      Loc = PN->getParent()->getFirstInsertionPt();
-    } else {
-      Instruction *I = cast<Instruction>(Vec);
-      BasicBlock::iterator L = *I;
-      Loc = ++L;
-    }
+        if (!isa<Instruction>(*User))
+          continue;
 
-    Builder.SetInsertPoint(Loc);
-    assert(LaneMap.count(Scalar) && "Can't find the extracted lane.");
-    int Lane = LaneMap[Scalar];
-    Value *Idx = Builder.getInt32(Lane);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
+        // Generate extracts for out-of-tree users.
+        // Find the insertion point for the extractelement lane.
+        Instruction *Loc = 0;
+        if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+          Loc = PN->getParent()->getFirstInsertionPt();
+        } else if (Instruction *Iv = dyn_cast<Instruction>(Vec)){
+          Loc = ++((BasicBlock::iterator)*Iv);
+        } else {
+          Loc = F->getEntryBlock().begin();
+        }
 
-    bool Replaced = false;;
-    for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
-         U != UE; ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      // No need to replace instructions that are inside our lane map.
-      if (LaneMap.count(UI))
-        continue;
+        Builder.SetInsertPoint(Loc);
+        Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+        (*User)->replaceUsesOfWith(Scalar, Ex);
+        DEBUG(dbgs() << "SLP: \tupdated user:" << **User << ".\n");
+      }
 
-      UI->replaceUsesOfWith(Scalar ,Extract);
-      Replaced = true;
+      Type *Ty = Scalar->getType();
+      if (!Ty->isVoidTy()) {
+        for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end();
+             User != UE; ++User) {
+          DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+          assert(!MustGather.count(*User) &&
+                 "Replacing gathered value with undef");
+          assert(ScalarToTreeEntry.count(*User) &&
+                 "Replacing out-of-tree value with undef");
+        }
+        Value *Undef = UndefValue::get(Ty);
+        Scalar->replaceAllUsesWith(Undef);
+      }
+      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+      cast<Instruction>(Scalar)->eraseFromParent();
     }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
   }
 
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  forgetNumbering();
-
-  // Clear the state.
-  MustGather.clear();
-  VisitedPHIs.clear();
-  VectorizedValues.clear();
-  MemBarrierIgnoreList.clear();
-  return V;
-}
-
-Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
-  Instruction *LastInst = getLastInstruction(Operands);
-  Value *Vec = vectorizeTree(Operands);
-  // After vectorizing the operands we need to generate extractelement
-  // instructions and replace all of the uses of the scalar values with
-  // the values that we extracted from the vectorized tree.
-  Builder.SetInsertPoint(LastInst);
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
-    Operands[i]->replaceAllUsesWith(S);
+  for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+    BlocksNumbers[it].forget();
   }
-
-  forgetNumbering();
-  return Vec;
 }
 
-void FuncSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence() {
+  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+        << " gather sequences instructions.\n");
   // LICM InsertElementInst sequences.
   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
        e = GatherSeq.end(); it != e; ++it) {
@@ -1449,8 +1367,6 @@ void FuncSLP::optimizeGatherSequence() {
     assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
     (*v)->eraseFromParent();
   }
-
-  forgetNumbering();
 }
 
 /// The SLPVectorizer Pass.
@@ -1492,7 +1408,7 @@ struct SLPVectorizer : public FunctionPass {
 
     // Use the bollom up slp vectorizer to construct chains that start with
     // he store instructions.
-    FuncSLP R(&F, SE, DL, TTI, AA, LI, DT);
+    BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
 
     // Scan the blocks in the function in post order.
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1536,31 +1452,146 @@ private:
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
   /// then we calculate the cost of extracting the scalars from the vector.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
-  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
+  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
 
   /// \brief Vectorize the stores that were collected in StoreRefs.
-  bool vectorizeStoreChains(FuncSLP &R);
+  bool vectorizeStoreChains(BoUpSLP &R);
 
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
-  bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
+  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+                           BoUpSLP &R);
 
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+                       BoUpSLP &R);
 private:
   StoreListMap StoreRefs;
 };
 
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                          int CostThreshold, BoUpSLP &R) {
+  unsigned ChainLen = Chain.size();
+  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+        << "\n");
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2)
+    return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+    if (i + VF > e)
+      break;
+    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+          << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    R.buildTree(Operands);
+
+    int Cost = R.getTreeCost();
+
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  if (Changed || ChainLen > VF)
+    return Changed;
+
+  // Handle short chains. This helps us catch types such as <3 x float> that
+  // are smaller than vector size.
+  R.buildTree(Chain);
+
+  int Cost = R.getTreeCost();
+
+  if (Cost < CostThreshold) {
+    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
+          << " for size = " << ChainLen << "\n");
+    R.vectorizeTree();
+    return true;
+  }
+
+  return false;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+                                      int costThreshold, BoUpSLP &R) {
+  SetVector<Value *> Heads, Tails;
+  SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  BoUpSLP::ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of loads that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j)
+        continue;
+
+      if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+       it != e; ++it) {
+    if (Tails.count(*it))
+      continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    BoUpSLP::ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I))
+        break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   unsigned count = 0;
   StoreRefs.clear();
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1585,14 +1616,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
   return count;
 }
 
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
   return tryToVectorizeList(VL, R, true);
 }
 
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                        bool NeedExtracts) {
   if (VL.size() < 2)
     return false;
@@ -1615,9 +1646,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
       return 0;
   }
 
-  int Cost = R.getTreeCost(VL);
-  if (Cost == FuncSLP::MAX_COST)
-    return false;
+  R.buildTree(VL);
+  int Cost = R.getTreeCost();
 
   int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
   DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
@@ -1625,11 +1655,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
   if ((Cost + ExtrCost) >= -SLPCostThreshold)
     return false;
   DEBUG(dbgs() << "SLP: Vectorizing pair.\n");
-  R.vectorizeArith(VL);
+  R.vectorizeTree();
   return true;
 }
 
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   if (!V)
     return false;
 
@@ -1669,7 +1699,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
   return 0;
 }
 
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     if (isa<DbgInfoIntrinsic>(it))
@@ -1737,7 +1767,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
   return Changed;
 }
 
-bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -1748,7 +1778,7 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                  << it->second.size() << ".\n");
 
-    Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    Changed |= vectorizeStores(it->second, -SLPCostThreshold, R);
   }
   return Changed;
 }
diff --git a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
new file mode 100644
index 00000000000..51b1c08fb36
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, i16*, i8*, i8*, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] }
+%struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 }
+
+define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) {
+entry:
+  %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4
+  %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5
+  br label %do.body66.i
+
+do.body66.i:                                      ; preds = %do.cond.i, %entry
+  %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ]
+  %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ]
+  %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i
+  %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i
+  br i1 undef, label %do.cond.i, label %if.else.i
+
+if.else.i:                                        ; preds = %do.body66.i
+  %sub91.i = sub i32 %.range.2.i, undef
+  %sub92.i = sub i32 %.code.2.i, undef
+  br label %do.cond.i
+
+do.cond.i:                                        ; preds = %if.else.i, %do.body66.i
+  %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ]
+  %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ]
+  br i1 undef, label %do.body66.i, label %do.end1006.i
+
+do.end1006.i:                                     ; preds = %do.cond.i
+  %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i
+  %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i
+  store i32 %.range.4.i, i32* %range20.i, align 4
+  store i32 %.code.4.i, i32* %code21.i, align 4
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
new file mode 100644
index 00000000000..565905df2c6
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 }
+
+define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) {
+entry:
+  br i1 undef, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.else:                                          ; preds = %entry
+  %m_numConstraintRows4 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 0
+  %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 1
+  br i1 undef, label %land.lhs.true.i.1, label %if.then7.1
+
+land.lhs.true.i.1:                                ; preds = %if.else
+  br i1 undef, label %for.inc.1, label %if.then7.1
+
+if.then7.1:                                       ; preds = %land.lhs.true.i.1, %if.else
+  %inc.1 = add nsw i32 0, 1
+  store i32 %inc.1, i32* %m_numConstraintRows4, align 4
+  %dec.1 = add nsw i32 6, -1
+  store i32 %dec.1, i32* %nub5, align 4
+  br label %for.inc.1
+
+for.inc.1:                                        ; preds = %if.then7.1, %land.lhs.true.i.1
+  %0 = phi i32 [ %dec.1, %if.then7.1 ], [ 6, %land.lhs.true.i.1 ]
+  %1 = phi i32 [ %inc.1, %if.then7.1 ], [ 0, %land.lhs.true.i.1 ]
+  %inc.2 = add nsw i32 %1, 1
+  store i32 %inc.2, i32* %m_numConstraintRows4, align 4
+  %dec.2 = add nsw i32 %0, -1
+  store i32 %dec.2, i32* %nub5, align 4
+  unreachable
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll
new file mode 100644
index 00000000000..df026d12cb9
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_bullet2.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332 = type { float, [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, [4 x float], float, float, [4 x float], float, float, [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330] }
+%class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] }
+%class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 }
+
+define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) {
+entry:
+  %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1
+  %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2
+  %0 = load float* %arrayidx36, align 4
+  %add587 = fadd float undef, undef
+  %sub600 = fsub float %add587, undef
+  store float %sub600, float* undef, align 4
+  %sub613 = fsub float %add587, %sub600
+  store float %sub613, float* %arrayidx26, align 4
+  %add626 = fadd float %0, undef
+  %sub639 = fsub float %add626, undef
+  %sub652 = fsub float %add626, %sub639
+  store float %sub652, float* %arrayidx36, align 4
+  br i1 undef, label %if.else1609, label %if.then1595
+
+if.then1595:                                      ; preds = %entry
+  br i1 undef, label %return, label %for.body.lr.ph.i.i1702
+
+for.body.lr.ph.i.i1702:                           ; preds = %if.then1595
+  unreachable
+
+if.else1609:                                      ; preds = %entry
+  unreachable
+
+return:                                           ; preds = %if.then1595
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
new file mode 100644
index 00000000000..ce0159071c6
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { double*, double*, double*, double** }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) {
+entry:
+  %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0
+  %0 = load double** %_M_cur2.i.i, align 8
+  %_M_first3.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 1
+  %_M_cur2.i.i81 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 0
+  %1 = load double** %_M_cur2.i.i81, align 8
+  %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 1
+  %2 = load double** %_M_first3.i.i83, align 8
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader
+
+while.cond.i.preheader:                           ; preds = %entry
+  br label %while.cond.i
+
+while.cond.i:                                     ; preds = %while.body.i, %while.cond.i.preheader
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i
+
+while.body.i:                                     ; preds = %while.cond.i
+  br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i
+
+_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry
+  %3 = phi double* [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ]
+  %4 = phi double* [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ]
+  store double* %4, double** %_M_cur2.i.i, align 8
+  store double* %3, double** %_M_first3.i.i, align 8
+  br i1 undef, label %if.then.i55, label %while.cond
+
+if.then.i55:                                      ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %if.then.i55, %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+  br label %while.cond
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
new file mode 100644
index 00000000000..e11be488f79
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @main() #0 {
+entry:
+  br i1 undef, label %while.body, label %while.end
+
+while.body:                                       ; preds = %entry
+  unreachable
+
+while.end:                                        ; preds = %entry
+  br i1 undef, label %for.end80, label %for.body75.lr.ph
+
+for.body75.lr.ph:                                 ; preds = %while.end
+  br label %for.body75
+
+for.body75:                                       ; preds = %for.body75, %for.body75.lr.ph
+  br label %for.body75
+
+for.end80:                                        ; preds = %while.end
+  br i1 undef, label %for.end300, label %for.body267.lr.ph
+
+for.body267.lr.ph:                                ; preds = %for.end80
+  br label %for.body267
+
+for.body267:                                      ; preds = %for.body267, %for.body267.lr.ph
+  %s.71010 = phi double [ 0.000000e+00, %for.body267.lr.ph ], [ %add297, %for.body267 ]
+  %mul269 = fmul double undef, undef
+  %mul270 = fmul double %mul269, %mul269
+  %add282 = fadd double undef, undef
+  %mul283 = fmul double %mul269, %add282
+  %add293 = fadd double undef, undef
+  %mul294 = fmul double %mul270, %add293
+  %add295 = fadd double undef, %mul294
+  %div296 = fdiv double %mul283, %add295
+  %add297 = fadd double %s.71010, %div296
+  br i1 undef, label %for.body267, label %for.end300
+
+for.end300:                                       ; preds = %for.body267, %for.end80
+  unreachable
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/crash_lame.ll b/test/Transforms/SLPVectorizer/X86/crash_lame.ll
new file mode 100644
index 00000000000..cfc3fa35a1e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_lame.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define fastcc void @dct36(double* %inbuf) #0 {
+entry:
+  %arrayidx41 = getelementptr inbounds double* %inbuf, i64 2
+  %arrayidx44 = getelementptr inbounds double* %inbuf, i64 1
+  %0 = load double* %arrayidx44, align 8, !tbaa !0
+  %add46 = fadd double %0, undef
+  store double %add46, double* %arrayidx41, align 8, !tbaa !0
+  %1 = load double* %inbuf, align 8, !tbaa !0
+  %add49 = fadd double %1, %0
+  store double %add49, double* %arrayidx44, align 8, !tbaa !0
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
new file mode 100644
index 00000000000..b35a5d74e2f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @RCModelEstimator() {
+entry:
+  br i1 undef, label %for.body.lr.ph, label %for.end.thread
+
+for.end.thread:                                   ; preds = %entry
+  unreachable
+
+for.body.lr.ph:                                   ; preds = %entry
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %for.body.lr.ph
+  br i1 undef, label %for.body3, label %if.end103
+
+for.cond14.preheader:                             ; preds = %for.inc11
+  br i1 undef, label %for.body16.lr.ph, label %if.end103
+
+for.body16.lr.ph:                                 ; preds = %for.cond14.preheader
+  br label %for.body16
+
+for.body3:                                        ; preds = %for.inc11, %for.end
+  br i1 undef, label %if.then7, label %for.inc11
+
+if.then7:                                         ; preds = %for.body3
+  br label %for.inc11
+
+for.inc11:                                        ; preds = %if.then7, %for.body3
+  br i1 false, label %for.cond14.preheader, label %for.body3
+
+for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
+  br i1 undef, label %for.end39, label %for.body16
+
+for.end39:                                        ; preds = %for.body16
+  br i1 undef, label %if.end103, label %for.cond45.preheader
+
+for.cond45.preheader:                             ; preds = %for.end39
+  br i1 undef, label %if.then88, label %if.else
+
+if.then88:                                        ; preds = %for.cond45.preheader
+  %mul89 = fmul double 0.000000e+00, 0.000000e+00
+  %mul90 = fmul double 0.000000e+00, 0.000000e+00
+  %sub91 = fsub double %mul89, %mul90
+  %div92 = fdiv double %sub91, undef
+  %mul94 = fmul double 0.000000e+00, 0.000000e+00
+  %mul95 = fmul double 0.000000e+00, 0.000000e+00
+  %sub96 = fsub double %mul94, %mul95
+  %div97 = fdiv double %sub96, undef
+  br label %if.end103
+
+if.else:                                          ; preds = %for.cond45.preheader
+  br label %if.end103
+
+if.end103:                                        ; preds = %if.else, %if.then88, %for.end39, %for.cond14.preheader, %for.end
+  %0 = phi double [ 0.000000e+00, %for.end39 ], [ %div97, %if.then88 ], [ 0.000000e+00, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ]
+  %1 = phi double [ undef, %for.end39 ], [ %div92, %if.then88 ], [ undef, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ]
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll
new file mode 100644
index 00000000000..d1e719c160a
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_lencod2.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @intrapred_luma() #0 {
+entry:
+  %conv153 = trunc i32 undef to i16
+  %arrayidx154 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 12
+  store i16 %conv153, i16* %arrayidx154, align 8, !tbaa !0
+  %arrayidx155 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 11
+  store i16 %conv153, i16* %arrayidx155, align 2, !tbaa !0
+  %arrayidx156 = getelementptr inbounds [13 x i16]* undef, i64 0, i64 10
+  store i16 %conv153, i16* %arrayidx156, align 4, !tbaa !0
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"short", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
new file mode 100644
index 00000000000..b3ca235c2c7
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @main() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.end44, %entry
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %if.then25, %for.body
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.inc21, %for.cond4.preheader
+  br label %for.body12
+
+for.body12:                                       ; preds = %if.end, %for.body6
+  %fZImg.069 = phi double [ undef, %for.body6 ], [ %add19, %if.end ]
+  %fZReal.068 = phi double [ undef, %for.body6 ], [ %add20, %if.end ]
+  %mul13 = fmul double %fZReal.068, %fZReal.068
+  %mul14 = fmul double %fZImg.069, %fZImg.069
+  %add15 = fadd double %mul13, %mul14
+  %cmp16 = fcmp ogt double %add15, 4.000000e+00
+  br i1 %cmp16, label %for.inc21, label %if.end
+
+if.end:                                           ; preds = %for.body12
+  %mul18 = fmul double undef, %fZImg.069
+  %add19 = fadd double undef, %mul18
+  %sub = fsub double %mul13, %mul14
+  %add20 = fadd double undef, %sub
+  br i1 undef, label %for.body12, label %for.inc21
+
+for.inc21:                                        ; preds = %if.end, %for.body12
+  br i1 undef, label %for.end23, label %for.body6
+
+for.end23:                                        ; preds = %for.inc21
+  br i1 undef, label %if.then25, label %if.then26
+
+if.then25:                                        ; preds = %for.end23
+  br i1 undef, label %for.end44, label %for.cond4.preheader
+
+if.then26:                                        ; preds = %for.end23
+  unreachable
+
+for.end44:                                        ; preds = %if.then25
+  br i1 undef, label %for.end48, label %for.body
+
+for.end48:                                        ; preds = %for.end44
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_rc4.ll b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll
new file mode 100644
index 00000000000..20374706d2b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_rc4.ll
@@ -0,0 +1,28 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.rc4_state.0.24 = type { i32, i32, [256 x i32] }
+
+define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) {
+entry:
+  %x1 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 0
+  %y2 = getelementptr inbounds %struct.rc4_state.0.24* %s, i64 0, i32 1
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ]
+  %conv4 = and i32 undef, 255
+  %conv7 = and i32 undef, 255
+  %idxprom842 = zext i32 %conv7 to i64
+  br i1 undef, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ]
+  %y.0.lcssa = phi i32 [ undef, %entry ], [ %conv7, %for.body ]
+  store i32 %x.0.lcssa, i32* %x1, align 4
+  store i32 %y.0.lcssa, i32* %y2, align 4
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
new file mode 100644
index 00000000000..05415456cf0
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] }
+
+define void @SIM4() {
+entry:
+  br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %lor.lhs.false
+  br i1 undef, label %for.end605, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %if.end
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc603, %for.body.lr.ph
+  br i1 undef, label %for.inc603, label %if.end12
+
+if.end12:                                         ; preds = %for.body
+  br i1 undef, label %land.lhs.true, label %land.lhs.true167
+
+land.lhs.true:                                    ; preds = %if.end12
+  br i1 undef, label %if.then17, label %land.lhs.true167
+
+if.then17:                                        ; preds = %land.lhs.true
+  br i1 undef, label %if.end98, label %land.rhs.lr.ph
+
+land.rhs.lr.ph:                                   ; preds = %if.then17
+  unreachable
+
+if.end98:                                         ; preds = %if.then17
+  %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1
+  br i1 undef, label %land.lhs.true167, label %if.then103
+
+if.then103:                                       ; preds = %if.end98
+  %.sub100 = select i1 undef, i32 250, i32 undef
+  %mul114 = shl nsw i32 %.sub100, 2
+  %from1115 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0
+  %cond125 = select i1 undef, i32 undef, i32 %mul114
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %land.rhs.i874, %if.then103
+  %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ]
+  %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ]
+  br i1 undef, label %land.rhs.i874, label %for.end.i
+
+land.rhs.i874:                                    ; preds = %for.cond.i
+  br i1 undef, label %for.cond.i, label %for.end.i
+
+for.end.i:                                        ; preds = %land.rhs.i874, %for.cond.i
+  br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %for.end.i
+  %add14.i = add nsw i32 %row.0.i, undef
+  %add15.i = add nsw i32 %col.0.i, undef
+  br label %extend_bw.exit
+
+if.end.i:                                         ; preds = %for.end.i
+  %add16.i = add i32 %cond125, %.sub100
+  %cmp26514.i = icmp slt i32 %add16.i, 0
+  br i1 %cmp26514.i, label %for.end33.i, label %for.body28.lr.ph.i
+
+for.body28.lr.ph.i:                               ; preds = %if.end.i
+  br label %for.end33.i
+
+for.end33.i:                                      ; preds = %for.body28.lr.ph.i, %if.end.i
+  br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i
+
+for.body52.lr.ph.i:                               ; preds = %for.end33.i
+  br label %for.end58.i
+
+for.end58.i:                                      ; preds = %for.body52.lr.ph.i, %for.end33.i
+  br label %while.cond260.i
+
+while.cond260.i:                                  ; preds = %land.rhs263.i, %for.end58.i
+  br i1 undef, label %land.rhs263.i, label %while.end275.i
+
+land.rhs263.i:                                    ; preds = %while.cond260.i
+  br i1 undef, label %while.cond260.i, label %while.end275.i
+
+while.end275.i:                                   ; preds = %land.rhs263.i, %while.cond260.i
+  br label %extend_bw.exit
+
+extend_bw.exit:                                   ; preds = %while.end275.i, %if.then.i
+  %add14.i1262 = phi i32 [ %add14.i, %if.then.i ], [ undef, %while.end275.i ]
+  %add15.i1261 = phi i32 [ %add15.i, %if.then.i ], [ undef, %while.end275.i ]
+  br i1 false, label %if.then157, label %land.lhs.true167
+
+if.then157:                                       ; preds = %extend_bw.exit
+  %add158 = add nsw i32 %add14.i1262, 1
+  store i32 %add158, i32* %from299, align 4
+  %add160 = add nsw i32 %add15.i1261, 1
+  store i32 %add160, i32* %from1115, align 4
+  br label %land.lhs.true167
+
+land.lhs.true167:                                 ; preds = %if.then157, %extend_bw.exit, %if.end98, %land.lhs.true, %if.end12
+  unreachable
+
+for.inc603:                                       ; preds = %for.body
+  br i1 undef, label %for.body, label %for.end605
+
+for.end605:                                       ; preds = %for.inc603, %if.end
+  unreachable
+
+return:                                           ; preds = %lor.lhs.false, %entry
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
new file mode 100644
index 00000000000..ac7e412a534
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.Ray.5.11.53.113.119.137.149.185.329.389.415 = type { %struct.Vec.0.6.48.108.114.132.144.180.324.384.414, %struct.Vec.0.6.48.108.114.132.144.180.324.384.414 }
+%struct.Vec.0.6.48.108.114.132.144.180.324.384.414 = type { double, double, double }
+
+; Function Attrs: ssp uwtable
+define void @main() #0 {
+entry:
+  br i1 undef, label %cond.true, label %cond.end
+
+cond.true:                                        ; preds = %entry
+  unreachable
+
+cond.end:                                         ; preds = %entry
+  br label %invoke.cont
+
+invoke.cont:                                      ; preds = %invoke.cont, %cond.end
+  br i1 undef, label %arrayctor.cont, label %invoke.cont
+
+arrayctor.cont:                                   ; preds = %invoke.cont
+  %agg.tmp99208.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 0
+  %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 0, i32 1
+  %agg.tmp101211.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 0
+  %agg.tmp101211.sroa.1.8.idx390 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.415* undef, i64 0, i32 1, i32 1
+  br label %for.cond36.preheader
+
+for.cond36.preheader:                             ; preds = %_Z5clampd.exit.1, %arrayctor.cont
+  br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1
+
+cond.false51.us:                                  ; preds = %for.body42.lr.ph.us
+  unreachable
+
+cond.true48.us:                                   ; preds = %for.body42.lr.ph.us
+  br i1 undef, label %cond.true63.us, label %cond.false66.us
+
+cond.false66.us:                                  ; preds = %cond.true48.us
+  %add.i276.us = fadd double 0.000000e+00, undef
+  %add.i264.us = fadd double %add.i276.us, 0.000000e+00
+  %add4.i267.us = fadd double undef, 0xBFA5CC2D1960285F
+  %mul.i254.us = fmul double %add.i264.us, 1.400000e+02
+  %mul2.i256.us = fmul double %add4.i267.us, 1.400000e+02
+  %add.i243.us = fadd double %mul.i254.us, 5.000000e+01
+  %add4.i246.us = fadd double %mul2.i256.us, 5.200000e+01
+  %mul.i.i.us = fmul double undef, %add.i264.us
+  %mul2.i.i.us = fmul double undef, %add4.i267.us
+  store double %add.i243.us, double* %agg.tmp99208.sroa.0.0.idx, align 8
+  store double %add4.i246.us, double* %agg.tmp99208.sroa.1.8.idx388, align 8
+  store double %mul.i.i.us, double* %agg.tmp101211.sroa.0.0.idx, align 8
+  store double %mul2.i.i.us, double* %agg.tmp101211.sroa.1.8.idx390, align 8
+  unreachable
+
+cond.true63.us:                                   ; preds = %cond.true48.us
+  unreachable
+
+for.body42.lr.ph.us:                              ; preds = %for.cond36.preheader
+  br i1 undef, label %cond.true48.us, label %cond.false51.us
+
+_Z5clampd.exit.1:                                 ; preds = %for.cond36.preheader
+  br label %for.cond36.preheader
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll
new file mode 100644
index 00000000000..84c7b3a0460
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt2.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601 = type { %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600, %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 }
+%struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double }
+
+; Function Attrs: ssp uwtable
+define void @_Z8radianceRK3RayiPt() #0 {
+entry:
+  br i1 undef, label %if.then78, label %if.then38
+
+if.then38:                                        ; preds = %entry
+  %mul.i.i790 = fmul double undef, undef
+  %mul3.i.i792 = fmul double undef, undef
+  %mul.i764 = fmul double undef, %mul3.i.i792
+  %mul4.i767 = fmul double undef, undef
+  %sub.i768 = fsub double %mul.i764, %mul4.i767
+  %mul6.i770 = fmul double undef, %mul.i.i790
+  %mul9.i772 = fmul double undef, %mul3.i.i792
+  %sub10.i773 = fsub double %mul6.i770, %mul9.i772
+  %mul.i736 = fmul double undef, %sub.i768
+  %mul2.i738 = fmul double undef, %sub10.i773
+  %mul.i727 = fmul double undef, %mul.i736
+  %mul2.i729 = fmul double undef, %mul2.i738
+  %add.i716 = fadd double undef, %mul.i727
+  %add4.i719 = fadd double undef, %mul2.i729
+  %add.i695 = fadd double undef, %add.i716
+  %add4.i698 = fadd double undef, %add4.i719
+  %mul.i.i679 = fmul double undef, %add.i695
+  %mul2.i.i680 = fmul double undef, %add4.i698
+  %agg.tmp74663.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
+  store double %mul.i.i679, double* %agg.tmp74663.sroa.0.0.idx, align 8
+  %agg.tmp74663.sroa.1.8.idx943 = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 1
+  store double %mul2.i.i680, double* %agg.tmp74663.sroa.1.8.idx943, align 8
+  br label %return
+
+if.then78:                                        ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.then78, %if.then38
+  ret void
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/diamond.ll b/test/Transforms/SLPVectorizer/X86/diamond.ll
index 008f09db454..2a237eaffdf 100644
--- a/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -50,7 +50,8 @@ entry:
 ; }
 
 ; CHECK: @extr_user
-; CHECK: load i32*
+; CHECK: load <4 x i32>
+; CHECK-NEXT: extractelement <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK-NEXT: ret
 define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
@@ -79,7 +80,8 @@ entry:
 
 ; In this example we have an external user that is not the first element in the vector.
 ; CHECK: @extr_user1
-; CHECK: load i32*
+; CHECK: load <4 x i32>
+; CHECK-NEXT: extractelement <4 x i32>
 ; CHECK: store <4 x i32>
 ; CHECK-NEXT: ret
 define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
diff --git a/test/Transforms/SLPVectorizer/X86/long_chains.ll b/test/Transforms/SLPVectorizer/X86/long_chains.ll
index 0a2ace3f21f..5af3e6d6e90 100644
--- a/test/Transforms/SLPVectorizer/X86/long_chains.ll
+++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -3,12 +3,13 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
+; At this point we can't vectorize only parts of the tree.
+
 ; CHECK: test
-; CHECK: sitofp i8
-; CHECK-NEXT: sitofp i8
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: fmul <2 x double>
+; CHECK: insertelement <2 x i8>
+; CHECK: insertelement <2 x i8>
+; CHECK: sitofp <2 x i8>
+; CHECK: fmul <2 x double>
 ; CHECK: ret
 define i32 @test(double* nocapture %A, i8* nocapture %B) {
 entry:
@@ -18,7 +19,7 @@ entry:
   %add = add i8 %0, 3
   %add4 = add i8 %1, 3
   %conv6 = sitofp i8 %add to double
-  %conv7 = sitofp i8 %add4 to double ; <--- This is inefficient. The chain stops here.
+  %conv7 = sitofp i8 %add4 to double 
   %mul = fmul double %conv6, %conv6
   %add8 = fadd double %mul, 1.000000e+00
   %mul9 = fmul double %conv7, %conv7
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index b520913a398..46263416a90 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -43,3 +43,19 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a,
   ret void
 }
 
+; Make sure we don't crash on this one.
+define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) {
+  %1 = add i64 %i, 1
+  %2 = getelementptr inbounds i32* %x, i64 %1
+  %3 = getelementptr inbounds i32* %y, i64 %1
+  %4 = load i32* %3, align 4
+  %5 = add nsw i32 undef, %4
+  store i32 %5, i32* %2, align 4
+  %6 = add i64 %i, 2
+  %7 = getelementptr inbounds i32* %x, i64 %6
+  %8 = getelementptr inbounds i32* %y, i64 %6
+  %9 = load i32* %8, align 4
+  %10 = add nsw i32 undef, %9
+  store i32 %10, i32* %7, align 4
+  ret void
+}
-- 
2.34.1