Merge alignment of common GlobalValue.
[oota-llvm.git] / lib / Transforms / Vectorize / SLPVectorizer.cpp
index 59e2ff937f7ab4bc80ca5bc694851c4b13c6d231..76b46779535d8c475ddee27fb041e9de4dad1e2b 100644 (file)
@@ -166,6 +166,23 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   return Opcode;
 }
 
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+      // Intersection is initialized to the 0th scalar,
+      // so start counting from index '1'.
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+          Intersection->andIRFlags(Scalar);
+      }
+      VecOp->copyIRFlags(Intersection);
+    }
+  }
+}
+  
 /// \returns \p I after propagating metadata from \p VL.
 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
   Instruction *I0 = cast<Instruction>(VL[0]);
@@ -342,6 +359,33 @@ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
   }
 }
 
+/// \returns True if in-tree use also needs extract. This refers to
+/// possible scalar operand in vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+                                    TargetLibraryInfo *TLI) {
+
+  unsigned Opcode = UserInst->getOpcode();
+  switch (Opcode) {
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(UserInst);
+    return (LI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(UserInst);
+    return (SI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(UserInst);
+    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+      return (CI->getArgOperand(1) == Scalar);
+    }
+  }
+  default:
+    return false;
+  }
+}
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
@@ -864,18 +908,27 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
       for (User *U : Scalar->users()) {
         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
-        // Skip in-tree scalars that become vectors.
-        if (ScalarToTreeEntry.count(U)) {
-          DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
-                *U << ".\n");
-          int Idx = ScalarToTreeEntry[U]; (void) Idx;
-          assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
-          continue;
-        }
         Instruction *UserInst = dyn_cast<Instruction>(U);
         if (!UserInst)
           continue;
 
+        // Skip in-tree scalars that become vectors
+        if (ScalarToTreeEntry.count(U)) {
+          int Idx = ScalarToTreeEntry[U];
+          TreeEntry *UseEntry = &VectorizableTree[Idx];
+          Value *UseScalar = UseEntry->Scalars[0];
+          // Some in-tree scalars will remain as scalar in vectorized
+          // instructions. If that is the case, the one in Lane 0 will
+          // be used.
+          if (UseScalar != U ||
+              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+                         << ".\n");
+            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+            continue;
+          }
+        }
+
         // Ignore users in the user ignore list.
         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
             UserIgnoreList.end())
@@ -1179,6 +1232,54 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       }
       return;
     }
+    case Instruction::GetElementPtr: {
+      // We don't combine GEPs with complicated (nested) indexing.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We can't combine several GEPs into one vector if they operate on
+      // different types.
+      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+        if (Ty0 != CurTy) {
+          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      // We don't combine GEPs with non-constant indexes.
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        auto Op = cast<Instruction>(VL[j])->getOperand(1);
+        if (!isa<ConstantInt>(Op)) {
+          DEBUG(
+              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+      for (unsigned i = 0, e = 2; i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
@@ -1416,6 +1517,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       }
       return VecCost - ScalarCost;
     }
+    case Instruction::GetElementPtr: {
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_UniformConstantValue;
+
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+      int VecCost =
+          TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+      return VecCost - ScalarCost;
+    }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
       int ScalarLdCost = VecTy->getNumElements() *
@@ -1933,6 +2048,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
       E->VectorizedValue = V;
+      propagateIRFlags(E->VectorizedValue, E->Scalars);
       ++NumVectorInstructions;
 
       if (Instruction *I = dyn_cast<Instruction>(V))
@@ -1951,6 +2067,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       unsigned Alignment = LI->getAlignment();
       LI = Builder.CreateLoad(VecPtr);
       if (!Alignment)
@@ -1975,6 +2099,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       if (!Alignment)
         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
       S->setAlignment(Alignment);
@@ -1982,11 +2114,41 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       ++NumVectorInstructions;
       return propagateMetadata(S, E->Scalars);
     }
+    case Instruction::GetElementPtr: {
+      setInsertPointAfterBundle(E->Scalars);
+
+      ValueList Op0VL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+
+      Value *Op0 = vectorizeTree(Op0VL);
+
+      std::vector<Value *> OpVecs;
+      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+           ++j) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+          OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+
+        Value *OpVec = vectorizeTree(OpVL);
+        OpVecs.push_back(OpVec);
+      }
+
+      Value *V = Builder.CreateGEP(Op0, OpVecs);
+      E->VectorizedValue = V;
+      ++NumVectorInstructions;
+
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
       setInsertPointAfterBundle(E->Scalars);
       Function *FI;
       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
+      Value *ScalarArg = nullptr;
       if (CI && (FI = CI->getCalledFunction())) {
         IID = (Intrinsic::ID) FI->getIntrinsicID();
       }
@@ -1997,6 +2159,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         // a scalar. This argument should not be vectorized.
         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+          ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
           continue;
         }
@@ -2015,6 +2178,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
       Value *V = Builder.CreateCall(CF, OpVecs);
+
+      // The scalar argument uses an in-tree scalar so we add the new vectorized
+      // call to ExternalUses list to make sure that an extract will be
+      // generated in the future.
+      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
       E->VectorizedValue = V;
       ++NumVectorInstructions;
       return V;
@@ -2042,18 +2212,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
 
-      // Create appropriate shuffle to take alternative operations from
-      // the vector.
-      std::vector<Constant *> Mask(E->Scalars.size());
+      // Create shuffle to take alternate operations from the vector.
+      // Also, gather up odd and even scalar ops to propagate IR flags to
+      // each vector operation.
+      ValueList OddScalars, EvenScalars;
       unsigned e = E->Scalars.size();
+      SmallVector<Constant *, 8> Mask(e);
       for (unsigned i = 0; i < e; ++i) {
-        if (i & 1)
+        if (i & 1) {
           Mask[i] = Builder.getInt32(e + i);
-        else
+          OddScalars.push_back(E->Scalars[i]);
+        } else {
           Mask[i] = Builder.getInt32(i);
+          EvenScalars.push_back(E->Scalars[i]);
+        }
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
+      propagateIRFlags(V0, EvenScalars);
+      propagateIRFlags(V1, OddScalars);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       E->VectorizedValue = V;
@@ -3011,11 +3188,9 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
     if (tryToVectorizePair(A, B0, R)) {
-      B->moveBefore(V);
       return true;
     }
     if (tryToVectorizePair(A, B1, R)) {
-      B->moveBefore(V);
       return true;
     }
   }
@@ -3025,11 +3200,9 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
     if (tryToVectorizePair(A0, B, R)) {
-      A->moveBefore(V);
       return true;
     }
     if (tryToVectorizePair(A1, B, R)) {
-      A->moveBefore(V);
       return true;
     }
   }