Scalarizer for masked.gather and masked.scatter intrinsics.

author Elena Demikhovsky <elena.demikhovsky@intel.com>

Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)

committer Elena Demikhovsky <elena.demikhovsky@intel.com>

Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)
author Elena Demikhovsky <elena.demikhovsky@intel.com>
Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)
committer Elena Demikhovsky <elena.demikhovsky@intel.com>
Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index 084fd45f27fb7ddb7eb7694888e76edf292cce33..1ebbaa034757440e4a7f9d060b1f47746587774d 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -316,6 +316,12 @@ public:
    bool isLegalMaskedStore(Type *DataType) const;
    bool isLegalMaskedLoad(Type *DataType) const;
  
+  /// \brief Return true if the target supports masked gather/scatter.
+  /// AVX-512 fully supports gather and scatter for vectors with 32-bit and
+  /// 64-bit scalar types.
+  bool isLegalMaskedScatter(Type *DataType) const;
+  bool isLegalMaskedGather(Type *DataType) const;
+
    /// \brief Return the cost of the scaling factor used in the addressing
    /// mode represented by AM for this target, for a load/store
    /// of the specified type.
@@ -569,6 +575,8 @@ public:
                                       unsigned AddrSpace) = 0;
    virtual bool isLegalMaskedStore(Type *DataType) = 0;
    virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+  virtual bool isLegalMaskedScatter(Type *DataType) = 0;
+  virtual bool isLegalMaskedGather(Type *DataType) = 0;
    virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     int64_t BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) = 0;
@@ -698,6 +706,12 @@ public:
    bool isLegalMaskedLoad(Type *DataType) override {
      return Impl.isLegalMaskedLoad(DataType);
    }
+  bool isLegalMaskedScatter(Type *DataType) override {
+    return Impl.isLegalMaskedScatter(DataType);
+  }
+  bool isLegalMaskedGather(Type *DataType) override {
+    return Impl.isLegalMaskedGather(DataType);
+  }
    int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h

index 47609ff52902ccb513686877183c42917e58c019..8d5d2a71e426b9b67440703e154c4a15dcd0db91 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -213,6 +213,10 @@ public:
  
    bool isLegalMaskedLoad(Type *DataType) { return false; }
  
+  bool isLegalMaskedScatter(Type *DataType) { return false; }
+
+  bool isLegalMaskedGather(Type *DataType) { return false; }
+
    int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
      // Guess that all legal addressing mode are free.
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 427e1ed8ceb6f2feb028eb99560ef8ced44da956..f82af757125b0e2d3bd58568f2369391e6c3f314 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -121,6 +121,14 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
    return TTIImpl->isLegalMaskedLoad(DataType);
  }
  
+bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
+  return TTIImpl->isLegalMaskedGather(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
+  return TTIImpl->isLegalMaskedScatter(DataType);
+}
+
  int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                int64_t BaseOffset,
                                                bool HasBaseReg,
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp

index 6f00c471c213e160736c1b96da1ef5de6a2223ed..87669d772d764867109792c595bde433ba82aebe 100644 (file)
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1215,7 +1215,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) {
  
      Value *Gep =
          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
-    LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
      VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
  
      // Create "else" block, fill it in the next iteration
@@ -1353,6 +1353,250 @@ static void ScalarizeMaskedStore(CallInst *CI) {
    CI->eraseFromParent();
  }
  
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+//                                       <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks that load the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [%Res0, %cond.load], [undef, %0]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void ScalarizeMaskedGather(CallInst *CI) {
+  Value *Ptrs = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+  unsigned VectorWidth = VecType->getNumElements();
+
+  // Shorten the way if the mask is a vector of constants.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+                                                 "Load" + Twine(Idx));
+      VResult = Builder.CreateInsertElement(VResult, Load,
+                                            Builder.getInt32(Idx),
+                                            "Res" + Twine(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
+    //  %ToLoad1 = icmp eq i1 %Mask1, true
+    //  br i1 %ToLoad1, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask,
+                                                    Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1),
+                                    "ToLoad" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+                                               "Load" + Twine(Idx));
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+                                          "Res" + Twine(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+//                                  <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+//   . . .
+static void ScalarizeMaskedScatter(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptrs = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  assert(isa<VectorType>(Src->getType()) &&
+         "Unexpected data type in masked scatter intrinsic");
+  assert(isa<VectorType>(Ptrs->getType()) &&
+         isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+         "Vector of pointers is expected in masked scatter intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+  // Constant mask: store the enabled lanes directly, without any branches.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                   "Elt" + Twine(Idx));
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+    //  %ToStore = icmp eq i1 %Mask1, true
+    //  br i1 %ToStore, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask,
+                                                    Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp =
+       Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                          ConstantInt::get(Predicate->getType(), 1),
+                          "ToStore" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %Elt1 = extractelement <16 x i32> %Src, i32 1
+    //  %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+    //  store i32 %Elt1, i32* %Ptr1
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                 "Elt" + Twine(Idx));
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
  bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
    BasicBlock *BB = CI->getParent();
  
@@ -1460,6 +1704,22 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
        }
        return false;
      }
+    case Intrinsic::masked_gather: {
+      if (!TTI->isLegalMaskedGather(CI->getType())) {
+        ScalarizeMaskedGather(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    case Intrinsic::masked_scatter: {
+      if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+        ScalarizeMaskedScatter(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
      case Intrinsic::aarch64_stlxr:
      case Intrinsic::aarch64_stxr: {
        ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index ba2f71c6f629ee6f7b8722ee6bf607fa627c3b44..1b825df4c87360fcd69a420958d582d295ea5920 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1203,6 +1203,33 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
    return isLegalMaskedLoad(DataType);
  }
  
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+  // This function is called now in two cases: from the Loop Vectorizer
+  // and from the Scalarizer.
+  // When the Loop Vectorizer asks about legality of the feature,
+  // the vectorization factor is not calculated yet. The Loop Vectorizer
+  // sends a scalar type and the decision is based on the width of the
+  // scalar element.
+  // Later on, the cost model will estimate the usage of this intrinsic
+  // based on the vector type.
+  // The Scalarizer asks again about legality. It sends a vector type.
+  // In this case we can reject non-power-of-2 vectors.
+  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
+    return false;
+  Type *ScalarTy = DataTy->getScalarType();
+  // TODO: Pointers should also be legal,
+  // but it requires additional support in composing intrinsics name.
+  // getPrimitiveSizeInBits() returns 0 for PointerType
+  int DataWidth = ScalarTy->getPrimitiveSizeInBits();
+
+  // AVX-512 gather/scatter only handle 32-bit and 64-bit scalar elements.
+  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+  return isLegalMaskedGather(DataType);
+}
+
  bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                       const Function *Callee) const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h

index 5ddde2a45a570961bffdaa9678be1555a2b49711..e337475ed410b9f461b070ce0a00c9655dacb1b5 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -90,6 +90,8 @@ public:
                      Type *Ty);
    bool isLegalMaskedLoad(Type *DataType);
    bool isLegalMaskedStore(Type *DataType);
+  bool isLegalMaskedGather(Type *DataType);
+  bool isLegalMaskedScatter(Type *DataType);
    bool areInlineCompatible(const Function *Caller,
                             const Function *Callee) const;
  
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll

index 11d87bb675539d837aeb2099bb7ccc185eb416dd..b65146ed41729a158812fcec6858c3d5f38db36f 100644 (file)
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,4 +1,6 @@
  ; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
+
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
@@ -6,6 +8,14 @@ target triple = "x86_64-unknown-linux-gnu"
  ; KNL-LABEL: test1
  ; KNL: kxnorw  %k1, %k1, %k1
  ; KNL: vgatherdps      (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test1
+; SCALAR:      extractelement <16 x float*> 
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+
  define <16 x float> @test1(float* %base, <16 x i32> %ind) {
  
    %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -25,6 +35,18 @@ declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i3
  ; KNL-LABEL: test2
  ; KNL: kmovw %esi, %k1
  ; KNL: vgatherdps      (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test2
+; SCALAR:      extractelement <16 x float*> 
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: br label %else
+; SCALAR: else:
+; SCALAR-NEXT:  %res.phi.else = phi 
+; SCALAR-NEXT:  %Mask1 = extractelement <16 x i1> %imask, i32 1
+; SCALAR-NEXT:  %ToLoad1 = icmp eq i1 %Mask1, true
+; SCALAR-NEXT:  br i1 %ToLoad1, label %cond.load1, label %else2
+
  define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
  
    %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -76,6 +98,20 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
  ; KNL: vpscatterdd {{.*}}%k2
  ; KNL: vpscatterdd {{.*}}%k1
  
+; SCALAR-LABEL: test5
+; SCALAR:        %Mask0 = extractelement <16 x i1> %imask, i32 0
+; SCALAR-NEXT:   %ToStore0 = icmp eq i1 %Mask0, true
+; SCALAR-NEXT:   br i1 %ToStore0, label %cond.store, label %else
+; SCALAR: cond.store:
+; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i32 0
+; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
+; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
+; SCALAR-NEXT:  br label %else
+; SCALAR: else:
+; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
+; SCALAR-NEXT:  %ToStore1 = icmp eq i1 %Mask1, true
+; SCALAR-NEXT:  br i1 %ToStore1, label %cond.store1, label %else2
+
  define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
  
    %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -96,6 +132,16 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x
  ; KNL: kxnorw  %k2, %k2, %k2
  ; KNL: vpgatherqd      (,%zmm{{.*}}), %ymm{{.*}} {%k2}
  ; KNL: vpscatterqd     %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
+
+; SCALAR-LABEL: test6
+; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
+; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i32 1
+; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
+; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
+; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i32 2
+; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
+; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4
+
  define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
  
    %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -245,3 +291,42 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
  }
  
  
+; KNL-LABEL: test15
+; KNL: kmovw %eax, %k1
+; KNL: vgatherdps      (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test15
+; SCALAR:      extractelement <16 x float*> 
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+
+define <16 x float> @test15(float* %base, <16 x i32> %ind) {
+
+  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+; Check non-power-of-2 case. It should be scalarized.
+declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
+; KNL-LABEL: test16
+; KNL: testb
+; KNL: je
+; KNL: testb
+; KNL: je
+; KNL: testb
+; KNL: je
+define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+  %sext_ind = sext <3 x i32> %ind to <3 x i64>
+  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
+  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
+  ret <3 x i32>%res
+}
+
author	Elena Demikhovsky <elena.demikhovsky@intel.com>
	Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)
committer	Elena Demikhovsky <elena.demikhovsky@intel.com>
	Sun, 25 Oct 2015 15:37:55 +0000 (15:37 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/CodeGen/CodeGenPrepare.cpp		patch \| blob \| history
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
lib/Target/X86/X86TargetTransformInfo.h		patch \| blob \| history
test/CodeGen/X86/masked_gather_scatter.ll		patch \| blob \| history