int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const;
+ /// \return The cost of a Gather or Scatter operation.
+ /// \p Opcode - the kind of memory access: Load or Store.
+ /// \p DataTy - the vector type of the data to be loaded or stored.
+ /// \p Ptr - pointer [or vector of pointers] - address[es] in memory.
+ /// \p VariableMask - true when the memory access is predicated with a mask
+ /// that is not a compile-time constant.
+ /// \p Alignment - alignment of a single element.
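+ /// For example, a masked gather of <16 x float> through a vector of
+ /// pointers would be costed as getGatherScatterOpCost(Instruction::Load,
+ /// <16 x float>, Ptrs, VariableMask, Alignment).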
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment) const;
+
/// \return The cost of the interleaved memory operation.
/// \p Opcode is the memory operation code
/// \p VecTy is the vector type of the interleaved access.
/// ((v0+v2), (v1+v3), undef, undef)
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
- /// \returns The cost of Intrinsic instructions.
+ /// \returns The cost of Intrinsic instructions, based on argument types only.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) const;
+ /// \returns The cost of Intrinsic instructions, taking the actual argument
+ /// values into account.
+ int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args) const;
+
/// \returns The cost of Call instructions.
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) = 0;
+ virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) = 0;
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
bool IsPairwiseForm) = 0;
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) = 0;
+ virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args) = 0;
virtual int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
unsigned AddressSpace) override {
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
}
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) override {
+ return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment);
+ }
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace) override {
ArrayRef<Type *> Tys) override {
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
}
+ int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args) override {
+ return Impl.getIntrinsicInstrCost(ID, RetTy, Args);
+ }
int getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) override {
return Impl.getCallInstrCost(F, RetTy, Tys);
return 1;
}
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask,
+ unsigned Alignment) {
+ return 1;
+ }
+
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
ArrayRef<Type *> Tys) {
return 1;
}
+ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args) {
+ return 1;
+ }
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
return 1;
return Cost;
}
+ /// Get the intrinsic cost based on the actual arguments.
+ unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+ ArrayRef<Value *> Args) {
+ switch (IID) {
+ default: {
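+ // By default, derive the cost from the operand types alone.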
+ SmallVector<Type *, 4> Types;
+ for (Value *Op : Args)
+ Types.push_back(Op->getType());
+ return getIntrinsicInstrCost(IID, RetTy, Types);
+ }
+ case Intrinsic::masked_scatter: {
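+ // @llvm.masked.scatter operands: (value, pointers, alignment, mask).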
+ Value *Mask = Args[3];
+ bool VarMask = !isa<Constant>(Mask);
+ unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
+ return
+ static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
+ Args[0]->getType(),
+ Args[1], VarMask,
+ Alignment);
+ }
+ case Intrinsic::masked_gather: {
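+ // @llvm.masked.gather operands: (pointers, alignment, mask, passthru).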
+ Value *Mask = Args[2];
+ bool VarMask = !isa<Constant>(Mask);
+ unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
+ return
+ static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
+ RetTy, Args[0], VarMask,
+ Alignment);
+ }
+ }
+ }
+
+ /// Get the intrinsic cost based on argument types only.
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys) {
unsigned ISD = 0;
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- SmallVector<Type*, 4> Tys;
+ SmallVector<Value *, 4> Args;
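+ // Pass the actual operands so that TTI can inspect the pointer and mask
+ // of gather/scatter intrinsics.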
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
- Tys.push_back(II->getArgOperand(J)->getType());
+ Args.push_back(II->getArgOperand(J));
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
- Tys);
+ Args);
}
return -1;
default:
return Cost;
}
+int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) const {
+ int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace) const {
return Cost;
}
+int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys) const {
int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
+// Return an average cost of a Gather / Scatter instruction; this rough
+// estimate may be refined later.
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace) {
+ assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ // Try to reduce index size from 64 bit (the default for GEP) to 32 bits.
+ // This is essential for VF 16: if the index can't be reduced to 32 bits,
+ // the operation needs 16 x 64-bit indices, which do not fit in a zmm
+ // register and must be split. Also check that the base pointer is the
+ // same for all lanes, and that there is at most one variable index.
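+ // For example, a GEP whose only variable index is sign-extended from i32,
+ // such as gep float, float* %base, <16 x i64> (sext <16 x i32> %ind),
+ // can use 32-bit indices.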
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout &DL) {
+ unsigned IndexSize = DL.getPointerSizeInBits();
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (IndexSize < 64 || !GEP)
+ return IndexSize;
+
+ unsigned NumOfVarIndices = 0;
+ Value *Ptrs = GEP->getPointerOperand();
+ if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+ return IndexSize;
+ for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+ if (isa<Constant>(GEP->getOperand(i)))
+ continue;
+ Type *IndxTy = GEP->getOperand(i)->getType();
+ if (IndxTy->isVectorTy())
+ IndxTy = IndxTy->getVectorElementType();
+ if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+ !isa<SExtInst>(GEP->getOperand(i))) ||
+ ++NumOfVarIndices > 1)
+ return IndexSize; // 64
+ }
+ return (unsigned)32;
+ };
+
+ // Try to reduce IndexSize to 32 bits for vectors of 16 or more elements.
+ // By default IndexSize is equal to the pointer size.
+ unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+ DL.getPointerSizeInBits();
+
+ Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
+ IndexSize), VF);
+ std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ if (SplitFactor > 1) {
+ // Handle splitting of vector of pointers
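+ // E.g. a gather of <16 x float> through distinct <16 x float*> pointers
+ // keeps 64-bit indices; <16 x i64> legalizes into two registers, so the
+ // operation is issued twice on half-width vectors.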
+ Type *SplitSrcTy =
+ VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+ AddressSpace);
+ }
+
+ // The gather / scatter cost is given by Intel architects. It is a rough
+ // number since we are looking at one instruction at a time.
+ const int GSOverhead = 2;
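+ // E.g. an unsplit <16 x float> gather then costs 2 + 16 * <scalar load
+ // cost>, which is 18 assuming a scalar load cost of 1 (see the cost-model
+ // tests below).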
+ return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+}
+
+/// Return the cost of full scalarization of a gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - True when the mask is not a compile-time constant.
+/// Alignment - Alignment of a single element.
+/// AddressSpace - Address space of the pointer(s).
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, unsigned Alignment,
+ unsigned AddressSpace) {
+ unsigned VF = SrcVTy->getVectorNumElements();
+
+ int MaskUnpackCost = 0;
+ if (VariableMask) {
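+ // Unpacking a variable mask costs one extract per lane (the scalarization
+ // overhead below), plus one compare and one branch per lane.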
+ VectorType *MaskTy =
+ VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
+ MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ int ScalarCompareCost =
+ getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
+ nullptr);
+ int BranchCost = getCFInstrCost(Instruction::Br);
+ MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+ }
+
+ // The cost of the scalar loads/stores.
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of a Gather / Scatter operation.
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+ // A gather / scatter of a 2-element vector is not profitable on KNL / SKX.
+ // A vector-4 gather/scatter instruction does not exist on KNL. We could
+ // widen it to 8 elements, but zeroing the upper bits of the mask vector
+ // would add more instructions. For now we return the scalar cost of
+ // vector-4 on KNL. TODO: check whether the gather/scatter instruction is
+ // better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+ AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
unsigned AddressSpace);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
-
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ bool VariableMask, unsigned Alignment);
int getAddressComputationCost(Type *PtrTy, bool IsComplex);
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
+private:
+ int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+ unsigned Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+ unsigned Alignment, unsigned AddressSpace);
/// @}
};
-; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
; AVX2-LABEL: test1
ret <2 x i32> %res
}
+define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+
+; AVX2-LABEL: test_gather_2f64
+; AVX2: Found an estimated cost of 7 {{.*}}.gather
+
+; KNL-LABEL: test_gather_2f64
+; KNL: Found an estimated cost of 7 {{.*}}.gather
+
+; SKX-LABEL: test_gather_2f64
+; SKX: Found an estimated cost of 7 {{.*}}.gather
+
+ %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+
+define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+
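+; KNL has no vector-4 gather, so KNL (like AVX2) gets the scalarized cost.
+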
+; AVX2-LABEL: test_gather_4i32
+; AVX2: Found an estimated cost of 16 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4i32
+; KNL: Found an estimated cost of 16 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4i32
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+ %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {
+
+; AVX2-LABEL: test_gather_4i32_const_mask
+; AVX2: Found an estimated cost of 8 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4i32_const_mask
+; KNL: Found an estimated cost of 8 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4i32_const_mask
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+ %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
+
+define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_16f32_const_mask
+; AVX2: Found an estimated cost of 30 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_const_mask
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_const_mask
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1> %mask) {
+
+; AVX2-LABEL: test_gather_16f32_var_mask
+; AVX2: Found an estimated cost of 62 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_var_mask
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_var_mask
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+
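+; With a true vector of pointers the 64-bit indices cannot be narrowed, so
+; AVX-512 splits the gather in two: 2 * (2 + 8 * <scalar load cost>) = 20 here.
+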
+; AVX2-LABEL: test_gather_16f32_ra_var_mask
+; AVX2: Found an estimated cost of 62 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_ra_var_mask
+; KNL: Found an estimated cost of 20 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_ra_var_mask
+; SKX: Found an estimated cost of 20 {{.*}}.gather
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_16f32_const_mask2
+; AVX2: Found an estimated cost of 30 {{.*}}.gather
+
+; KNL-LABEL: test_gather_16f32_const_mask2
+; KNL: Found an estimated cost of 18 {{.*}}.gather
+
+; SKX-LABEL: test_gather_16f32_const_mask2
+; SKX: Found an estimated cost of 18 {{.*}}.gather
+
+ %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+ %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
+define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; AVX2-LABEL: test_scatter_16i32
+; AVX2: Found an estimated cost of 64 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_16i32
+; KNL: Found an estimated cost of 18 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_16i32
+; SKX: Found an estimated cost of 18 {{.*}}.scatter
+
+ %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
+ %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32(<16 x i32> %val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x i32*> %ptr, <8 x i1> %mask) {
+; AVX2-LABEL: test_scatter_8i32
+; AVX2: Found an estimated cost of 32 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_8i32
+; KNL: Found an estimated cost of 10 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_8i32
+; SKX: Found an estimated cost of 10 {{.*}}.scatter
+
+ call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
+
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x i32*> %ptr, <4 x i1> %mask) {
+; AVX2-LABEL: test_scatter_4i32
+; AVX2: Found an estimated cost of 16 {{.*}}.scatter
+
+; KNL-LABEL: test_scatter_4i32
+; KNL: Found an estimated cost of 16 {{.*}}.scatter
+
+; SKX-LABEL: test_scatter_4i32
+; SKX: Found an estimated cost of 6 {{.*}}.scatter
+
+ call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+
+; AVX2-LABEL: test_gather_4f32
+; AVX2: Found an estimated cost of 15 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4f32
+; KNL: Found an estimated cost of 15 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4f32
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
+
+; AVX2-LABEL: test_gather_4f32_const_mask
+; AVX2: Found an estimated cost of 7 {{.*}}.gather
+
+; KNL-LABEL: test_gather_4f32_const_mask
+; KNL: Found an estimated cost of 7 {{.*}}.gather
+
+; SKX-LABEL: test_gather_4f32_const_mask
+; SKX: Found an estimated cost of 6 {{.*}}.gather
+
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float>)
+declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
+declare void @llvm.masked.scatter.v16i32(<16 x i32> %val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
+declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)