From: Elena Demikhovsky Date: Mon, 28 Dec 2015 20:10:59 +0000 (+0000) Subject: Implemented cost model for masked gather and scatter operations X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=84f6badcccd77214a13fb2afefb6d110915c6bb6;p=oota-llvm.git Implemented cost model for masked gather and scatter operations The cost is calculated for all X86 targets. When gather/scatter instruction is not supported we calculate the cost of scalar sequence. Differential revision: http://reviews.llvm.org/D15677 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256519 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 35c756b362d..3913cc3f107 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -458,6 +458,16 @@ public: int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of Gather or Scatter operation + /// \p Opcode - is a type of memory access Load or Store + /// \p DataTy - a vector type of the data to be loaded or stored + /// \p Ptr - pointer [or vector of pointers] - address[es] in memory + /// \p VariableMask - true when the memory access is predicated with a mask + /// that is not a compile-time constant + /// \p Alignment - alignment of single element + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment) const; + /// \return The cost of the interleaved memory operation. /// \p Opcode is the memory operation code /// \p VecTy is the vector type of the interleaved access. @@ -485,10 +495,14 @@ public: /// ((v0+v2), (v1+v3), undef, undef) int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const; - /// \returns The cost of Intrinsic instructions. + /// \returns The cost of Intrinsic instructions. Types analysis only. int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const; + /// \returns The cost of Intrinsic instructions. Analyses the real arguments. + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) const; + /// \returns The cost of Call instructions. int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) const; @@ -614,6 +628,9 @@ public: virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) = 0; virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -623,6 +640,8 @@ public: bool IsPairwiseForm) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) = 0; + virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) = 0; virtual int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; @@ -791,6 +810,12 @@ public: unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) override { + return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) override { @@ -805,6 +830,10 @@ public: ArrayRef Tys) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys); } + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) override { + return Impl.getIntrinsicInstrCost(ID, RetTy, Args); + } int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) override { return Impl.getCallInstrCost(F, RetTy, Tys); diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 2acd5f5fb09..43815234051 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -301,6 +301,12 @@ public: return 1; } + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, + unsigned Alignment) { + return 1; + } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -313,6 +319,10 @@ public: ArrayRef Tys) { return 1; } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) { + return 1; + } unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) { return 1; diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index ec311a09386..d99054eb6f3 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -580,6 +580,39 @@ public: return Cost; } + /// Get intrinsic cost based on arguments + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef Args) { + switch (IID) { + default: { + SmallVector Types; + for (Value *Op : Args) + Types.push_back(Op->getType()); + return getIntrinsicInstrCost(IID, RetTy, Types); + } + case Intrinsic::masked_scatter: { + Value *Mask = Args[3]; + bool VarMask = !isa(Mask); + unsigned Alignment = cast(Args[2])->getZExtValue(); + return + static_cast(this)->getGatherScatterOpCost(Instruction::Store, + Args[0]->getType(), + Args[1], VarMask, + Alignment); + } + case Intrinsic::masked_gather: { + Value *Mask = Args[2]; + bool VarMask = !isa(Mask); + unsigned Alignment = cast(Args[1])->getZExtValue(); + return + static_cast(this)->getGatherScatterOpCost(Instruction::Load, + RetTy, Args[0], VarMask, + Alignment); + } + } + } + + /// Get intrinsic cost based on argument types unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index b11f64d4bf0..0383cbfbbe4 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -500,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { } case Instruction::Call: if (const IntrinsicInst *II = dyn_cast(I)) { - SmallVector Tys; + SmallVector Args; for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J) - Tys.push_back(II->getArgOperand(J)->getType()); + Args.push_back(II->getArgOperand(J)); return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), - Tys); + Args); } return -1; default: diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index c2d5c88e641..9c1d3fd4f58 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -280,6 +280,15 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, return Cost; } +int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) const { + int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace) const { @@ -296,6 +305,13 @@ int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, return Cost; } +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) const { int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys); diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index cad649ed6f7..2e7bbb20874 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1297,6 +1297,142 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, return X86TTIImpl::getIntImmCost(Imm, Ty); } +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. + auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } + + // The cost of the scalar loads/stores. + int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + + int InsertExtractCost = 0; + if (Opcode == Instruction::Load) + for (unsigned i = 0; i < VF; ++i) + // Add the cost of inserting each scalar load into the vector + InsertExtractCost += + getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); + else + for (unsigned i = 0; i < VF; ++i) + // Add the cost of extracting each element out of the data vector + InsertExtractCost += + getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); + + return MemoryOpCost + MaskUnpackCost + InsertExtractCost; +} + +/// Calculate the cost of Gather / Scatter operation +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) { + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); + unsigned VF = SrcVTy->getVectorNumElements(); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy && Ptr->getType()->isVectorTy()) + PtrTy = dyn_cast(Ptr->getType()->getVectorElementType()); + assert(PtrTy && "Unexpected type for Ptr argument"); + unsigned AddressSpace = PtrTy->getAddressSpace(); + + bool Scalarize = false; + if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + Scalarize = true; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is + // better in the VariableMask case. + if (VF == 2 || (VF == 4 && !ST->hasVLX())) + Scalarize = true; + + if (Scalarize) + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + + return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); +} + bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa(ScalarTy) ? diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index e337475ed41..adb745e912d 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -76,7 +76,8 @@ public: unsigned AddressSpace); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); - + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); int getAddressComputationCost(Type *PtrTy, bool IsComplex); int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); @@ -94,6 +95,11 @@ public: bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); /// @} }; diff --git a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 4683c432c55..61d3e0116e8 100644 --- a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -1,4 +1,6 @@ -; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2 +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2 +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX ; AVX2-LABEL: test1 @@ -65,6 +67,217 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ret <2 x i32> %res } +define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) { + +; AVX2-LABEL: test_gather_2f64 +; AVX2: Found an estimated cost of 7 {{.*}}.gather + +; KNL-LABEL: test_gather_2f64 +; KNL: Found an estimated cost of 7 {{.*}}.gather + +; SKX-LABEL: test_gather_2f64 +; SKX: Found an estimated cost of 7 {{.*}}.gather + +%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} +declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) + +define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { + +; AVX2-LABEL: test_gather_4i32 +; AVX2: Found an estimated cost of 16 {{.*}}.gather + +; KNL-LABEL: test_gather_4i32 +; KNL: Found an estimated cost of 16 {{.*}}.gather + +; SKX-LABEL: test_gather_4i32 +; SKX: Found an estimated cost of 6 {{.*}}.gather + +%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) { + +; AVX2-LABEL: test_gather_4i32_const_mask +; AVX2: Found an estimated cost of 8 {{.*}}.gather + +; KNL-LABEL: test_gather_4i32_const_mask +; KNL: Found an estimated cost of 8 {{.*}}.gather + +; SKX-LABEL: test_gather_4i32_const_mask +; SKX: Found an estimated cost of 6 {{.*}}.gather + +%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0) + +define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) { + +; AVX2-LABEL: test_gather_16f32_const_mask +; AVX2: Found an estimated cost of 30 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_const_mask +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_const_mask +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) { + +; AVX2-LABEL: test_gather_16f32_var_mask +; AVX2: Found an estimated cost of 62 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_var_mask +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_var_mask +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { + +; AVX2-LABEL: test_gather_16f32_ra_var_mask +; AVX2: Found an estimated cost of 62 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_ra_var_mask +; KNL: Found an estimated cost of 20 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_ra_var_mask +; SKX: Found an estimated cost of 20 {{.*}}.gather + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) { + +; AVX2-LABEL: test_gather_16f32_const_mask2 +; AVX2: Found an estimated cost of 30 {{.*}}.gather + +; KNL-LABEL: test_gather_16f32_const_mask2 +; KNL: Found an estimated cost of 18 {{.*}}.gather + +; SKX-LABEL: test_gather_16f32_const_mask2 +; SKX: Found an estimated cost of 18 {{.*}}.gather + + %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 + %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; AVX2-LABEL: test_scatter_16i32 +; AVX2: Found an estimated cost of 64 {{.*}}.scatter + +; KNL-LABEL: test_scatter_16i32 +; KNL: Found an estimated cost of 18 {{.*}}.scatter + +; SKX-LABEL: test_scatter_16i32 +; SKX: Found an estimated cost of 18 {{.*}}.scatter + + %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 + %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) { +; AVX2-LABEL: test_scatter_8i32 +; AVX2: Found an estimated cost of 32 {{.*}}.scatter + +; KNL-LABEL: test_scatter_8i32 +; KNL: Found an estimated cost of 10 {{.*}}.scatter + +; SKX-LABEL: test_scatter_8i32 +; SKX: Found an estimated cost of 10 {{.*}}.scatter + + call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask) + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { +; AVX2-LABEL: test_scatter_4i32 +; AVX2: Found an estimated cost of 16 {{.*}}.scatter + +; KNL-LABEL: test_scatter_4i32 +; KNL: Found an estimated cost of 16 {{.*}}.scatter + +; SKX-LABEL: test_scatter_4i32 +; SKX: Found an estimated cost of 6 {{.*}}.scatter + + call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) { + +; AVX2-LABEL: test_gather_4f32 +; AVX2: Found an estimated cost of 15 {{.*}}.gather + +; KNL-LABEL: test_gather_4f32 +; KNL: Found an estimated cost of 15 {{.*}}.gather + +; SKX-LABEL: test_gather_4f32 +; SKX: Found an estimated cost of 6 {{.*}}.gather + + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) { + +; AVX2-LABEL: test_gather_4f32_const_mask +; AVX2: Found an estimated cost of 7 {{.*}}.gather + +; KNL-LABEL: test_gather_4f32_const_mask +; KNL: Found an estimated cost of 7 {{.*}}.gather + +; SKX-LABEL: test_gather_4f32_const_mask +; SKX: Found an estimated cost of 6 {{.*}}.gather + + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> ) +declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask) +declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) +declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)