From 16ed8780c77340f44a76ec64c4275a626795ed10 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Sun, 25 Oct 2015 15:37:55 +0000
Subject: [PATCH] Scalarizer for masked.gather and masked.scatter intrinsics.

When the target does not support these intrinsics, they are converted
into a chain of scalar load or store operations. If the mask is not
constant, the scalarizer builds a chain of conditional basic blocks.
I added the isLegalMaskedGather() and isLegalMaskedScatter() APIs.

Differential Revision: http://reviews.llvm.org/D13722

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251237 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h  |  14 +
 .../llvm/Analysis/TargetTransformInfoImpl.h  |   4 +
 lib/Analysis/TargetTransformInfo.cpp         |   8 +
 lib/CodeGen/CodeGenPrepare.cpp               | 262 +++++++++++++++++-
 lib/Target/X86/X86TargetTransformInfo.cpp    |  27 ++
 lib/Target/X86/X86TargetTransformInfo.h      |   2 +
 test/CodeGen/X86/masked_gather_scatter.ll    |  85 ++++++
 7 files changed, 401 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 084fd45f27f..1ebbaa03475 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -316,6 +316,12 @@ public:
   bool isLegalMaskedStore(Type *DataType) const;
   bool isLegalMaskedLoad(Type *DataType) const;
 
+  /// \brief Return true if the target supports masked gather/scatter.
+  /// AVX-512 fully supports gather and scatter for vectors with 32- and
+  /// 64-bit scalar types.
+  bool isLegalMaskedScatter(Type *DataType) const;
+  bool isLegalMaskedGather(Type *DataType) const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
@@ -569,6 +575,8 @@ public:
                                      unsigned AddrSpace) = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+  virtual bool isLegalMaskedScatter(Type *DataType) = 0;
+  virtual bool isLegalMaskedGather(Type *DataType) = 0;
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
@@ -698,6 +706,12 @@ public:
   bool isLegalMaskedLoad(Type *DataType) override {
     return Impl.isLegalMaskedLoad(DataType);
   }
+  bool isLegalMaskedScatter(Type *DataType) override {
+    return Impl.isLegalMaskedScatter(DataType);
+  }
+  bool isLegalMaskedGather(Type *DataType) override {
+    return Impl.isLegalMaskedGather(DataType);
+  }
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 47609ff5290..8d5d2a71e42 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -213,6 +213,10 @@ public:
   bool isLegalMaskedLoad(Type *DataType) { return false; }
 
+  bool isLegalMaskedScatter(Type *DataType) { return false; }
+
+  bool isLegalMaskedGather(Type *DataType) { return false; }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace) {
     // Guess that all legal addressing mode are free.
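For context (an illustration, not part of the patch): the intrinsic forms the new hooks reason about are the ones declared in the test file further down. A gather takes the vector of pointers, the alignment, the mask, and the pass-through value; a scatter takes the data to store, the vector of pointers, the alignment, and the mask. CodeGenPrepare queries the hooks with the intrinsic's data type (the return type for a gather, the stored value's type for a scatter), while the loop vectorizer may pass only the scalar element type.

  declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
  declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)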
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 427e1ed8ceb..f82af757125 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -121,6 +121,14 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
   return TTIImpl->isLegalMaskedLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
+  return TTIImpl->isLegalMaskedGather(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
+  return TTIImpl->isLegalMaskedScatter(DataType);
+}
+
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               int64_t BaseOffset,
                                               bool HasBaseReg,
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 6f00c471c21..87669d772d7 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1215,7 +1215,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) {
     Value *Gep =
         Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
-    LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
     VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
     // Create "else" block, fill it in the next iteration
@@ -1353,6 +1353,250 @@ static void ScalarizeMaskedStore(CallInst *CI) {
   CI->eraseFromParent();
 }
 
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+//                                       <16 x i1> %Mask, <16 x i32> %Src)
+// into a chain of basic blocks that load the elements one by one if the
+// appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [%Res0, %cond.load], [undef, %0]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void ScalarizeMaskedGather(CallInst *CI) {
+  Value *Ptrs = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+  assert(VecType && "Unexpected return type of masked gather intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+  unsigned VectorWidth = VecType->getNumElements();
+
+  // Take a shortcut if the mask is a vector of constants.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+                                                 "Load" + Twine(Idx));
+      VResult = Builder.CreateInsertElement(VResult, Load,
+                                            Builder.getInt32(Idx),
+                                            "Res" + Twine(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
+    //  %ToLoad1 = icmp eq i1 %Mask1, true
+    //  br i1 %ToLoad1, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask,
+                                                    Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1),
+                                    "ToLoad" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+                                               "Load" + Twine(Idx));
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+                                          "Res" + Twine(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+//                                  <16 x i1> %Mask)
+// into a chain of basic blocks that store the elements one by one if the
+// appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void ScalarizeMaskedScatter(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptrs = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  assert(isa<VectorType>(Src->getType()) &&
+         "Unexpected data type in masked scatter intrinsic");
+  assert(isa<VectorType>(Ptrs->getType()) &&
+         isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+         "Vector of pointers is expected in masked scatter intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+  // Take a shortcut if the mask is a vector of constants.
+  bool IsConstMask = isa<ConstantVector>(Mask);
+
+  if (IsConstMask) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                   "Elt" + Twine(Idx));
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                                "Ptr" + Twine(Idx));
+      Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+    //  %ToStore = icmp eq i1 %Mask1, true
+    //  br i1 %ToStore, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask,
+                                                    Builder.getInt32(Idx),
+                                                    "Mask" + Twine(Idx));
+    Value *Cmp =
+        Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                           ConstantInt::get(Predicate->getType(), 1),
+                           "ToStore" + Twine(Idx));
+
+    // Create "cond" block
+    //
+    //  %Elt1 = extractelement <16 x i32> %Src, i32 1
+    //  %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+    //  store i32 %Elt1, i32* %Ptr1
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+                                                 "Elt" + Twine(Idx));
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+                                              "Ptr" + Twine(Idx));
+    Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
@@ -1460,6 +1704,22 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     }
     return false;
   }
+  case Intrinsic::masked_gather: {
+    if (!TTI->isLegalMaskedGather(CI->getType())) {
+      ScalarizeMaskedGather(CI);
+      ModifiedDT = true;
+      return true;
+    }
+    return false;
+  }
+  case Intrinsic::masked_scatter: {
+    if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+      ScalarizeMaskedScatter(CI);
+      ModifiedDT = true;
+      return true;
+    }
+    return false;
+  }
   case Intrinsic::aarch64_stlxr:
   case Intrinsic::aarch64_stxr: {
     ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
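To illustrate the constant-mask shortcut in ScalarizeMaskedGather above (an example only, not taken from the patch; the vector width, %Ptrs, %Src, and lane choices are made up, while the "Ptr"/"Load"/"Res" names follow the scheme used by the code), a gather whose mask is a literal such as <i1 true, i1 false, i1 true, i1 false>, on a target where isLegalMaskedGather() returns false, is lowered with no control flow: only the enabled lanes are loaded and a final select merges in the pass-through vector.

  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %Ptrs, i32 4,
             <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %Src)

  ; becomes, roughly:
  %Ptr0 = extractelement <4 x i32*> %Ptrs, i32 0   ; lane 0 is enabled
  %Load0 = load i32, i32* %Ptr0, align 4
  %Res0 = insertelement <4 x i32> undef, i32 %Load0, i32 0
  %Ptr2 = extractelement <4 x i32*> %Ptrs, i32 2   ; lane 2 is enabled
  %Load2 = load i32, i32* %Ptr2, align 4
  %Res2 = insertelement <4 x i32> %Res0, i32 %Load2, i32 2
  %res = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
                <4 x i32> %Res2, <4 x i32> %Src    ; disabled lanes take %Src

When the mask is not a constant vector, the pass instead emits the cond.load/else chain shown in the function's header comment, with one conditional branch per lane.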
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index ba2f71c6f62..1b825df4c87 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1203,6 +1203,33 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
   return isLegalMaskedLoad(DataType);
 }
 
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+  // This function is currently called in two cases: from the Loop Vectorizer
+  // and from the Scalarizer.
+  // When the Loop Vectorizer asks about legality of the feature,
+  // the vectorization factor is not calculated yet. The Loop Vectorizer
+  // sends a scalar type and the decision is based on the width of the
+  // scalar element.
+  // Later on, the cost model will estimate usage of this intrinsic based on
+  // the vector type.
+  // The Scalarizer asks again about legality. It sends a vector type.
+  // In this case we can reject non-power-of-2 vectors.
+  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
+    return false;
+  Type *ScalarTy = DataTy->getScalarType();
+  // TODO: Pointers should also be legal, but this requires additional
+  // support for composing the intrinsic name.
+  // getPrimitiveSizeInBits() returns 0 for PointerType.
+  int DataWidth = ScalarTy->getPrimitiveSizeInBits();
+
+  // AVX-512 allows gather and scatter
+  return DataWidth >= 32 && ST->hasAVX512();
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+  return isLegalMaskedGather(DataType);
+}
+
 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 5ddde2a45a5..e337475ed41 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -90,6 +90,8 @@ public:
                          Type *Ty);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
+  bool isLegalMaskedGather(Type *DataType);
+  bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 11d87bb6755..b65146ed417 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,4 +1,6 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -6,6 +8,14 @@ target triple = "x86_64-unknown-linux-gnu"
 ; KNL-LABEL: test1
 ; KNL: kxnorw %k1, %k1, %k1
 ; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test1
+; SCALAR: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+
 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -25,6 +35,18 @@ declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
 ; KNL-LABEL: test2
 ; KNL: kmovw %esi, %k1
 ; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test2
+; SCALAR: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: br label %else
+; SCALAR: else:
+; SCALAR-NEXT: %res.phi.else = phi
+; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
+; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
+; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
+
 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
 
   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -76,6 +98,20 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
 ; KNL: vpscatterdd {{.*}}%k2
 ; KNL: vpscatterdd {{.*}}%k1
 
+; SCALAR-LABEL: test5
+; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
+; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
+; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
+; SCALAR: cond.store:
+; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
+; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
+; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
+; SCALAR-NEXT: br label %else
+; SCALAR: else:
+; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
+; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
+; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
+
 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -96,6 +132,16 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)
 ; KNL: kxnorw %k2, %k2, %k2
 ; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
 ; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
+
+; SCALAR-LABEL: test6
+; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
+; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
+; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
+; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
+; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
+; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
+; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
+
 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
 
   %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -245,3 +291,42 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 }
 
+; KNL-LABEL: test15
+; KNL: kmovw %eax, %k1
+; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+
+; SCALAR-LABEL: test15
+; SCALAR: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+; SCALAR-NEXT: insertelement <16 x float>
+; SCALAR-NEXT: extractelement <16 x float*>
+; SCALAR-NEXT: load float
+
+define <16 x float> @test15(float* %base, <16 x i32> %ind) {
+
+  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
+  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef)
+  ret <16 x float>%res
+}
+
+; Check non-power-of-2 case. It should be scalarized.
+declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
+; KNL-LABEL: test16
+; KNL: testb
+; KNL: je
+; KNL: testb
+; KNL: je
+; KNL: testb
+; KNL: je
+define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
+  %sext_ind = sext <3 x i32> %ind to <3 x i64>
+  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
+  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
+  ret <3 x i32>%res
+}
+
--
2.34.1
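Similarly, on the scatter side (again an illustration, not part of the patch; the vector width, %val, %Ptrs, and lane choices are made up, while the "Elt"/"Ptr" names follow ScalarizeMaskedScatter's scheme), the constant-mask path emits straight-line stores for the enabled lanes only, with no branches and no merge step.

  call void @llvm.masked.scatter.v4i32(<4 x i32> %val, <4 x i32*> %Ptrs, i32 4,
            <4 x i1> <i1 true, i1 false, i1 true, i1 false>)

  ; becomes, roughly:
  %Elt0 = extractelement <4 x i32> %val, i32 0
  %Ptr0 = extractelement <4 x i32*> %Ptrs, i32 0
  store i32 %Elt0, i32* %Ptr0, align 4
  %Elt2 = extractelement <4 x i32> %val, i32 2
  %Ptr2 = extractelement <4 x i32*> %Ptrs, i32 2
  store i32 %Elt2, i32* %Ptr2, align 4

A non-constant mask instead goes through the cond.store/else chain, which is what the SCALAR checks in test5 above verify.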