From ac77529540b03ac974fc203e668747b2d27374ec Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 10 Aug 2015 20:21:15 +0000
Subject: [PATCH] [InstCombine] Move SSE2/AVX2 arithmetic vector shift folding
 to instcombiner

As discussed in D11760, this patch moves the (V)PSRA(WD) arithmetic
shift-by-constant folding to InstCombine to match the logical shift
implementations.

Differential Revision: http://reviews.llvm.org/D11886

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@244495 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |  44 ---
 .../InstCombine/InstCombineCalls.cpp          |  38 ++-
 test/CodeGen/X86/combine-avx2-intrinsics.ll   |  45 ---
 test/CodeGen/X86/combine-sse2-intrinsics.ll   |  53 ---
 .../InstCombine/x86-vector-shifts.ll          | 309 ++++++++++++++++++
 5 files changed, 340 insertions(+), 149 deletions(-)
 delete mode 100644 test/CodeGen/X86/combine-sse2-intrinsics.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6dd4fb8d98e..aac4b1452c0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -23464,50 +23464,6 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
 
     return SDValue();
   }
-
-  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
-  case Intrinsic::x86_sse2_psrai_w:
-  case Intrinsic::x86_sse2_psrai_d:
-  case Intrinsic::x86_avx2_psrai_w:
-  case Intrinsic::x86_avx2_psrai_d:
-  case Intrinsic::x86_sse2_psra_w:
-  case Intrinsic::x86_sse2_psra_d:
-  case Intrinsic::x86_avx2_psra_w:
-  case Intrinsic::x86_avx2_psra_d: {
-    SDValue Op0 = N->getOperand(1);
-    SDValue Op1 = N->getOperand(2);
-    EVT VT = Op0.getValueType();
-    assert(VT.isVector() && "Expected a vector type!");
-
-    if (isa<BuildVectorSDNode>(Op1))
-      Op1 = Op1.getOperand(0);
-
-    if (!isa<ConstantSDNode>(Op1))
-      return SDValue();
-
-    EVT SVT = VT.getVectorElementType();
-    unsigned SVTBits = SVT.getSizeInBits();
-
-    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
-    const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
-    uint64_t ShAmt = C.getZExtValue();
-
-    // Don't try to convert this shift into a ISD::SRA if the shift
-    // count is bigger than or equal to the element size.
-    if (ShAmt >= SVTBits)
-      return SDValue();
-
-    // Trivial case: if the shift count is zero, then fold this
-    // into the first operand.
-    if (ShAmt == 0)
-      return Op0;
-
-    // Replace this packed shift intrinsic with a target independent
-    // shift dag node.
-    SDLoc DL(N);
-    SDValue Splat = DAG.getConstant(C, DL, VT);
-    return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
-  }
   }
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index de413c42348..600c8c36392 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -199,7 +199,9 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
 
 static Value *SimplifyX86immshift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder,
-                                  bool ShiftLeft) {
+                                  bool LogicalShift, bool ShiftLeft) {
+  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
   // Simplify if count is constant.
   auto Arg1 = II.getArgOperand(1);
   auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
@@ -238,9 +240,15 @@ static Value *SimplifyX86immshift(const IntrinsicInst &II,
   if (Count == 0)
     return Vec;
 
-  // Handle cases when Shift >= BitWidth - just return zero.
-  if (Count.uge(BitWidth))
-    return ConstantAggregateZero::get(VT);
+  // Handle cases when Shift >= BitWidth.
+ if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } // Get a constant vector of the same type as the first operand. auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); @@ -249,7 +257,10 @@ static Value *SimplifyX86immshift(const IntrinsicInst &II, if (ShiftLeft) return Builder.CreateShl(Vec, ShiftVec); - return Builder.CreateLShr(Vec, ShiftVec); + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); } static Value *SimplifyX86extend(const IntrinsicInst &II, @@ -776,6 +787,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + // Constant fold ashr( , Ci ). + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + if (Value *V = SimplifyX86immshift(*II, *Builder, false, false)) + return ReplaceInstUsesWith(*II, V); + break; + // Constant fold lshr( , Ci ). case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: @@ -789,7 +813,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: - if (Value *V = SimplifyX86immshift(*II, *Builder, false)) + if (Value *V = SimplifyX86immshift(*II, *Builder, true, false)) return ReplaceInstUsesWith(*II, V); break; @@ -806,7 +830,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: - if (Value *V = SimplifyX86immshift(*II, *Builder, true)) + if (Value *V = SimplifyX86immshift(*II, *Builder, true, true)) return ReplaceInstUsesWith(*II, V); break; diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll index 8794f8b8684..2951ec4b95d 100644 --- a/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -3,47 +3,6 @@ ; Verify that the backend correctly combines AVX2 builtin intrinsics. 
-define <8 x i32> @test_psra_1(<8 x i32> %A) { - %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3) - %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> ) - %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2) - ret <8 x i32> %3 -} -; CHECK-LABEL: test_psra_1 -; CHECK: vpsrad $8, %ymm0, %ymm0 -; CHECK-NEXT: ret - -define <16 x i16> @test_psra_2(<16 x i16> %A) { - %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3) - %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> ) - %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2) - ret <16 x i16> %3 -} -; CHECK-LABEL: test_psra_2 -; CHECK: vpsraw $8, %ymm0, %ymm0 -; CHECK-NEXT: ret - -define <16 x i16> @test_psra_3(<16 x i16> %A) { - %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0) - %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> ) - %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0) - ret <16 x i16> %3 -} -; CHECK-LABEL: test_psra_3 -; CHECK-NOT: vpsraw -; CHECK: ret - -define <8 x i32> @test_psra_4(<8 x i32> %A) { - %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0) - %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> ) - %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0) - ret <8 x i32> %3 -} -; CHECK-LABEL: test_psra_4 -; CHECK-NOT: vpsrad -; CHECK: ret - - define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) { %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1) ret <32 x i8> %res @@ -157,8 +116,4 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) -declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) -declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) -declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) diff --git a/test/CodeGen/X86/combine-sse2-intrinsics.ll b/test/CodeGen/X86/combine-sse2-intrinsics.ll deleted file mode 100644 index fa500e5d8d6..00000000000 --- a/test/CodeGen/X86/combine-sse2-intrinsics.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s -; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s - -; Verify that the backend correctly combines SSE2 builtin intrinsics. 
- - -define <4 x i32> @test_psra_1(<4 x i32> %A) { - %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3) - %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> ) - %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2) - ret <4 x i32> %3 -} -; CHECK-LABEL: test_psra_1 -; CHECK: psrad $8, %xmm0 -; CHECK-NEXT: ret - -define <8 x i16> @test_psra_2(<8 x i16> %A) { - %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3) - %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> ) - %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2) - ret <8 x i16> %3 -} -; CHECK-LABEL: test_psra_2 -; CHECK: psraw $8, %xmm0 -; CHECK-NEXT: ret - -define <4 x i32> @test_psra_3(<4 x i32> %A) { - %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0) - %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> ) - %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0) - ret <4 x i32> %3 -} -; CHECK-LABEL: test_psra_3 -; CHECK-NOT: psrad -; CHECK: ret - - -define <8 x i16> @test_psra_4(<8 x i16> %A) { - %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0) - %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> ) - %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0) - ret <8 x i16> %3 -} -; CHECK-LABEL: test_psra_4 -; CHECK-NOT: psraw -; CHECK: ret - - -declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) -declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) -declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) - diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/x86-vector-shifts.ll index fdfad4900f9..95700f08d81 100644 --- a/test/Transforms/InstCombine/x86-vector-shifts.ll +++ b/test/Transforms/InstCombine/x86-vector-shifts.ll @@ -1,6 +1,102 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +; +; ASHR - Immediate +; + +define <8 x i16> @sse2_psrai_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 0) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_15 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 15) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psrai_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_w_64 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psrai_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 0) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_15 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 15) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psrai_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psrai_d_64 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: 
ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 64) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psrai_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 0) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_15 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 15) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psrai_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_w_64 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %v, i32 64) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psrai_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 0) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_15 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 15) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psrai_d_64 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 64) + ret <8 x i32> %1 +} + ; ; LSHR - Immediate ; @@ -273,6 +369,134 @@ define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) nounwind readnone uwtable { ret <4 x i64> %1 } +; +; ASHR - Constant Vector +; + +define <8 x i16> @sse2_psra_w_0(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_0 +; CHECK-NEXT: ret <8 x i16> %v + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> zeroinitializer) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_15_splat(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_15_splat +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <8 x i16> @sse2_psra_w_64(<8 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_w_64 +; CHECK-NEXT: %1 = ashr <8 x i16> %v, +; CHECK-NEXT: ret <8 x i16> %1 + %1 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> ) + ret <8 x i16> %1 +} + +define <4 x i32> @sse2_psra_d_0(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_0 +; CHECK-NEXT: ret <4 x i32> %v + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> zeroinitializer) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_15(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> 
@sse2_psra_d_15_splat(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_15_splat +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <4 x i32> @sse2_psra_d_64(<4 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @sse2_psra_d_64 +; CHECK-NEXT: %1 = ashr <4 x i32> %v, +; CHECK-NEXT: ret <4 x i32> %1 + %1 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %v, <4 x i32> ) + ret <4 x i32> %1 +} + +define <16 x i16> @avx2_psra_w_0(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_0 +; CHECK-NEXT: ret <16 x i16> %v + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> zeroinitializer) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_15_splat(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_15_splat +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <16 x i16> @avx2_psra_w_64(<16 x i16> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_w_64 +; CHECK-NEXT: %1 = ashr <16 x i16> %v, +; CHECK-NEXT: ret <16 x i16> %1 + %1 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %v, <8 x i16> ) + ret <16 x i16> %1 +} + +define <8 x i32> @avx2_psra_d_0(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_0 +; CHECK-NEXT: ret <8 x i32> %v + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_15_splat(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_15_splat +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) nounwind readnone uwtable { +; CHECK-LABEL: @avx2_psra_d_64 +; CHECK-NEXT: %1 = ashr <8 x i32> %v, +; CHECK-NEXT: ret <8 x i32> %1 + %1 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> ) + ret <8 x i32> %1 +} + ; ; LSHR - Constant Vector ; @@ -605,6 +829,82 @@ define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) nounwind readnone uwtable { ; Constant Folding ; +define <8 x i16> @test_sse2_psra_w_0(<8 x i16> %A) { +; CHECK-LABEL: @test_sse2_psra_w_0 +; CHECK-NEXT: ret <8 x i16> %A + %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0) + %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> ) + %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0) + ret <8 x i16> %3 +} + +define <8 x i16> @test_sse2_psra_w_8() { +; CHECK-LABEL: @test_sse2_psra_w_8 +; CHECK-NEXT: ret <8 x i16> + %1 = bitcast <2 x i64> to <8 x i16> + %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %1, i32 3) + %3 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x 
i16> %2, <8 x i16> ) + %4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} + +define <4 x i32> @test_sse2_psra_d_0(<4 x i32> %A) { +; CHECK-LABEL: @test_sse2_psra_d_0 +; CHECK-NEXT: ret <4 x i32> %A + %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0) + %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> ) + %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 0) + ret <4 x i32> %3 +} + +define <4 x i32> @sse2_psra_d_8() { +; CHECK-LABEL: @sse2_psra_d_8 +; CHECK-NEXT: ret <4 x i32> + %1 = bitcast <2 x i64> to <4 x i32> + %2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %1, i32 3) + %3 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %2, <4 x i32> ) + %4 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} + +define <16 x i16> @test_avx2_psra_w_0(<16 x i16> %A) { +; CHECK-LABEL: @test_avx2_psra_w_0 +; CHECK-NEXT: ret <16 x i16> %A + %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0) + %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> ) + %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0) + ret <16 x i16> %3 +} + +define <16 x i16> @test_avx2_psra_w_8(<16 x i16> %A) { +; CHECK-LABEL: @test_avx2_psra_w_8 +; CHECK-NEXT: ret <16 x i16> + %1 = bitcast <4 x i64> to <16 x i16> + %2 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %1, i32 3) + %3 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %2, <8 x i16> ) + %4 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %3, i32 2) + ret <16 x i16> %4 +} + +define <8 x i32> @test_avx2_psra_d_0(<8 x i32> %A) { +; CHECK-LABEL: @test_avx2_psra_d_0 +; CHECK-NEXT: ret <8 x i32> %A + %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0) + %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> ) + %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0) + ret <8 x i32> %3 +} + +define <8 x i32> @test_avx2_psra_d_8() { +; CHECK-LABEL: @test_avx2_psra_d_8 +; CHECK-NEXT: ret <8 x i32> + %1 = bitcast <4 x i64> to <8 x i32> + %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %1, i32 3) + %3 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %2, <4 x i32> ) + %4 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %3, i32 2) + ret <8 x i32> %4 +} + define <2 x i64> @test_sse2_1() nounwind readnone uwtable { %S = bitcast i32 1 to i32 %1 = zext i32 %S to i64 @@ -814,4 +1114,13 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1 +declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1 +declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1 +declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1 +declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1 +declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1 +declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1 +declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1 +declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1 + attributes #1 = { nounwind readnone } -- 2.34.1
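
Illustrative sketch (not part of the commit): with this patch, InstCombine rewrites a
constant arithmetic-shift intrinsic into a generic IR ashr, clamping an out-of-range
count to BitWidth - 1 rather than folding to zero as the logical shifts do. The
function name below is hypothetical; the intrinsic and its signature are the ones
declared in the tests above.

; Before instcombine:
;   %r = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 64)
; After instcombine (the count 64 is clamped to the i16 element width minus one, 15):
define <8 x i16> @psrai_w_example(<8 x i16> %v) {
  %r = ashr <8 x i16> %v, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <8 x i16> %r
}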