From 6ebf741ea20beec72e6c898c99b601b8c64d370c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Jun 2015 22:43:34 +0000 Subject: [PATCH] [X86][SSE] Improved support for vector i16 to float conversions. Added explicit sign extension for v4i16/v8i16 to v4i32/v8i32 before conversion to floats. Matches existing support for v4i8/v8i8. Follow up to D10433 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239966 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 17 ++++++------ test/CodeGen/X86/vec_int_to_fp.ll | 43 ++++++------------------------ 2 files changed, 17 insertions(+), 43 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9b89082df5f..2773ffb0003 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -24690,18 +24690,19 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT InVT = Op0->getValueType(0); - // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) + // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8 || + InVT == MVT::v8i16 || InVT == MVT::v4i16) { SDLoc dl(N); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + MVT DstVT = MVT::getVectorVT(MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); } @@ -24710,7 +24711,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); - EVT VT = Ld->getValueType(0); + EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 if (N->getValueType(0) == MVT::f16) @@ -24718,9 +24719,9 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, if (!Ld->isVolatile() && !N->getValueType(0).isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget->is64Bit() && VT == MVT::i64) { + !Subtarget->is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( - SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index ad6ac435533..596cbbda2ff 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -176,38 +176,16 @@ define <4 x double> @sitofp_4vf64_i32(<4 x i32> %a) { define <4 x double> @sitofp_4vf64_i16(<8 x i16> %a) { ; SSE2-LABEL: sitofp_4vf64_i16: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: movswq %ax, %rax -; SSE2-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: movswq %ax, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movswq %ax, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movswq %ax, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: sitofp_4vf64_i16: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX-NEXT: retq %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -632,16 +610,13 @@ define <4 x float> @sitofp_4vf32_i16(<8 x i16> %a) { ; SSE2-LABEL: sitofp_4vf32_i16: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: sitofp_4vf32_i16: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -728,13 +703,11 @@ define <4 x float> @sitofp_4vf32_4i64(<4 x i64> %a) { define <8 x float> @sitofp_8vf32_i16(<8 x i16> %a) { ; SSE2-LABEL: sitofp_8vf32_i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 -- 2.34.1