From d6de44078ba8eb484271e3ab6dd2fd9f59c93920 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 10 Jan 2016 09:41:22 +0000 Subject: [PATCH] Optimized instruction sequence for sitofp operation on X86-32 Optimized sitofp i64 %x to double. The current sequence movl %ecx, 8(%esp) movl %edx, 12(%esp) fildll 8(%esp) is replaced with: movd %ecx, %xmm0 movd %edx, %xmm1 punpckldq %xmm1, %xmm0 movq %xmm0, 8(%esp) fildll 8(%esp) Differential Revision: http://reviews.llvm.org/D15946 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257285 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 58 +++++++++++++++++++++------- test/CodeGen/X86/dagcombine-cse.ll | 2 +- test/CodeGen/X86/scalar-int-to-fp.ll | 43 +++++++++++++++++++++ 3 files changed, 87 insertions(+), 16 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 17ac3da53f5..9702eb860ac 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -265,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes @@ -12672,13 +12673,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. 
+ ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); @@ -13051,7 +13060,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -19536,24 +19551,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. 
return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - MVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. 
Elts.append(NumElts, DAG.getUNDEF(SVT)); diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll index be1dcff7ae8..bff0e64910b 100644 --- a/test/CodeGen/X86/dagcombine-cse.ll +++ b/test/CodeGen/X86/dagcombine-cse.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14 +; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 13 define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind { entry: diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll index 93039859cdf..4a16c3198aa 100644 --- a/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/test/CodeGen/X86/scalar-int-to-fp.ll @@ -74,9 +74,16 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind { } ; CHECK-LABEL: u64_to_f +; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp) ; AVX512_32: fildll + ; AVX512_64: vcvtusi2ssq + +; SSE2_32: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2_32: movq %xmm0, {{[0-9]+}}(%esp) ; SSE2_32: fildll + ; SSE2_64: cvtsi2ssq ; X87: fildll define float @u64_to_f(i64 %a) nounwind { @@ -95,6 +102,24 @@ define float @s64_to_f(i64 %a) nounwind { ret float %r } +; CHECK-LABEL: s64_to_f_2 +; SSE2_32: movd %ecx, %xmm0 +; SSE2_32: movd %eax, %xmm1 +; SSE2_32: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp) +; SSE2_32: fildll {{[0-9]+}}(%esp) + +; AVX512_32: vmovd %eax, %xmm0 +; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp) +; AVX512_32: fildll {{[0-9]+}}(%esp) + +define float @s64_to_f_2(i64 %a) nounwind { + %a1 = add i64 %a, 5 + %r = sitofp i64 %a1 to float + ret float %r +} + ; CHECK-LABEL: u64_to_d ; AVX512_32: vpunpckldq ; AVX512_64: vcvtusi2sdq @@ -117,6 +142,24 @@ define double @s64_to_d(i64 %a) nounwind { ret double %r } +; CHECK-LABEL: s64_to_d_2 
+; SSE2_32: movd %ecx, %xmm0 +; SSE2_32: movd %eax, %xmm1 +; SSE2_32: punpckldq %xmm0, %xmm1 +; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp) +; SSE2_32: fildll + +; AVX512_32: vmovd %eax, %xmm0 +; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp) +; AVX512_32: fildll + +define double @s64_to_d_2(i64 %a) nounwind { + %b = add i64 %a, 5 + %f = sitofp i64 %b to double + ret double %f +} + ; CHECK-LABEL: u64_to_x ; CHECK: fildll define x86_fp80 @u64_to_x(i64 %a) nounwind { -- 2.34.1