From 44c2d61b6703469a95fcd2d5397c5d09a67e75c1 Mon Sep 17 00:00:00 2001
From: Michael Liao
Date: Wed, 10 Oct 2012 16:53:28 +0000
Subject: [PATCH] Add support for FP_ROUND from v2f64 to v2f32

- ISD::FP_ROUND currently requires the source and result vectors to have
  matching element counts, so rounding from v2f64 to v4f32 (the widened
  form of v2f32 after type legalization) is scalarized. Add custom v2f32
  widening that converts the node into the target-specific
  X86ISD::VFPROUND to work around this constraint.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@165631 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      |  7 +++
 lib/Target/X86/X86ISelLowering.h        |  3 ++
 lib/Target/X86/X86InstrFragmentsSIMD.td |  3 ++
 lib/Target/X86/X86InstrSSE.td           | 11 ++++-
 test/CodeGen/X86/fp-load-trunc.ll       | 61 +++++++++++++++++++++++++
 test/CodeGen/X86/fp-trunc.ll            | 51 +++++++++++++++------
 6 files changed, 121 insertions(+), 15 deletions(-)
 create mode 100644 test/CodeGen/X86/fp-load-trunc.ll
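Background on the instruction semantics this relies on: CVTPD2PS rounds the
two doubles of its source into the low two float lanes of the xmm
destination and zeroes the upper two lanes, which is exactly the widened
v2f64 -> v4f32 shape given to X86ISD::VFPROUND. A minimal C++ sketch using
the corresponding SSE2 intrinsic (the helper name round_two_doubles is made
up for illustration and is not part of the patch):

    #include <emmintrin.h>

    // CVTPD2PS: lanes 0-1 hold float(v[0]) and float(v[1]);
    // lanes 2-3 of the result are zeroed.
    __m128 round_two_doubles(__m128d v) {
      return _mm_cvtpd_ps(v);
    }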
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d551d9d0275..7df10980fd9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -940,6 +940,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 
     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 
     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   }
@@ -11468,6 +11469,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     return;
   }
+  case ISD::FP_ROUND: {
+    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+    Results.push_back(V);
+    return;
+  }
   case ISD::READCYCLECOUNTER: {
     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue TheChain = N->getOperand(0);
@@ -11662,6 +11668,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
+  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   case X86ISD::VSHL:               return "X86ISD::VSHL";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index dca65b895e2..b654d5f2756 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -233,6 +233,9 @@ namespace llvm {
       // VFPEXT - Vector FP extend.
       VFPEXT,
 
+      // VFPROUND - Vector FP round.
+      VFPROUND,
+
       // VSHL, VSRL - 128-bit vector logical left / right shift
       VSHLDQ, VSRLDQ,
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 90354354367..46281efa571 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -93,6 +93,9 @@ def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
 def X86vfpext : SDNode<"X86ISD::VFPEXT",
                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
 def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
 def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2aa4f3f4dbb..cc1291a8a0f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2125,6 +2125,10 @@ let Predicates = [HasAVX] in {
             (VCVTDQ2PSYrm addr:$src)>;
   // Match fround and fextend for 128/256-bit conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (VCVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (VCVTPD2PSXrm addr:$src)>;
   def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
             (VCVTPD2PSYrr VR256:$src)>;
   def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
             (VCVTPD2PSYrm addr:$src)>;
@@ -2139,7 +2143,12 @@
 }
 
 let Predicates = [UseSSE2] in {
-  // Match fextend for 128 conversions
+  // Match fround and fextend for 128 conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (CVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (CVTPD2PSrm addr:$src)>;
+
   def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
             (CVTPS2PDrr VR128:$src)>;
 }
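The TableGen patterns above map the new node straight onto (V)CVTPD2PS. As a
scalar model of the value those patterns assume (illustration only, not part
of the patch; the helper name vfpround_model is made up):

    #include <array>

    // v2f64 operand in, widened v4f32 result out: the two rounded floats
    // in the low lanes and zeros above, matching the CVTPD2PS output.
    std::array<float, 4> vfpround_model(const std::array<double, 2> &src) {
      return {static_cast<float>(src[0]), static_cast<float>(src[1]),
              0.0f, 0.0f};
    }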
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
new file mode 100644
index 00000000000..8430a31b3a8
--- /dev/null
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s --check-prefix=AVX
+
+define <1 x float> @test1(<1 x double>* %p) nounwind {
+; CHECK: test1
+; CHECK: cvtsd2ss
+; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
+  %x = load <1 x double>* %p
+  %y = fptrunc <1 x double> %x to <1 x float>
+  ret <1 x float> %y
+}
+
+define <2 x float> @test2(<2 x double>* %p) nounwind {
+; CHECK: test2
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: ret
+; AVX: test2
+; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}})
+; AVX: ret
+  %x = load <2 x double>* %p
+  %y = fptrunc <2 x double> %x to <2 x float>
+  ret <2 x float> %y
+}
+
+define <4 x float> @test3(<4 x double>* %p) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: ret
+  %x = load <4 x double>* %p
+  %y = fptrunc <4 x double> %x to <4 x float>
+  ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double>* %p) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vinsertf128
+; AVX: ret
+  %x = load <8 x double>* %p
+  %y = fptrunc <8 x double> %x to <8 x float>
+  ret <8 x float> %y
+}
+
+
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 170637a40ee..544fa537cca 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,33 +1,56 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double> %x) nounwind {
+; CHECK: test1
 ; CHECK: cvtsd2ss
 ; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
-
 define <2 x float> @test2(<2 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+; CHECK: test2
+; CHECK: cvtpd2ps
 ; CHECK: ret
+; AVX: test2
+; AVX-NOT: vcvtpd2psy
+; AVX: vcvtpd2ps
+; AVX: ret
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
-define <8 x float> @test3(<8 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a series of cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+define <4 x float> @test3(<4 x double> %x) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy
+; AVX: ret
+  %y = fptrunc <4 x double> %x to <4 x float>
+  ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double> %x) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
 ; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
+; AVX: vinsertf128
+; AVX: ret
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y
 }
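On the AVX checks: vcvtpd2psy is the 256-bit-source form, converting four
doubles to four floats in one instruction, which is why test3 needs no
movlhps under AVX and test4 combines its two 128-bit halves with a single
vinsertf128. A minimal C++ sketch with the matching AVX intrinsic (the
helper name round_four_doubles is made up for illustration):

    #include <immintrin.h>

    // VCVTPD2PS, 256-bit source: four doubles in a ymm register are
    // rounded to four floats in an xmm register.
    __m128 round_four_doubles(__m256d v) {
      return _mm256_cvtpd_ps(v);
    }

-- 
2.34.1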