From a5063429b2666581df88584093250f5af5a1f8a3 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 28 Dec 2015 21:16:55 +0000 Subject: [PATCH] [x86] lower calls to fmin and llvm.minnum.* using minss/minsd/minps/minpd (PR24475) This is a follow-on to: http://reviews.llvm.org/rL255700 http://reviews.llvm.org/rL256454 http://reviews.llvm.org/rL256510 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256522 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 16 ++- test/CodeGen/X86/fminnum.ll | 175 ++++++++++++++++++++++++----- 2 files changed, 159 insertions(+), 32 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 63fc0c4210b..0927c2f4fa5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1809,6 +1809,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); @@ -26917,8 +26918,8 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } -static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { if (Subtarget->useSoftFloat()) return SDValue(); @@ -26926,7 +26927,6 @@ static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG, // should be able to lower to FMAX/FMIN alone. // TODO: If an operand is already known to be a NaN or not a NaN, this // should be an optional swap and FMAX/FMIN. - // TODO: Allow fminnum. EVT VT = N->getValueType(0); if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || @@ -26957,19 +26957,21 @@ static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG, // // The SSE FP max/min instructions were not designed for this case, but rather // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 // Max = Op1 > Op0 ? Op1 : Op0 // // So they always return Op0 if either input is a NaN. However, we can still // use those instructions for fmaxnum by selecting away a NaN input. // If either operand is NaN, the 2nd source operand (Op0) is passed through. - SDValue Max = DAG.getNode(X86ISD::FMAX, DL, VT, Op1, Op0); + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, Max); + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::FAND nodes. @@ -27831,7 +27833,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case ISD::FMAXNUM: return performFMaxNumCombine(N, DAG, Subtarget); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll index e89ed32ad61..afe8b804f26 100644 --- a/test/CodeGen/X86/fminnum.ll +++ b/test/CodeGen/X86/fminnum.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AVX declare float @fminf(float, float) declare double @fmin(double, double) @@ -14,15 +14,45 @@ declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) +; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. + ; CHECK-LABEL: @test_fminf -; CHECK: jmp fminf +; SSE: movaps %xmm0, %xmm2 +; SSE-NEXT: cmpunordss %xmm2, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: minss %xmm0, %xmm1 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define float @test_fminf(float %x, float %y) { %z = call float @fminf(float %x, float %y) readnone ret float %z } +; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. + ; CHECK-LABEL: @test_fmin -; CHECK: jmp fmin +; SSE: movapd %xmm0, %xmm2 +; SSE-NEXT: cmpunordsd %xmm2, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: andpd %xmm1, %xmm3 +; SSE-NEXT: minsd %xmm0, %xmm1 +; SSE-NEXT: andnpd %xmm1, %xmm2 +; SSE-NEXT: orpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define double @test_fmin(double %x, double %y) { %z = call double @fmin(double %x, double %y) readnone ret double %z @@ -36,14 +66,40 @@ define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) { } ; CHECK-LABEL: @test_intrinsic_fminf -; CHECK: jmp fminf +; SSE: movaps %xmm0, %xmm2 +; SSE-NEXT: cmpunordss %xmm2, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: minss %xmm0, %xmm1 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define float @test_intrinsic_fminf(float %x, float %y) { %z = call float @llvm.minnum.f32(float %x, float %y) readnone ret float %z } ; CHECK-LABEL: @test_intrinsic_fmin -; CHECK: jmp fmin +; SSE: movapd %xmm0, %xmm2 +; SSE-NEXT: cmpunordsd %xmm2, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: andpd %xmm1, %xmm3 +; SSE-NEXT: minsd %xmm0, %xmm1 +; SSE-NEXT: andnpd %xmm1, %xmm2 +; SSE-NEXT: orpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define double @test_intrinsic_fmin(double %x, double %y) { %z = call double @llvm.minnum.f64(double %x, double %y) readnone ret double %z @@ -57,50 +113,117 @@ define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) { } ; CHECK-LABEL: @test_intrinsic_fmin_v2f32 -; CHECK: callq fminf -; CHECK: callq fminf +; SSE: movaps %xmm1, %xmm2 +; SSE-NEXT: minps %xmm0, %xmm2 +; SSE-NEXT: cmpunordps %xmm0, %xmm0 +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminps %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) { %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone ret <2 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v4f32 -; CHECK: callq fminf -; CHECK: callq fminf -; CHECK: callq fminf -; CHECK: callq fminf +; SSE: movaps %xmm1, %xmm2 +; SSE-NEXT: minps %xmm0, %xmm2 +; SSE-NEXT: cmpunordps %xmm0, %xmm0 +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminps %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) { %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone ret <4 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v2f64 -; CHECK: callq fmin -; CHECK: callq fmin +; SSE: movapd %xmm1, %xmm2 +; SSE-NEXT: minpd %xmm0, %xmm2 +; SSE-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm0, %xmm1 +; SSE-NEXT: andnpd %xmm2, %xmm0 +; SSE-NEXT: orpd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX: vminpd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) { %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone ret <2 x double> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v4f64 -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin +; SSE: movapd %xmm2, %xmm4 +; SSE-NEXT: minpd %xmm0, %xmm4 +; SSE-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm4, %xmm0 +; SSE-NEXT: orpd %xmm2, %xmm0 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: minpd %xmm1, %xmm2 +; SSE-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE-NEXT: andpd %xmm1, %xmm3 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX: vminpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 +; AVX-NEXT: retq define <4 x double> @test_intrinsic_fmin_v4f64(<4 x double> %x, <4 x double> %y) { %z = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone ret <4 x double> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v8f64 -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin -; CHECK: callq fmin +; SSE: movapd %xmm4, %xmm8 +; SSE-NEXT: minpd %xmm0, %xmm8 +; SSE-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm0, %xmm4 +; SSE-NEXT: andnpd %xmm8, %xmm0 +; SSE-NEXT: orpd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm5, %xmm4 +; SSE-NEXT: minpd %xmm1, %xmm4 +; SSE-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE-NEXT: andpd %xmm1, %xmm5 +; SSE-NEXT: andnpd %xmm4, %xmm1 +; SSE-NEXT: orpd %xmm5, %xmm1 +; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: minpd %xmm2, %xmm4 +; SSE-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE-NEXT: andpd %xmm2, %xmm6 +; SSE-NEXT: andnpd %xmm4, %xmm2 +; SSE-NEXT: orpd %xmm6, %xmm2 +; SSE-NEXT: movapd %xmm7, %xmm4 +; SSE-NEXT: minpd %xmm3, %xmm4 +; SSE-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE-NEXT: andpd %xmm3, %xmm7 +; SSE-NEXT: andnpd %xmm4, %xmm3 +; SSE-NEXT: orpd %xmm7, %xmm3 +; SSE-NEXT: retq +; +; AVX: vminpd %ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vminpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: retq define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) { %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone ret <8 x double> %z -- 2.34.1