From c6058e24621bc157fdde3eaf6e2bc7558d189787 Mon Sep 17 00:00:00 2001
From: Jim Grosbach
Date: Fri, 18 Jul 2014 00:40:56 +0000
Subject: [PATCH] X86: Constant fold converting vector setcc results to float.

Since the result of a SETCC for X86 is 0 or -1 in each lane, we can
move unary operations, in this case [su]int_to_fp, through the mask
operation and constant fold the operation away. Generally speaking:

  UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  AND(VECTOR_CMP(x,y), constant2)

where constant2 is UNARYOP(constant).

This implements the transform where UNARYOP is [su]int_to_fp.

For example, consider the simple function:

define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
  %cmp = fcmp oeq <4 x float> %val, %test
  %ext = zext <4 x i1> %cmp to <4 x i32>
  %result = sitofp <4 x i32> %ext to <4 x float>
  ret <4 x float> %result
}

Before this change, the SSE code is generated as:

LCPI0_0:
	.long	1                       ## 0x1
	.long	1                       ## 0x1
	.long	1                       ## 0x1
	.long	1                       ## 0x1
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_foo
	.align	4, 0x90
_foo:                                   ## @foo
	cmpeqps	%xmm1, %xmm0
	andps	LCPI0_0(%rip), %xmm0
	cvtdq2ps	%xmm0, %xmm0
	retq

After, the code is improved to:

LCPI0_0:
	.long	1065353216              ## float 1.000000e+00
	.long	1065353216              ## float 1.000000e+00
	.long	1065353216              ## float 1.000000e+00
	.long	1065353216              ## float 1.000000e+00
	.section	__TEXT,__text,regular,pure_instructions
	.globl	_foo
	.align	4, 0x90
_foo:                                   ## @foo
	cmpeqps	%xmm1, %xmm0
	andps	LCPI0_0(%rip), %xmm0
	retq

The cvtdq2ps has been constant folded away and the floating point 1.0f
vector lanes are materialized directly via the ModRM operand of andps.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213342 91177308-0d34-0410-b5e6-96231b3b80d8
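For illustration (a standalone sketch, not part of the commit): a minimal
C++ program showing where the new constant-pool value comes from. Converting
the original mask's lane value 1 to float and reading back its bit pattern
reproduces the 1065353216 (0x3F800000) entries shown above.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  int32_t SplatLane = 1;  // lane value of the original integer mask constant
  // UNARYOP(constant): sitofp folds the lane to 1.0f at compile time.
  float Folded = static_cast<float>(SplatLane);
  uint32_t Bits;
  std::memcpy(&Bits, &Folded, sizeof(Bits));  // reinterpret the float's bits
  std::printf("%u\n", Bits);                  // prints 1065353216 (0x3F800000)
  return 0;
}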
---
 lib/Target/X86/X86ISelLowering.cpp            | 51 +++++++++++++++++++
 .../X86/x86-setcc-int-to-fp-combine.ll        | 18 +++++++
 2 files changed, 69 insertions(+)
 create mode 100644 test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e8fe5c9da6a..ae123569a89 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21847,8 +21847,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //    AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation or if the operand of the
+  // unary operation isn't a bitwise AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                         const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
   SDValue Op0 = N->getOperand(0);
   EVT InVT = Op0->getValueType(0);
 
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
new file mode 100644
index 00000000000..d691685f1dd
--- /dev/null
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI0_0
+; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
+; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
+; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
+; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
+; CHECK-LABEL: foo:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: retq
+
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x float>
+  ret <4 x float> %result
+}
-- 
2.34.1
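For illustration (a standalone scalar C++ sketch, not part of the commit):
the lane-wise behavior the transformed sequence relies on. The compare
produces an all-ones or all-zeros mask per lane, so ANDing it with the
pre-converted float constant yields 1.0f or 0.0f directly, with no run-time
int-to-float conversion.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float Val[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
  float Test[4] = {1.0f, 0.0f, 3.0f, 0.0f};
  const uint32_t FoldedOne = 0x3F800000;  // bits of 1.0f, the folded constant

  for (int I = 0; I != 4; ++I) {
    // cmpeqps: a lane becomes all-ones on equality, all-zeros otherwise.
    uint32_t Mask = (Val[I] == Test[I]) ? 0xFFFFFFFFu : 0u;
    // andps against the folded constant selects 1.0f or 0.0f per lane.
    uint32_t Bits = Mask & FoldedOne;
    float Result;
    std::memcpy(&Result, &Bits, sizeof(Result));
    std::printf("lane %d: %f\n", I, Result);  // 1.0 where equal, else 0.0
  }
  return 0;
}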