From aec5861bb6ace3734163c000cb75ca2e22e29caa Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Tue, 13 Sep 2011 19:17:42 +0000 Subject: [PATCH] Add vselect target support for targets that do not support blend but do support xor/and/or (For example SSE2). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@139623 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeVectorOps.cpp | 43 ++++++++++++++++++- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 ++ lib/Target/X86/X86ISelLowering.cpp | 1 + test/CodeGen/X86/sse2-blend.ll | 37 ++++++++++++++++ 4 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/sse2-blend.ll diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 6af8e5b7a90..c192185ba55 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -61,6 +61,9 @@ class VectorLegalizer { // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if // SINT_TO_FLOAT and SHR on vectors isn't legal. SDValue ExpandUINT_TO_FLOAT(SDValue Op); + // Implement vselect in terms of XOR, AND,OR when blend is not supported + // by the target. + SDValue ExpandVSELECT(SDValue Op); SDValue ExpandFNEG(SDValue Op); // Implements vector promotion; this is essentially just bitcasting the // operands to a different type and bitcasting the result back to the @@ -157,6 +160,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::CTLZ: case ISD::CTPOP: case ISD::SELECT: + case ISD::VSELECT: case ISD::SELECT_CC: case ISD::SETCC: case ISD::ZERO_EXTEND: @@ -210,7 +214,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { // FALL THROUGH } case TargetLowering::Expand: - if (Node->getOpcode() == ISD::UINT_TO_FP) + if (Node->getOpcode() == ISD::VSELECT) + Result = ExpandVSELECT(Op); + else if (Node->getOpcode() == ISD::UINT_TO_FP) Result = ExpandUINT_TO_FLOAT(Op); else if (Node->getOpcode() == ISD::FNEG) Result = ExpandFNEG(Op); @@ -256,9 +262,42 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) { return DAG.getNode(ISD::BITCAST, dl, VT, Op); } -SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { +SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { + // Implement VSELECT in terms of XOR, AND, OR + // on platforms which do not support blend natively. + EVT VT = Op.getOperand(0).getValueType(); + EVT OVT = Op.getOperand(0).getValueType(); + DebugLoc DL = Op.getDebugLoc(); + + SDValue Mask = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + + // If we can't even use the basic vector operations of + // AND,OR,XOR, we will have to scalarize the op. + if (!TLI.isOperationLegalOrCustom(ISD::AND, VT) || + !TLI.isOperationLegalOrCustom(ISD::XOR, VT) || + !TLI.isOperationLegalOrCustom(ISD::OR, VT)) { + return DAG.UnrollVectorOp(Op.getNode()); + } + assert(VT.getSizeInBits() == OVT.getSizeInBits() && "Invalid mask size"); + // Bitcast the operands to be the same type as the mask. + // This is needed when we select between FP types because + // the mask is a vector of integers. + Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1); + Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2); + SDValue AllOnes = DAG.getConstant( + APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()), VT); + SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes); + + Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask); + Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask); + return DAG.getNode(ISD::OR, DL, VT, Op1, Op2); +} + +SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { EVT VT = Op.getOperand(0).getValueType(); DebugLoc DL = Op.getDebugLoc(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 71399cef82c..171400a7fee 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6440,6 +6440,10 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, &Operands[0], Operands.size())); break; + case ISD::VSELECT: + Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, + &Operands[0], Operands.size())); + break; case ISD::SHL: case ISD::SRA: case ISD::SRL: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index efbc2ee7dd8..a2a140f1b8d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -727,6 +727,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction((MVT::SimpleValueType)VT, diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll new file mode 100644 index 00000000000..20b732508a9 --- /dev/null +++ b/test/CodeGen/X86/sse2-blend.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=x86 -mcpu=yonah -promote-elements -mattr=+sse2,-sse41 + +define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) { + %A = load <4 x float>* %v1 + %B = load <4 x float>* %v2 + %vsel = select <4 x i1> , <4 x float> %A, <4 x float> %B + store <4 x float > %vsel, <4 x float>* %v1 + ret void +} + +define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { + %A = load <4 x i32>* %v1 + %B = load <4 x i32>* %v2 + %vsel = select <4 x i1> , <4 x i32> %A, <4 x i32> %B + store <4 x i32 > %vsel, <4 x i32>* %v1 + ret void +} + + +define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) { + %A = load <4 x i64>* %v1 + %B = load <4 x i64>* %v2 + %vsel = select <4 x i1> , <4 x i64> %A, <4 x i64> %B + store <4 x i64 > %vsel, <4 x i64>* %v1 + ret void +} + + +define void@vsel_double(<4 x double>* %v1, <4 x double>* %v2) { + %A = load <4 x double>* %v1 + %B = load <4 x double>* %v2 + %vsel = select <4 x i1> , <4 x double> %A, <4 x double> %B + store <4 x double > %vsel, <4 x double>* %v1 + ret void +} + + -- 2.34.1