From aec5861bb6ace3734163c000cb75ca2e22e29caa Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nadav.rotem@intel.com>
Date: Tue, 13 Sep 2011 19:17:42 +0000
Subject: [PATCH] Add vselect target support for targets that do not support
 blend but do support xor/and/or (For example SSE2).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@139623 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorOps.cpp        | 43 ++++++++++++++++++-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  4 ++
 lib/Target/X86/X86ISelLowering.cpp            |  1 +
 test/CodeGen/X86/sse2-blend.ll                | 37 ++++++++++++++++
 4 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/X86/sse2-blend.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6af8e5b7a90..c192185ba55 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -61,6 +61,9 @@ class VectorLegalizer {
   // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
   // SINT_TO_FLOAT and SHR on vectors isn't legal.
   SDValue ExpandUINT_TO_FLOAT(SDValue Op);
+  // Implement vselect in terms of XOR, AND,OR when blend is not supported
+  //  by the target.
+  SDValue ExpandVSELECT(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
   // Implements vector promotion; this is essentially just bitcasting the
   // operands to a different type and bitcasting the result back to the
@@ -157,6 +160,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::CTLZ:
   case ISD::CTPOP:
   case ISD::SELECT:
+  case ISD::VSELECT:
   case ISD::SELECT_CC:
   case ISD::SETCC:
   case ISD::ZERO_EXTEND:
@@ -210,7 +214,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
     // FALL THROUGH
   }
   case TargetLowering::Expand:
-    if (Node->getOpcode() == ISD::UINT_TO_FP)
+    if (Node->getOpcode() == ISD::VSELECT)
+      Result = ExpandVSELECT(Op);
+    else if (Node->getOpcode() == ISD::UINT_TO_FP)
       Result = ExpandUINT_TO_FLOAT(Op);
     else if (Node->getOpcode() == ISD::FNEG)
       Result = ExpandFNEG(Op);
@@ -256,9 +262,42 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
   return DAG.getNode(ISD::BITCAST, dl, VT, Op);
 }
 
-SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
+SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
+  // Implement VSELECT in terms of XOR, AND, OR
+  // on platforms which do not support blend natively.
+  EVT VT =  Op.getOperand(0).getValueType();
+  EVT OVT = Op.getOperand(0).getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Mask = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+
+  // If we can't even use the basic vector operations of
+  // AND,OR,XOR, we will have to scalarize the op.
+  if (!TLI.isOperationLegalOrCustom(ISD::AND, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::XOR, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::OR, VT)) {
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
 
+  assert(VT.getSizeInBits() == OVT.getSizeInBits() && "Invalid mask size");
+  // Bitcast the operands to be the same type as the mask.
+  // This is needed when we select between FP types because
+  // the mask is a vector of integers.
+  Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
+  Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
 
+  SDValue AllOnes = DAG.getConstant(
+    APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()), VT);
+  SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes);
+
+  Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
+  Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
+  return DAG.getNode(ISD::OR, DL, VT, Op1, Op2);
+}
+
+SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
   EVT VT = Op.getOperand(0).getValueType();
   DebugLoc DL = Op.getDebugLoc();
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 71399cef82c..171400a7fee 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6440,6 +6440,10 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
                                 &Operands[0], Operands.size()));
       break;
+    case ISD::VSELECT:
+      Scalars.push_back(getNode(ISD::SELECT, dl, EltVT,
+                                &Operands[0], Operands.size()));
+      break;
     case ISD::SHL:
     case ISD::SRA:
     case ISD::SRL:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index efbc2ee7dd8..a2a140f1b8d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -727,6 +727,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
       setTruncStoreAction((MVT::SimpleValueType)VT,
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
new file mode 100644
index 00000000000..20b732508a9
--- /dev/null
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=x86 -mcpu=yonah -promote-elements -mattr=+sse2,-sse41
+
+define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
+  %A = load <4 x float>* %v1
+  %B = load <4 x float>* %v2
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+  store <4 x float > %vsel, <4 x float>* %v1
+  ret void
+}
+
+define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
+  %A = load <4 x i32>* %v1
+  %B = load <4 x i32>* %v2
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
+  store <4 x i32 > %vsel, <4 x i32>* %v1
+  ret void
+}
+
+
+define void@vsel_i64(<4 x i64>* %v1, <4 x i64>* %v2) {
+  %A = load <4 x i64>* %v1
+  %B = load <4 x i64>* %v2
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %A, <4 x i64> %B
+  store <4 x i64 > %vsel, <4 x i64>* %v1
+  ret void
+}
+
+
+define void@vsel_double(<4 x double>* %v1, <4 x double>* %v2) {
+  %A = load <4 x double>* %v1
+  %B = load <4 x double>* %v2
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %A, <4 x double> %B
+  store <4 x double > %vsel, <4 x double>* %v1
+  ret void
+}
+
+
-- 
2.34.1