Lower vselects into X86ISD::BLENDI when appropriate.

author Filipe Cabecinhas <me@filcab.net>

Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)

committer Filipe Cabecinhas <me@filcab.net>

Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)
author Filipe Cabecinhas <me@filcab.net>
Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)
committer Filipe Cabecinhas <me@filcab.net>
Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 860af1284b23c397987c461e82a8d3cb6c51f97d..c6e7730f827164698a853e817b2702790bbf5401 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7971,7 +7971,87 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    return SDValue();
  }
  
+// Builds the blend immediate for a constant vselect condition. This function
+// assumes its argument is a BUILD_VECTOR of constant or undef SDNodes, i.e.
+// ISD::isBuildVectorOfConstantSDNodes(BuildVector) is true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // The blend for v16i16 must be symmetric across both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1; // -1: operand is not a constant.
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i; // Sets bit i when Lane1Cond is 1 or -1.
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false; // Both lanes are constants and they disagree.
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
  SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
    // Some types for vselect were previously set to Expand, not Legal or
    // Custom. Return an empty SDValue so we fall-through to Expand, after
    // the Custom lowering phase.
@@ -7984,7 +8064,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
      return SDValue();
    }
  
-  // This node is Legal.
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
    return Op;
  }
  
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll

index 5fcd5ff5f4c324eb762c6a0337ac3cbc4d94949b..4757ce0c9fae6f10e7ed2a106804e63a9e6c2c16 100644 (file)
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
  ; AVX128 tests:
  
  ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps    $5
  ;CHECK: ret
  define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
  
  
  ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps   $5
  ;CHECK: ret
  define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
  
  ;CHECK-LABEL: vsel_float8:
  ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps    $17
  ;CHECK: ret
  define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
    %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
  
  ;CHECK-LABEL: vsel_i328:
  ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps    $17
  ;CHECK-NEXT: ret
  define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
    %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -86,7 +86,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
  
  ;CHECK-LABEL: vsel_double4:
  ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vblendpd    $5
  ;CHECK-NEXT: ret
  define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
diff --git a/test/CodeGen/X86/avx2.ll b/test/CodeGen/X86/avx2.ll

new file mode 100644 (file)

index 0000000..290d0b6
--- /dev/null
+++ b/test/CodeGen/X86/avx2.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll

index 6b465963292e7c56f9f7ad670c47e3b0be539d4d..4e17a714bf57f4495f8b3454952b0c96fdc29c15 100644 (file)
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
  ; Verify that we produce movss instead of blendvps when possible.
  
  ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
  ;CHECK: movss
  ;CHECK: ret
  define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
  }
  
  ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
  ;CHECK: movss
  ;CHECK: ret
  define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,8 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
    ret <4 x i8> %vsel
  }
  
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
  ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
  ;CHECK: ret
  define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
    %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll

index 4681fde7548e93eeaa4bc473e6488a7655f447c2..951bb7dc854abd7fd8e3d351336d750386bd2509 100644 (file)
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
  
  ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
  ;CHECK: ret
  define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
  
  
  ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
  ;CHECK: ret
  define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
  }
  
  ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
  ;CHECK: ret
  define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
  
  
  ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
  ;CHECK: ret
  define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
    %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll

index db0d9c5c1168ddfa0fbd1aae9a144abb96e12354..3652d8c0d02292224d53d67b1b3c70c2ff83c162 100644 (file)
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -576,3 +576,11 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
    %res = select  <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
    ret <4 x float> %res
  }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}
author	Filipe Cabecinhas <me@filcab.net>
	Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)
committer	Filipe Cabecinhas <me@filcab.net>
	Fri, 16 May 2014 22:47:49 +0000 (22:47 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/avx-blend.ll		patch \| blob \| history
test/CodeGen/X86/avx2.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/blend-msb.ll		patch \| blob \| history
test/CodeGen/X86/sse41-blend.ll		patch \| blob \| history
test/CodeGen/X86/sse41.ll		patch \| blob \| history