canonicalize a v2f64 blendi of 2 registers

author Sanjay Patel <spatel@rotateright.com>

Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)
author Sanjay Patel <spatel@rotateright.com>
Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index da20cefdfbfade906c2595f9b316ab73999343a5..7acfc0e96f0a0f683d4c208e8a23e11685bd1f1c 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -23024,6 +23024,32 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
                       LoadScalarToVector, N->getOperand(2));
  }
  
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue V0 = N->getOperand(0);
+  SDValue V1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+  // operands and changing the mask to 1. This saves us a bunch of
+  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+  // X86InstrInfo knows how to commute this back after instruction selection
+  // if it would help register allocation.
+  // The swap is skipped when isShuffleFoldableLoad(V0) reports true.
+  // TODO: If optimizing for size or a processor that doesn't suffer from
+  // partial register update stalls, this should be transformed into a MOVSD
+  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+  if (VT == MVT::v2f64)
+    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+        SDValue NewMask = DAG.getConstant(1, MVT::i8);
+        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+      }
+
+  return SDValue();
+}
+
  // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
  // as "sbb reg,reg", since it can be extended without zext and produces
  // an all-ones bit which is more useful than 0/1 in some cases.
@@ -23440,6 +23466,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
        return PerformINSERTPSCombine(N, DAG, Subtarget);
      break;
    }
+  case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
    case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
    }
  
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 079487d19b4af62dab0e64055ee9cabb0f54ef67..c64d35cf71f0b08ee23065d01a55b0d434291939 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3161,8 +3161,7 @@ let isCodeGenOnly = 1 in {
  //   addss %xmm1, %xmm0
  
  // TODO: Some canonicalization in lowering would simplify the number of
-// patterns we have to try to match. In particular, the reversed order blends
-// seem unnecessary.
+// patterns we have to try to match.
  multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
    let Predicates = [UseSSE1] in {
      // extracted scalar math op with insert via movss
@@ -3263,16 +3262,9 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
      def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
            (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
        (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-          (v2f64 VR128:$dst), (i8 2))),
-      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
    }
  
-  // Repeat everything for AVX and add one more pattern
-  // (the scalar + blend reversed order) for good measure.
+  // Repeat everything for AVX.
    let Predicates = [HasAVX] in {
      // extracted scalar math op with insert via movsd
      def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
@@ -3288,13 +3280,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
        (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
            (COPY_TO_REGCLASS FR64:$src, VR128))>;
  
-    // extracted scalar math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
-          (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
-          FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
-      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
-          (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
      // vector math op with insert via movsd
      def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
            (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
@@ -3304,12 +3289,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
      def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
            (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
        (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
-    // vector math op with insert via blend (reversed order)
-    def : Pat<(v2f64 (X86Blendi
-          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
-          (v2f64 VR128:$dst), (i8 2))),
-      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
    }
  }
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Fri, 20 Feb 2015 16:55:27 +0000 (16:55 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
lib/Target/X86/X86InstrSSE.td		patch | blob | history