From: Duncan Sands
Date: Thu, 22 Sep 2011 20:15:48 +0000 (+0000)
Subject: Synthesize SSE3/AVX 128 bit horizontal add/sub instructions from
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=17470bee5fd18bb2eae7825dae535c060a34ee7d;p=oota-llvm.git

Synthesize SSE3/AVX 128 bit horizontal add/sub instructions from
floating point add/sub of appropriate shuffle vectors.  Does not
synthesize the 256 bit AVX versions because they work differently.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@140332 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7fef8529a36..afb37c86f31 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1137,6 +1137,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
@@ -10647,6 +10649,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FMIN: return "X86ISD::FMIN";
   case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
   case X86ISD::FRCP: return "X86ISD::FRCP";
+  case X86ISD::FHADD: return "X86ISD::FHADD";
+  case X86ISD::FHSUB: return "X86ISD::FHSUB";
   case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
   case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
   case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
@@ -13738,6 +13742,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS
+/// is set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  EVT VT = LHS.getValueType();
+  unsigned N = VT.getVectorNumElements();
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 8> LMask(N);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      A = LHS.getOperand(0);
+    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      B = LHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+  } else {
+    if (LHS.getOpcode() != ISD::UNDEF)
+      A = LHS;
+    for (unsigned i = 0; i != N; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 8> RMask(N);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      C = RHS.getOperand(0);
+    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      D = RHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+  } else {
+    if (RHS.getOpcode() != ISD::UNDEF)
+      C = RHS;
+    for (unsigned i = 0; i != N; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    for (unsigned i = 0; i != N; ++i) {
+      unsigned Idx = RMask[i];
+      if (Idx < N)
+        RMask[i] += N;
+      else if (Idx < 2*N)
+        RMask[i] -= N;
+    }
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned i = 0; i != N; ++i) {
+    unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+    // Ignore any UNDEF components.
+    if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+        || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+      continue;
+
+    // Check that successive elements are being operated on.  If not, this is
+    // not a horizontal operation.
+    if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+        !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+      return false;
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
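+
+// Example: with v4f32,
+//   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+//   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+// and FADD LHS, RHS becomes X86ISD::FHADD A, B, which selects to haddps
+// (vhaddps with AVX).  For v2f64 the masks are <0, 2> and <1, 3> and the
+// result selects to haddpd.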
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, false))
+    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
 /// X86ISD::FXOR nodes.
 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -13975,6 +14123,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
   case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
   case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR: return PerformFORCombine(N, DAG);
   case X86ISD::FAND: return PerformFANDCombine(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 408d78f2d8f..90255b554ec 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -178,6 +178,12 @@ namespace llvm {
       /// BLEND family of opcodes
       BLENDV,
 
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
+
       /// FMAX, FMIN - Floating point max and min.
       ///
       FMAX, FMIN,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5a093eab52e..af919fba8ee 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -39,6 +39,8 @@ def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
 def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
 def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
 def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
 def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
 def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
 def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3d0525ca6ad..7bc7ab2d5a8 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4714,62 +4714,122 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
 
 // Horizontal ops
 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                   X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
   def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                  X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
   def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 
 let Predicates = [HasAVX] in {
   defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
-                         int_x86_sse3_hadd_ps, 0>, VEX_4V;
+                         X86fhadd, 0>, VEX_4V;
   defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
-                         int_x86_sse3_hadd_pd, 0>, VEX_4V;
+                         X86fhadd, 0>, VEX_4V;
   defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
-                         int_x86_sse3_hsub_ps, 0>, VEX_4V;
+                         X86fhsub, 0>, VEX_4V;
   defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
-                         int_x86_sse3_hsub_pd, 0>, VEX_4V;
+                         X86fhsub, 0>, VEX_4V;
   defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                          int_x86_avx_hadd_ps_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hadd_pd_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                          int_x86_avx_hsub_ps_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
   defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hsub_pd_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
+            (VHADDPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (VHADDPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
+            (VHADDPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (VHADDPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
+            (VHSUBPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (VHSUBPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
+            (VHSUBPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (VHSUBPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), VR256:$src2),
+            (VHADDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
+            (VHADDPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), VR256:$src2),
+            (VHADDPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
+            (VHADDPDYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), VR256:$src2),
+            (VHSUBPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
+            (VHSUBPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), VR256:$src2),
+            (VHSUBPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
+            (VHSUBPDYrm VR256:$src1, addr:$src2)>;
 }
 
 let Constraints = "$src1 = $dst" in {
-  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hadd_ps>;
-  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hadd_pd>;
-  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hsub_ps>;
-  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hsub_pd>;
+  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+}
+
+let Predicates = [HasSSE3] in {
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
+            (HADDPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (HADDPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
+            (HADDPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (HADDPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
+            (HSUBPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (HSUBPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
+            (HSUBPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (HSUBPDrm VR128:$src1, addr:$src2)>;
 }
 
 //===---------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
new file mode 100644
index 00000000000..91758ead636
--- /dev/null
+++ b/test/CodeGen/X86/haddsub.ll
@@ -0,0 +1,194 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,-avx | FileCheck %s -check-prefix=SSE3
+; RUN: llc < %s -march=x86-64 -mattr=-sse3,+avx | FileCheck %s -check-prefix=AVX
+
+; SSE3: haddpd1:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd1:
+; AVX: vhaddpd
+define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
+  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddpd2:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd2:
+; AVX: vhaddpd
+define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
+  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddpd3:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd3:
+; AVX: vhaddpd
+define <2 x double> @haddpd3(<2 x double> %x) {
+  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddps1:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps1:
+; AVX: vhaddps
+define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps2:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps2:
+; AVX: vhaddps
+define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps3:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps3:
+; AVX: vhaddps
+define <4 x float> @haddps3(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps4:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps4:
+; AVX: vhaddps
+define <4 x float> @haddps4(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps5:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps5:
+; AVX: vhaddps
+define <4 x float> @haddps5(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps6:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps6:
+; AVX: vhaddps
+define <4 x float> @haddps6(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps7:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps7:
+; AVX: vhaddps
+define <4 x float> @haddps7(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubpd1:
+; SSE3-NOT: vhsubpd
+; SSE3: hsubpd
+; AVX: hsubpd1:
+; AVX: vhsubpd
+define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
+  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
+  %r = fsub <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: hsubpd2:
+; SSE3-NOT: vhsubpd
+; SSE3: hsubpd
+; AVX: hsubpd2:
+; AVX: vhsubpd
+define <2 x double> @hsubpd2(<2 x double> %x) {
+  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %r = fsub <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: hsubps1:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps1:
+; AVX: vhsubps
+define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps2:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps2:
+; AVX: vhsubps
+define <4 x float> @hsubps2(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps3:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps3:
+; AVX: vhsubps
+define <4 x float> @hsubps3(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps4:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps4:
+; AVX: vhsubps
+define <4 x float> @hsubps4(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
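
For illustration (an editorial sketch in the spirit of the tests above, not part of the committed haddsub.ll; the function name @hsum is hypothetical): a complete horizontal sum of a <4 x float> can be phrased as two rounds of the shuffle+fadd pattern. Each round independently matches isHorizontalBinOp, so llc with -mattr=+sse3 should lower the whole reduction to two haddps instructions plus a scalar move, instead of a chain of shuffles and adds.

define float @hsum(<4 x float> %x) {
  ; Round 1: lanes 0..1 hold <x0+x1, x2+x3> - same shape as @haddps4 above.
  %a0 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b0 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %s0 = fadd <4 x float> %a0, %b0
  ; Round 2: lane 0 holds (x0+x1)+(x2+x3) - same shape as @haddps6 above.
  %a1 = shufflevector <4 x float> %s0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b1 = shufflevector <4 x float> %s0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %s1 = fadd <4 x float> %a1, %b1
  %r = extractelement <4 x float> %s1, i32 0
  ret float %r
}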