From: Duncan Sands
Date: Thu, 22 Sep 2011 20:15:48 +0000 (+0000)
Subject: Synthesize SSE3/AVX 128 bit horizontal add/sub instructions from
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=17470bee5fd18bb2eae7825dae535c060a34ee7d;p=oota-llvm.git

Synthesize SSE3/AVX 128 bit horizontal add/sub instructions from
floating point add/sub of appropriate shuffle vectors.  Does not
synthesize the 256 bit AVX versions because they work differently.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@140332 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7fef8529a36..afb37c86f31 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1137,6 +1137,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
@@ -10647,6 +10649,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FMIN: return "X86ISD::FMIN";
   case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
   case X86ISD::FRCP: return "X86ISD::FRCP";
+  case X86ISD::FHADD: return "X86ISD::FHADD";
+  case X86ISD::FHSUB: return "X86ISD::FHSUB";
   case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
   case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
   case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
@@ -13738,6 +13742,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS
+/// is set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  EVT VT = LHS.getValueType();
+  unsigned N = VT.getVectorNumElements();
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 8> LMask(N);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      A = LHS.getOperand(0);
+    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      B = LHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+  } else {
+    if (LHS.getOpcode() != ISD::UNDEF)
+      A = LHS;
+    for (unsigned i = 0; i != N; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 8> RMask(N);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      C = RHS.getOperand(0);
+    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      D = RHS.getOperand(1);
+    cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+  } else {
+    if (RHS.getOpcode() != ISD::UNDEF)
+      C = RHS;
+    for (unsigned i = 0; i != N; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    for (unsigned i = 0; i != N; ++i) {
+      unsigned Idx = RMask[i];
+      if (Idx < N)
+        RMask[i] += N;
+      else if (Idx < 2*N)
+        RMask[i] -= N;
+    }
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned i = 0; i != N; ++i) {
+    unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+    // Ignore any UNDEF components.
+    if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+        || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+      continue;
+
+    // Check that successive elements are being operated on.  If not, this is
+    // not a horizontal operation.
+    if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+        !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+      return false;
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
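+
+// Example: with v4f32,
+//   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+//   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+// and FADD LHS, RHS becomes X86ISD::FHADD A, B, which selects to haddps
+// (vhaddps with AVX).  For v2f64 the masks are <0, 2> and <1, 3> and the
+// result selects to haddpd.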
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+      isHorizontalBinOp(LHS, RHS, false))
+    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
 /// X86ISD::FXOR nodes.
 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -13975,6 +14123,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
   case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
   case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR: return PerformFORCombine(N, DAG);
   case X86ISD::FAND: return PerformFANDCombine(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 408d78f2d8f..90255b554ec 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -178,6 +178,12 @@ namespace llvm {
       /// BLEND family of opcodes
       BLENDV,
 
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
+
       /// FMAX, FMIN - Floating point max and min.
       ///
       FMAX, FMIN,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5a093eab52e..af919fba8ee 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -39,6 +39,8 @@ def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
 def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
 def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
 def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
 def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
 def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
 def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3d0525ca6ad..7bc7ab2d5a8 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4714,62 +4714,122 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
 
 // Horizontal ops
 multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                   X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
   def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
-                  X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
   def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
   def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
        !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
 }
 
 let Predicates = [HasAVX] in {
   defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
-                         int_x86_sse3_hadd_ps, 0>, VEX_4V;
+                         X86fhadd, 0>, VEX_4V;
   defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
-                         int_x86_sse3_hadd_pd, 0>, VEX_4V;
+                         X86fhadd, 0>, VEX_4V;
   defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
-                         int_x86_sse3_hsub_ps, 0>, VEX_4V;
+                         X86fhsub, 0>, VEX_4V;
   defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
-                         int_x86_sse3_hsub_pd, 0>, VEX_4V;
+                         X86fhsub, 0>, VEX_4V;
   defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                          int_x86_avx_hadd_ps_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hadd_pd_256, 0>, VEX_4V;
+                          X86fhadd, 0>, VEX_4V;
   defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                          int_x86_avx_hsub_ps_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
   defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                          int_x86_avx_hsub_pd_256, 0>, VEX_4V;
+                          X86fhsub, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
+            (VHADDPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (VHADDPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
+            (VHADDPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (VHADDPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
+            (VHSUBPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (VHSUBPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
+            (VHSUBPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (VHSUBPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), VR256:$src2),
+            (VHADDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hadd_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
+            (VHADDPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), VR256:$src2),
+            (VHADDPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hadd_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
+            (VHADDPDYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), VR256:$src2),
+            (VHSUBPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hsub_ps_256 (v8f32 VR256:$src1), (memop addr:$src2)),
+            (VHSUBPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), VR256:$src2),
+            (VHSUBPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(int_x86_avx_hsub_pd_256 (v4f64 VR256:$src1), (memop addr:$src2)),
+            (VHSUBPDYrm VR256:$src1, addr:$src2)>;
 }
 
 let Constraints = "$src1 = $dst" in {
-  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hadd_ps>;
-  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hadd_pd>;
-  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
-                        int_x86_sse3_hsub_ps>;
-  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
-                       int_x86_sse3_hsub_pd>;
+  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+}
+
+let Predicates = [HasSSE3] in {
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), VR128:$src2),
+            (HADDPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (HADDPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), VR128:$src2),
+            (HADDPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hadd_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (HADDPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), VR128:$src2),
+            (HSUBPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_ps (v4f32 VR128:$src1), (memop addr:$src2)),
+            (HSUBPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), VR128:$src2),
+            (HSUBPDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(int_x86_sse3_hsub_pd (v2f64 VR128:$src1), (memop addr:$src2)),
+            (HSUBPDrm VR128:$src1, addr:$src2)>;
 }
 
 //===---------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
new file mode 100644
index 00000000000..91758ead636
--- /dev/null
+++ b/test/CodeGen/X86/haddsub.ll
@@ -0,0 +1,194 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,-avx | FileCheck %s -check-prefix=SSE3
+; RUN: llc < %s -march=x86-64 -mattr=-sse3,+avx | FileCheck %s -check-prefix=AVX
+
+; SSE3: haddpd1:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd1:
+; AVX: vhaddpd
+define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
+  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddpd2:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd2:
+; AVX: vhaddpd
+define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
+  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddpd3:
+; SSE3-NOT: vhaddpd
+; SSE3: haddpd
+; AVX: haddpd3:
+; AVX: vhaddpd
+define <2 x double> @haddpd3(<2 x double> %x) {
+  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %r = fadd <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: haddps1:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps1:
+; AVX: vhaddps
+define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps2:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps2:
+; AVX: vhaddps
+define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps3:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps3:
+; AVX: vhaddps
+define <4 x float> @haddps3(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps4:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps4:
+; AVX: vhaddps
+define <4 x float> @haddps4(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps5:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps5:
+; AVX: vhaddps
+define <4 x float> @haddps5(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps6:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps6:
+; AVX: vhaddps
+define <4 x float> @haddps6(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: haddps7:
+; SSE3-NOT: vhaddps
+; SSE3: haddps
+; AVX: haddps7:
+; AVX: vhaddps
+define <4 x float> @haddps7(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
+  %r = fadd <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubpd1:
+; SSE3-NOT: vhsubpd
+; SSE3: hsubpd
+; AVX: hsubpd1:
+; AVX: vhsubpd
+define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
+  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
+  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
+  %r = fsub <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: hsubpd2:
+; SSE3-NOT: vhsubpd
+; SSE3: hsubpd
+; AVX: hsubpd2:
+; AVX: vhsubpd
+define <2 x double> @hsubpd2(<2 x double> %x) {
+  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %r = fsub <2 x double> %a, %b
+  ret <2 x double> %r
+}
+
+; SSE3: hsubps1:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps1:
+; AVX: vhsubps
+define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
+  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps2:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps2:
+; AVX: vhsubps
+define <4 x float> @hsubps2(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps3:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps3:
+; AVX: vhsubps
+define <4 x float> @hsubps3(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
+
+; SSE3: hsubps4:
+; SSE3-NOT: vhsubps
+; SSE3: hsubps
+; AVX: hsubps4:
+; AVX: vhsubps
+define <4 x float> @hsubps4(<4 x float> %x) {
+  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %r = fsub <4 x float> %a, %b
+  ret <4 x float> %r
+}
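
For illustration (an editorial sketch in the spirit of the tests above, not part of the committed haddsub.ll; the function name @hsum is hypothetical): a complete horizontal sum of a <4 x float> can be phrased as two rounds of the shuffle+fadd pattern. Each round independently matches isHorizontalBinOp, so llc with -mattr=+sse3 should lower the whole reduction to two haddps instructions plus a scalar move, instead of a chain of shuffles and adds.

define float @hsum(<4 x float> %x) {
  ; Round 1: lanes 0..1 hold <x0+x1, x2+x3> - same shape as @haddps4 above.
  %a0 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b0 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %s0 = fadd <4 x float> %a0, %b0
  ; Round 2: lane 0 holds (x0+x1)+(x2+x3) - same shape as @haddps6 above.
  %a1 = shufflevector <4 x float> %s0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b1 = shufflevector <4 x float> %s0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %s1 = fadd <4 x float> %a1, %b1
  %r = extractelement <4 x float> %s1, i32 0
  ret float %r
}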