From: Andrea Di Biagio
Date: Thu, 19 Jun 2014 10:29:41 +0000 (+0000)
Subject: [X86] Teach how to combine horizontal binop even in the presence of undefs.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=cfdf8052865b01e8b8d321640c3f51ff938cc3c4;p=oota-llvm.git

[X86] Teach how to combine horizontal binop even in the presence of undefs.

Before this change, the backend was unable to fold a build_vector dag
node with UNDEF operands into a single horizontal add/sub.

This patch teaches the backend how to combine a build_vector with UNDEF
operands into a horizontal add/sub when possible. The algorithm
conservatively avoids combining a build_vector with only a single
non-UNDEF operand.

Added test haddsub-undef.ll to verify that we correctly fold horizontal
binops even in the presence of UNDEFs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211265 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 851607eac96..a7b6e707816 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6077,21 +6077,35 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
 /// This function only analyzes elements of \p N whose indices are
 /// in range [BaseIdx, LastIdx).
 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+                              SelectionDAG &DAG,
                               unsigned BaseIdx, unsigned LastIdx, SDValue &V0,
                               SDValue &V1) {
+  EVT VT = N->getValueType(0);
+
   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
-  assert(N->getValueType(0).isVector() &&
-         N->getValueType(0).getVectorNumElements() >= LastIdx &&
+  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
          "Invalid Vector in input!");
 
   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
   bool CanFold = true;
   unsigned ExpectedVExtractIdx = BaseIdx;
   unsigned NumElts = LastIdx - BaseIdx;
+  V0 = DAG.getUNDEF(VT);
+  V1 = DAG.getUNDEF(VT);
 
   // Check if N implements a horizontal binop.
   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
     SDValue Op = N->getOperand(i + BaseIdx);
+
+    // Skip UNDEFs.
+    if (Op->getOpcode() == ISD::UNDEF) {
+      // Update the expected vector extract index.
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+      ExpectedVExtractIdx += 2;
+      continue;
+    }
+
     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
 
     if (!CanFold)
@@ -6112,12 +6126,15 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
-
-    if (i == 0)
-      V0 = Op0.getOperand(0);
-    else if (i * 2 == NumElts) {
-      V1 = Op0.getOperand(0);
-      ExpectedVExtractIdx = BaseIdx;
+
+    if (i * 2 < NumElts) {
+      if (V0.getOpcode() == ISD::UNDEF)
+        V0 = Op0.getOperand(0);
+    } else {
+      if (V1.getOpcode() == ISD::UNDEF)
+        V1 = Op0.getOperand(0);
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
     }
 
     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
@@ -6163,9 +6180,14 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
 /// Example:
 ///   HADD V0_LO, V1_LO
 ///   HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                      SDLoc DL, SelectionDAG &DAG,
-                                     unsigned X86Opcode, bool Mode) {
+                                     unsigned X86Opcode, bool Mode,
+                                     bool isUndefLO, bool isUndefHI) {
   EVT VT = V0.getValueType();
   assert(VT.is256BitVector() && VT == V1.getValueType() &&
          "Invalid nodes in input!");
@@ -6177,13 +6199,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
   EVT NewVT = V0_LO.getValueType();
 
-  SDValue LO, HI;
+  SDValue LO = DAG.getUNDEF(NewVT);
+  SDValue HI = DAG.getUNDEF(NewVT);
+
   if (Mode) {
-    LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
-    HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+    if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
   } else {
-    LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
-    HI = DAG.getNode(X86Opcode, DL, NewVT, V1_HI, V1_HI);
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
+                       V1_LO->getOpcode() != ISD::UNDEF))
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+    if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
+                       V1_HI->getOpcode() != ISD::UNDEF))
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
   }
 
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
@@ -6198,19 +6231,37 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
   SDValue InVec0, InVec1;
 
   // Try to match horizontal ADD/SUB.
+  unsigned NumUndefsLO = 0;
+  unsigned NumUndefsHI = 0;
+  unsigned Half = NumElts/2;
+
+  // Count the number of UNDEF operands in the build_vector in input.
+  for (unsigned i = 0, e = Half; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsLO++;
+
+  for (unsigned i = Half, e = NumElts; i != e; ++i)
+    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+      NumUndefsHI++;
+
+  // Early exit if this is either a build_vector of all UNDEFs or all the
+  // operands but one are UNDEF.
+  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+    return SDValue();
+
   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
     // Try to match an SSE3 float HADD/HSUB.
-    if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1))
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
 
-    if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1))
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
     // Try to match an SSSE3 integer HADD/HSUB.
-    if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1))
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
 
-    if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1))
+    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
   }
 
@@ -6221,16 +6272,20 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
     // Try to match an AVX horizontal add/sub of packed single/double
     // precision floating point values from 256-bit vectors.
     SDValue InVec2, InVec3;
-    if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts/2, InVec0, InVec1) &&
-        isHorizontalBinOp(BV, ISD::FADD, NumElts/2, NumElts, InVec2, InVec3) &&
-        InVec0.getNode() == InVec2.getNode() &&
-        InVec1.getNode() == InVec3.getNode())
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
 
-    if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts/2, InVec0, InVec1) &&
-        isHorizontalBinOp(BV, ISD::FSUB, NumElts/2, NumElts, InVec2, InVec3) &&
-        InVec0.getNode() == InVec2.getNode() &&
-        InVec1.getNode() == InVec3.getNode())
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
     // Try to match an AVX2 horizontal add/sub of signed integers.
@@ -6238,15 +6293,19 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
     unsigned X86Opcode;
     bool CanFold = true;
 
-    if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts/2, InVec0, InVec1) &&
-        isHorizontalBinOp(BV, ISD::ADD, NumElts/2, NumElts, InVec2, InVec3) &&
-        InVec0.getNode() == InVec2.getNode() &&
-        InVec1.getNode() == InVec3.getNode())
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.getOpcode() == ISD::UNDEF ||
+          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+        ((InVec1.getOpcode() == ISD::UNDEF ||
+          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
       X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts/2, InVec0, InVec1) &&
-        isHorizontalBinOp(BV, ISD::SUB, NumElts/2, NumElts, InVec2, InVec3) &&
-        InVec0.getNode() == InVec2.getNode() &&
-        InVec1.getNode() == InVec3.getNode())
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+             isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
+                               InVec3) &&
+             ((InVec0.getOpcode() == ISD::UNDEF ||
+               InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+             ((InVec1.getOpcode() == ISD::UNDEF ||
+               InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
       X86Opcode = X86ISD::HSUB;
     else
       CanFold = false;
@@ -6257,29 +6316,45 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
       if (Subtarget->hasAVX2())
         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
 
+      // Do not try to expand this build_vector into a pair of horizontal
+      // add/sub if we can emit a pair of scalar add/sub.
+      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+        return SDValue();
+
       // Convert this build_vector into a pair of horizontal binop followed by
       // a concat vector.
-      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false);
+      bool isUndefLO = NumUndefsLO == Half;
+      bool isUndefHI = NumUndefsHI == Half;
+      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+                                   isUndefLO, isUndefHI);
     }
   }
 
   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget->hasAVX()) {
     unsigned X86Opcode;
-    if (isHorizontalBinOp(BV, ISD::ADD, 0, NumElts, InVec0, InVec1))
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
       X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOp(BV, ISD::SUB, 0, NumElts, InVec0, InVec1))
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
       X86Opcode = X86ISD::HSUB;
-    else if (isHorizontalBinOp(BV, ISD::FADD, 0, NumElts, InVec0, InVec1))
+    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
       X86Opcode = X86ISD::FHADD;
-    else if (isHorizontalBinOp(BV, ISD::FSUB, 0, NumElts, InVec0, InVec1))
+    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();
 
+    // Don't try to expand this build_vector into a pair of horizontal add/sub
+    // if we can simply emit a pair of scalar add/sub.
+    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+      return SDValue();
+
     // Convert this build_vector into two horizontal add/sub followed by
     // a concat vector.
-    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true);
+    bool isUndefLO = NumUndefsLO == Half;
+    bool isUndefHI = NumUndefsHI == Half;
+    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+                                 isUndefLO, isUndefHI);
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
new file mode 100644
index 00000000000..954a9d994e6
--- /dev/null
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -0,0 +1,325 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
+
+define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %a, i32 2
+  %vecext3 = extractelement <4 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+  %vecext10 = extractelement <4 x float> %b, i32 2
+  %vecext11 = extractelement <4 x float> %b, i32 3
+  %add12 = fadd float %vecext10, %vecext11
+  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
+  ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: test1_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext6 = extractelement <4 x float> %b, i32 0
+  %vecext7 = extractelement <4 x float> %b, i32 1
+  %add8 = fadd float %vecext6, %vecext7
+  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
+  %vecext10 = extractelement <4 x float> %b, i32 2
+  %vecext11 = extractelement <4 x float> %b, i32 3
+  %add12 = fadd float %vecext10, %vecext11
+  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+  ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: test2_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %a, i32 2
+  %vecext3 = extractelement <4 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+  %vecext6 = extractelement <4 x float> %b, i32 0
+  %vecext7 = extractelement <4 x float> %b, i32 1
+  %add8 = fadd float %vecext6, %vecext7
+  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+  ret <4 x float> %vecinit9
+}
+; CHECK-LABEL: test3_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  ret <4 x float> %vecinit
+}
+; CHECK-LABEL: test4_undef
+; CHECK-NOT: haddps
+; CHECK: ret
+
+
+define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
+  %vecext = extractelement <2 x double> %a, i32 0
+  %vecext1 = extractelement <2 x double> %a, i32 1
+  %add = fadd double %vecext, %vecext1
+  %vecinit = insertelement <2 x double> undef, double %add, i32 0
+  ret <2 x double> %vecinit
+}
+; CHECK-LABEL: test5_undef
+; CHECK-NOT: haddpd
+; CHECK: ret
+
+
+define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %a, i32 2
+  %vecext3 = extractelement <4 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+  ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test6_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %b, i32 0
+  %vecext1 = extractelement <4 x float> %b, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 2
+  %vecext2 = extractelement <4 x float> %b, i32 2
+  %vecext3 = extractelement <4 x float> %b, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
+  ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test7_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %a, i32 2
+  %vecext3 = extractelement <4 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
+  ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test8_undef
+; CHECK-NOT: haddps
+; CHECK: ret
+
+
+define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
+  %vecext = extractelement <4 x float> %a, i32 0
+  %vecext1 = extractelement <4 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <4 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <4 x float> %b, i32 2
+  %vecext3 = extractelement <4 x float> %b, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
+  ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test9_undef
+; CHECK: haddps
+; CHECK-NEXT: ret
+
+define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <8 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <8 x float> %b, i32 2
+  %vecext3 = extractelement <8 x float> %b, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
+  ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test10_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <8 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <8 x float> %b, i32 4
+  %vecext3 = extractelement <8 x float> %b, i32 5
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
+  ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test11_undef
+; SSE-NOT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK: ret
+
+define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add = fadd float %vecext, %vecext1
+  %vecinit = insertelement <8 x float> undef, float %add, i32 0
+  %vecext2 = extractelement <8 x float> %a, i32 2
+  %vecext3 = extractelement <8 x float> %a, i32 3
+  %add4 = fadd float %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
+  ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test12_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
+  %vecext = extractelement <8 x float> %a, i32 0
+  %vecext1 = extractelement <8 x float> %a, i32 1
+  %add1 = fadd float %vecext, %vecext1
+  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
+  %vecext2 = extractelement <8 x float> %a, i32 2
+  %vecext3 = extractelement <8 x float> %a, i32 3
+  %add2 = fadd float %vecext2, %vecext3
+  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
+  %vecext4 = extractelement <8 x float> %a, i32 4
+  %vecext5 = extractelement <8 x float> %a, i32 5
+  %add3 = fadd float %vecext4, %vecext5
+  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
+  %vecext6 = extractelement <8 x float> %a, i32 6
+  %vecext7 = extractelement <8 x float> %a, i32 7
+  %add4 = fadd float %vecext6, %vecext7
+  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
+  ret <8 x float> %vecinit4
+}
+; CHECK-LABEL: test13_undef
+; SSE: haddps
+; SSE-NOT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %b, i32 2
+  %vecext3 = extractelement <8 x i32> %b, i32 3
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test14_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: phaddd
+; CHECK: ret
+
+; On AVX2, the following sequence can be folded into a single horizontal add.
+; If the Subtarget doesn't support AVX2, then we prefer emitting two scalar
+; adds followed by vector inserts over two packed integer horizontal adds.
+define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %b, i32 4
+  %vecext3 = extractelement <8 x i32> %b, i32 5
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test15_undef
+; SSE-NOT: phaddd
+; AVX-NOT: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add = add i32 %vecext, %vecext1
+  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+  %vecext2 = extractelement <8 x i32> %a, i32 2
+  %vecext3 = extractelement <8 x i32> %a, i32 3
+  %add4 = add i32 %vecext2, %vecext3
+  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
+  ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test16_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
+  %vecext = extractelement <8 x i32> %a, i32 0
+  %vecext1 = extractelement <8 x i32> %a, i32 1
+  %add1 = add i32 %vecext, %vecext1
+  %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
+  %vecext2 = extractelement <8 x i32> %a, i32 2
+  %vecext3 = extractelement <8 x i32> %a, i32 3
+  %add2 = add i32 %vecext2, %vecext3
+  %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
+  %vecext4 = extractelement <8 x i32> %a, i32 4
+  %vecext5 = extractelement <8 x i32> %a, i32 5
+  %add3 = add i32 %vecext4, %vecext5
+  %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
+  %vecext6 = extractelement <8 x i32> %a, i32 6
+  %vecext7 = extractelement <8 x i32> %a, i32 7
+  %add4 = add i32 %vecext6, %vecext7
+  %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
+  ret <8 x i32> %vecinit4
+}
+; CHECK-LABEL: test17_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: haddps
+; CHECK: ret
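
A note on why folds like the one in test1_undef are safe. In AT&T syntax,
"haddps %xmm1, %xmm0" computes:

  { xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm1[0]+xmm1[1], xmm1[2]+xmm1[3] }

The build_vector in test1_undef is <a0+a1, a2+a3, undef, b2+b3>: lanes 0, 1
and 3 agree with the haddps result, and lane 2 is undef, so any value is
acceptable there. The standalone C++ sketch below is only a scalar model of
that lane mapping (it is not part of the patch, and the haddps function is an
illustrative helper, not a compiler API):

  // Scalar model of the SSE3 HADDPS lane mapping.
  #include <array>
  #include <cassert>

  // haddps dst, src -> { dst0+dst1, dst2+dst3, src0+src1, src2+src3 }
  static std::array<float, 4> haddps(const std::array<float, 4> &Dst,
                                     const std::array<float, 4> &Src) {
    return {Dst[0] + Dst[1], Dst[2] + Dst[3],
            Src[0] + Src[1], Src[2] + Src[3]};
  }

  int main() {
    std::array<float, 4> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
    std::array<float, 4> R = haddps(A, B);
    // test1_undef builds <a0+a1, a2+a3, undef, b2+b3>. Lanes 0, 1 and 3
    // match the haddps result; lane 2 (b0+b1) lands on the undef element,
    // where any value is acceptable.
    assert(R[0] == A[0] + A[1]);
    assert(R[1] == A[2] + A[3]);
    assert(R[3] == B[2] + B[3]);
    return 0;
  }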
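
The profitability checks added to PerformBUILD_VECTORCombine can likewise be
modeled in isolation. A minimal sketch under simplified names (mostlyUndef
and preferScalar are illustrative helpers, not LLVM APIs):

  #include <cassert>

  // Reject the combine when all operands are UNDEF, or when all operands
  // but one are UNDEF; mirrors "NumUndefsLO + NumUndefsHI + 1 >= NumElts".
  static bool mostlyUndef(unsigned NumUndefsLO, unsigned NumUndefsHI,
                          unsigned NumElts) {
    return NumUndefsLO + NumUndefsHI + 1 >= NumElts;
  }

  // Skip the 256-bit expansion when one 128-bit half has a single defined
  // element; mirrors "NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half".
  static bool preferScalar(unsigned NumUndefsLO, unsigned NumUndefsHI,
                           unsigned Half) {
    return NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half;
  }

  int main() {
    // A v8i32 build_vector with a single non-UNDEF operand is rejected
    // up front.
    assert(mostlyUndef(4, 3, 8));
    // test15_undef defines one element per 128-bit half (NumUndefsLO = 3,
    // NumUndefsHI = 3, Half = 4), so without AVX2 the expansion into two
    // 128-bit phaddd is skipped in favor of scalar adds plus inserts.
    assert(preferScalar(3, 3, 4));
    return 0;
  }

With a single defined element in a half, a horizontal op would spend a full
vector instruction to produce one useful sum, so a scalar add followed by an
insert is the cheaper lowering; that is why test15_undef only forms vphaddd
when AVX2 makes a single 256-bit op possible.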