From: Victor Umansky Date: Thu, 5 Jan 2012 08:46:19 +0000 (+0000) Subject: Peephole optimization of ptest-conditioned branch in X86 arch. Performs instruction... X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=19d8559019b75edfd7f5b05ffa266bc278127854;p=oota-llvm.git Peephole optimization of ptest-conditioned branch in X86 arch. Performs instruction combining of sequences generated by ptestz/ptestc intrinsics to ptest+jcc pair for SSE and AVX. Testing: passed 'make check' including LIT tests for all sequences being handled (both SSE and AVX) Reviewers: Evan Cheng, David Blaikie, Bruno Lopes, Elena Demikhovsky, Chad Rosier, Anton Korobeynikov git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@147601 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 16c25f5165b..58bdc77ccdb 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -14611,6 +14611,146 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, return OptimizeConditionalInDecrement(N, DAG); } +// Helper which returns index of constant operand of a two-operand node. +static inline int GetConstOpIndexFor2OpNode(SDValue Op) { + if (isa(Op.getOperand(0))) + return 0; + if (isa(Op.getOperand(1))) + return 1; + return -1; +} + +SDValue X86TargetLowering::PerformBrcondCombine(SDNode* N, SelectionDAG &DAG, + DAGCombinerInfo &DCI) const { + // Simplification of the PTEST-and-BRANCH pattern. + // + // The LLVM IR patterns targeted are: + // %res = call i32 @llvm.x86.(...) + // %one = icmp {ne|eq} i32 %res, {0|1} + // br i1 %one, label %bb1, label %bb2 + // and + // %res = call i32 @llvm.x86.(...) + // %one = trunc i32 %res to i1 + // br i1 %one, label %bb1, label %bb2 + // where is one of: + // sse41.ptestz + // sse41.ptestc + // avx.ptestz.256 + // avx.ptestc.256 + // + // The simplification is in folding of the following SDNode sequence: + // X86ISD::PTEST + // {X86ISD::SETCC | X86ISD::SETCC_CARRY} + // [ISD::ZERO_EXTEND][[[ISD::AND,]ISD::TRUNCATE,]ISD::AND] + // X86ISD::CMP + // X86ISD::BRCOND(cond) + // to the code sequence: + // X86ISD::PTEST + // X86ISD::BRCOND(!cond) + + // The optimization is relevant only once the DAG contains x86 ISA (i.e. after + // operation legalization). + if (DCI.isBeforeLegalize() || DCI.isBeforeLegalizeOps() || DCI.isCalledByLegalizer()) + return SDValue(); + + // Below we iterate through DAG upwards, starting from BRCOND node and finishing + // at PTEST node. We stop the iteration once we cannot find match with any of + // the patterns which we are able to simplify. + + // Indices for constant and variable operands in two-operand nodes + int ConstOpIdx; + unsigned int VarOpIdx; + + // Validate that we're starting from the BRCOND node. + assert(N->getOpcode() == X86ISD::BRCOND && "Should start from conditional branch!"); + // Check that the BRCOND condition is ZF. + if (!isa(N->getOperand(2))) + return SDValue(); + uint64_t BranchCond = N->getConstantOperandVal(2); + if (BranchCond != X86::COND_NE && BranchCond != X86::COND_E) + return SDValue(); + + // 1st step upwards: verify CMP use. + SDValue CmpValue = N->getOperand(3); + if (CmpValue.getOpcode() != X86ISD::CMP) + return SDValue(); + // Check that the CMP comparison is with 0. + if ((ConstOpIdx = GetConstOpIndexFor2OpNode(CmpValue)) == -1) + return SDValue(); + VarOpIdx = (ConstOpIdx == 0)? 1:0; + uint64_t CompareWith = CmpValue.getConstantOperandVal((unsigned int)ConstOpIdx); + if (CompareWith != 0 && CompareWith != 1) + return SDValue(); + + // 2rd step upwards: cover alternative paths between pre-BRCOND CMP and PTEST + // return value analysis. + + SDValue SVOp = CmpValue.getOperand(VarOpIdx); + // Verify optional AND use. + if (SVOp.getOpcode() == ISD::AND) { + // Check that the AND is with 0x1. + if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1) + return SDValue(); + VarOpIdx = (ConstOpIdx == 0)? 1:0; + if (SVOp.getConstantOperandVal((unsigned int)ConstOpIdx) != 1) + return SDValue(); + // Step upwards: verify optional TRUNCATE use. + SVOp = SVOp.getOperand(VarOpIdx); + if (SVOp.getOpcode() == ISD::TRUNCATE) { + // Step upwards: verify optional AND or ZERO_EXTEND use. + SVOp = SVOp.getOperand(0); + if (SVOp.getOpcode() == ISD::AND) { + // Check that the AND is with 0x1. + if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1) + return SDValue(); + VarOpIdx = (ConstOpIdx == 0)? 1:0; + if (SVOp.getConstantOperandVal((unsigned int)ConstOpIdx) != 1) + return SDValue(); + // Step upwards. + SVOp = SVOp.getOperand(VarOpIdx); + } + } + } + // Verify optional ZERO_EXTEND use + if (SVOp.getOpcode() == ISD::ZERO_EXTEND) { + // Step upwards. + SVOp = SVOp.getOperand(0); + } + + // 3rd step upwards: verify SETCC or SETCC_CARRY use. + unsigned SetCcOP = SVOp.getOpcode(); + if (SetCcOP != X86ISD::SETCC && SetCcOP != X86ISD::SETCC_CARRY) + return SDValue(); + // Check that the SETCC/SETCC_CARRY flag is 'COND_E' (for ptestz) or 'COND_B' (for ptestc) + if ((ConstOpIdx = GetConstOpIndexFor2OpNode(SVOp)) == -1) + return SDValue(); + VarOpIdx = (ConstOpIdx == 0)? 1:0; + uint64_t SetCond = SVOp.getConstantOperandVal((unsigned int)ConstOpIdx); + if (SetCond != X86::COND_E && SetCond != X86::COND_B) + return SDValue(); + + // 4th step upwards: verify PTEST use. + SDValue PtestValue = SVOp.getOperand(VarOpIdx); + if (PtestValue.getOpcode() != X86ISD::PTEST) + return SDValue(); + + // The chain to be folded is recognized. We can fold it now. + + // At first - select the branch condition. + SDValue CC = DAG.getConstant(SetCond, MVT::i8); + if ((CompareWith == 1 && BranchCond == X86::COND_NE) || + (CompareWith == 0 && BranchCond == X86::COND_E)) { + // Invert branch condition. + CC = (SetCond == X86::COND_E? DAG.getConstant(X86::COND_NE, MVT::i8): + DAG.getConstant(X86::COND_AE, MVT::i8)); + } + // Then - update the BRCOND node. + // Resno is set to 0 as X86ISD::BRCOND has single return value. + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), + CC, PtestValue), 0); + +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -14657,6 +14797,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMILP: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); + case X86ISD::BRCOND: return PerformBrcondCombine(N, DAG, DCI); } return SDValue(); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 574af58ffde..ee7720a5976 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -836,6 +836,7 @@ namespace llvm { SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue PerformBrcondCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const; // Utility functions to help LowerVECTOR_SHUFFLE SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; diff --git a/test/CodeGen/X86/avx-brcond.ll b/test/CodeGen/X86/avx-brcond.ll new file mode 100755 index 00000000000..cecacc25bd0 --- /dev/null +++ b/test/CodeGen/X86/avx-brcond.ll @@ -0,0 +1,244 @@ +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +declare i32 @llvm.x86.avx.ptestz.256(<4 x i64> %p1, <4 x i64> %p2) nounwind +declare i32 @llvm.x86.avx.ptestc.256(<4 x i64> %p1, <4 x i64> %p2) nounwind + +define <4 x float> @test1(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test1: +; CHECK: vptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp ne i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test2(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test2: +; CHECK: vptest +; CHECK-NEXT: je +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp eq i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test3(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test3: +; CHECK: vptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = trunc i32 %res to i1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test4(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test4: +; CHECK: vptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp ne i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test5(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test5: +; CHECK: vptest +; CHECK-NEXT: jb +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp eq i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test6(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test6: +; CHECK: vptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = trunc i32 %res to i1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test7(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test7: +; CHECK: vptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp eq i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test8(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test8: +; CHECK: vptest +; CHECK-NEXT: je +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp ne i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test9(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test9: +; CHECK: vptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp eq i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test10(<4 x i64> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test10: +; CHECK: vptest +; CHECK-NEXT: jb +; CHECK: ret + + %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a, <4 x i64> %a) nounwind + %one = icmp ne i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} diff --git a/test/CodeGen/X86/brcond.ll b/test/CodeGen/X86/brcond.ll index 5cdc1000f3c..ff3b2e45ee6 100644 --- a/test/CodeGen/X86/brcond.ll +++ b/test/CodeGen/X86/brcond.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=penryn | FileCheck %s + ; rdar://7475489 define i32 @test1(i32 %a, i32 %b) nounwind ssp { @@ -106,3 +107,246 @@ bb2: ; preds = %entry, %bb1 %.0 = fptrunc double %.0.in to float ; [#uses=1] ret float %.0 } + +declare i32 @llvm.x86.sse41.ptestz(<4 x float> %p1, <4 x float> %p2) nounwind +declare i32 @llvm.x86.sse41.ptestc(<4 x float> %p1, <4 x float> %p2) nounwind + +define <4 x float> @test5(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test5: +; CHECK: ptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp ne i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test6(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test6: +; CHECK: ptest +; CHECK-NEXT: je +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp eq i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test7(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test7: +; CHECK: ptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind + %one = trunc i32 %res to i1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test8(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test8: +; CHECK: ptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp ne i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test9(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test9: +; CHECK: ptest +; CHECK-NEXT: jb +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp eq i32 %res, 0 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test10(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test10: +; CHECK: ptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind + %one = trunc i32 %res to i1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test11(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test11: +; CHECK: ptest +; CHECK-NEXT: jne +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp eq i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test12(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test12: +; CHECK: ptest +; CHECK-NEXT: je +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestz(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp ne i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test13(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test13: +; CHECK: ptest +; CHECK-NEXT: jae +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp eq i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +} + +define <4 x float> @test14(<4 x float> %a, <4 x float> %b) nounwind { +entry: +; CHECK: test14: +; CHECK: ptest +; CHECK-NEXT: jb +; CHECK: ret + + %res = call i32 @llvm.x86.sse41.ptestc(<4 x float> %a, <4 x float> %a) nounwind + %one = icmp ne i32 %res, 1 + br i1 %one, label %bb1, label %bb2 + +bb1: + %c = fadd <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +bb2: + %d = fdiv <4 x float> %b, < float 1.000000e+002, float 2.000000e+002, float 3.000000e+002, float 4.000000e+002 > + br label %return + +return: + %e = phi <4 x float> [%c, %bb1], [%d, %bb2] + ret <4 x float> %e +}