From 1fc0f627b142076f3aa39db6cc9288b673e9cdae Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Thu, 6 Aug 2015 08:45:34 +0000 Subject: [PATCH] [X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions. This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo instructions with the same (or exactly opposite) conditions get lowered using a single new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points) when contiguous CMOVs are being lowered. Patch by: kevin.b.smith@intel.com Differential Revision: http://reviews.llvm.org/D11428 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@244202 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 192 +++++++++++++++--- test/CodeGen/X86/pseudo_cmov_lower.ll | 267 +++++++++++++++++++++++++ test/CodeGen/X86/pseudo_cmov_lower1.ll | 39 ++++ test/CodeGen/X86/pseudo_cmov_lower2.ll | 100 +++++++++ 4 files changed, 565 insertions(+), 33 deletions(-) create mode 100644 test/CodeGen/X86/pseudo_cmov_lower.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower1.ll create mode 100644 test/CodeGen/X86/pseudo_cmov_lower2.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b8f132c28f2..482d2a5ecaa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19947,6 +19947,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, return true; } +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -19970,8 +20003,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); - // We also lower double CMOVs: + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). + // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. 
The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // // (CMOV (CMOV F, T, cc1), T, cc2) + // // to two successives branches. For that, we look for another CMOV as the // following instruction. // @@ -20037,19 +20103,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // .LBB5_4: // retq // - MachineInstr *NextCMOV = nullptr; + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); - if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. + while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) - NextCMOV = &*NextMIIt; + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } MachineBasicBlock *jcc1MBB = nullptr; - // If we have a double CMOV, we lower it to two successive branches to + // If we have a cascaded CMOV, we lower it to two successive branches to // the same block. EFLAGS is used by both, so mark it as live in the second. - if (NextCMOV) { + if (CascadedCMOV) { jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, jcc1MBB); jcc1MBB->addLiveIn(X86::EFLAGS); @@ -20064,7 +20153,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -20073,12 +20162,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - if (NextCMOV) { - // The fallthrough block may be jcc1MBB, if we have a double CMOV. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. 
BB->addSuccessor(jcc1MBB); // In that case, jcc1MBB will itself fallthrough the copy0MBB, and @@ -20093,13 +20182,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); - if (NextCMOV) { + if (CascadedCMOV) { unsigned Opc2 = X86::GetCondBranchFromCond( - (X86::CondCode)NextCMOV->getOperand(3).getImm()); + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); } @@ -20111,24 +20199,62 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - MachineInstrBuilder MIB = - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } - // If we have a double CMOV, the second Jcc provides the same incoming + // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). - if (NextCMOV) { + if (CascadedCMOV) { MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. 
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), - DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) .addReg(MI->getOperand(0).getReg()); - NextCMOV->eraseFromParent(); + CascadedCMOV->eraseFromParent(); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Now remove the CMOV(s). + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + return sinkMBB; } @@ -20703,23 +20829,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); - case X86::CMOV_GR8: case X86::CMOV_FR32: case X86::CMOV_FR64: - case X86::CMOV_V4F32: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: - case X86::CMOV_V8F32: + case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: + case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: - case X86::CMOV_GR16: - case X86::CMOV_GR32: - case X86::CMOV_RFP32: - case X86::CMOV_RFP64: - case X86::CMOV_RFP80: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: diff --git a/test/CodeGen/X86/pseudo_cmov_lower.ll b/test/CodeGen/X86/pseudo_cmov_lower.ll new file mode 100644 index 00000000000..c59e3478ff5 --- /dev/null +++ b/test/CodeGen/X86/pseudo_cmov_lower.ll @@ -0,0 +1,267 @@ +; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo1: +; CHECK: js +; CHECK-NOT: js +define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind { +entry: + %cmp = icmp slt i32 %v1, 0 + %v2.v3 = select i1 %cmp, i32 %v2, i32 %v3 + %v1.v2 = select i1 %cmp, i32 %v1, i32 %v2 + %sub = sub i32 %v1.v2, %v2.v3 + ret i32 %sub +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. This makes +; sure the code for the lowering for opposite conditions gets tested. +; CHECK-LABEL: foo11: +; CHECK: js +; CHECK-NOT: js +; CHECK-NOT: jns +define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind { +entry: + %cmp1 = icmp slt i32 %v1, 0 + %v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3 + %cmp2 = icmp sge i32 %v1, 0 + %v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2 + %sub = sub i32 %v1.v2, %v2.v3 + ret i32 %sub +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo2: +; CHECK: js +; CHECK-NOT: js +define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind { +entry: + %cmp = icmp slt i8 %v1, 0 + %v2.v3 = select i1 %cmp, i8 %v2, i8 %v3 + %v1.v2 = select i1 %cmp, i8 %v1, i8 %v2 + %t1 = sext i8 %v2.v3 to i32 + %t2 = sext i8 %v1.v2 to i32 + %sub = sub i32 %t1, %t2 + ret i32 %sub +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. 
+; CHECK-LABEL: foo3: +; CHECK: js +; CHECK-NOT: js +define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind { +entry: + %cmp = icmp slt i16 %v1, 0 + %v2.v3 = select i1 %cmp, i16 %v2, i16 %v3 + %v1.v2 = select i1 %cmp, i16 %v1, i16 %v2 + %t1 = sext i16 %v2.v3 to i32 + %t2 = sext i16 %v1.v2 to i32 + %sub = sub i32 %t1, %t2 + ret i32 %sub +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo4: +; CHECK: js +; CHECK-NOT: js +define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind { +entry: + %cmp = icmp slt i32 %v1, 0 + %t1 = select i1 %cmp, float %v2, float %v3 + %t2 = select i1 %cmp, float %v3, float %v4 + %sub = fsub float %t1, %t2 + ret float %sub +} + +; This test checks that only a single je gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo5: +; CHECK: je +; CHECK-NOT: je +define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind { +entry: + %cmp = icmp eq i32 %v1, 0 + %t1 = select i1 %cmp, double %v2, double %v3 + %t2 = select i1 %cmp, double %v3, double %v4 + %sub = fsub double %t1, %t2 + ret double %sub +} + +; This test checks that only a single je gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo6: +; CHECK: je +; CHECK-NOT: je +define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind { +entry: + %cmp = icmp eq i32 %v1, 0 + %t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3 + %t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4 + %sub = fsub <4 x float> %t1, %t2 + ret <4 x float> %sub +} + +; This test checks that only a single je gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo7: +; CHECK: je +; CHECK-NOT: je +define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind { +entry: + %cmp = icmp eq i32 %v1, 0 + %t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3 + %t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4 + %sub = fsub <2 x double> %t1, %t2 + ret <2 x double> %sub +} + +; This test checks that only a single ja gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. This combines +; all the supported types together into one long string of selects based +; on the same condition. 
+; CHECK-LABEL: foo8: +; CHECK: ja +; CHECK-NOT: ja +define void @foo8(i32 %v1, + i8 %v2, i8 %v3, + i16 %v12, i16 %v13, + i32 %v22, i32 %v23, + float %v32, float %v33, + double %v42, double %v43, + <4 x float> %v52, <4 x float> %v53, + <2 x double> %v62, <2 x double> %v63, + <8 x float> %v72, <8 x float> %v73, + <4 x double> %v82, <4 x double> %v83, + <16 x float> %v92, <16 x float> %v93, + <8 x double> %v102, <8 x double> %v103, + i8 * %dst) nounwind { +entry: + %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2 + %a11 = bitcast i8* %add.ptr11 to i16* + + %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4 + %a21 = bitcast i8* %add.ptr21 to i32* + + %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8 + %a31 = bitcast i8* %add.ptr31 to float* + + %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16 + %a41 = bitcast i8* %add.ptr41 to double* + + %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32 + %a51 = bitcast i8* %add.ptr51 to <4 x float>* + + %add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48 + %a61 = bitcast i8* %add.ptr61 to <2 x double>* + + %add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64 + %a71 = bitcast i8* %add.ptr71 to <8 x float>* + + %add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128 + %a81 = bitcast i8* %add.ptr81 to <4 x double>* + + %add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64 + %a91 = bitcast i8* %add.ptr91 to <16 x float>* + + %add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128 + %a101 = bitcast i8* %add.ptr101 to <8 x double>* + + ; These operations are necessary, because select of two single use loads + ; ends up getting optimized into a select of two leas, followed by a + ; single load of the selected address. + %t13 = xor i16 %v13, 11 + %t23 = xor i32 %v23, 1234 + %t33 = fadd float %v33, %v32 + %t43 = fadd double %v43, %v42 + %t53 = fadd <4 x float> %v53, %v52 + %t63 = fadd <2 x double> %v63, %v62 + %t73 = fsub <8 x float> %v73, %v72 + %t83 = fsub <4 x double> %v83, %v82 + %t93 = fsub <16 x float> %v93, %v92 + %t103 = fsub <8 x double> %v103, %v102 + + %cmp = icmp ugt i32 %v1, 31 + %t11 = select i1 %cmp, i16 %v12, i16 %t13 + %t21 = select i1 %cmp, i32 %v22, i32 %t23 + %t31 = select i1 %cmp, float %v32, float %t33 + %t41 = select i1 %cmp, double %v42, double %t43 + %t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53 + %t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63 + %t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73 + %t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83 + %t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93 + %t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103 + + store i16 %t11, i16* %a11, align 2 + store i32 %t21, i32* %a21, align 4 + store float %t31, float* %a31, align 4 + store double %t41, double* %a41, align 8 + store <4 x float> %t51, <4 x float>* %a51, align 16 + store <2 x double> %t61, <2 x double>* %a61, align 16 + store <8 x float> %t71, <8 x float>* %a71, align 32 + store <4 x double> %t81, <4 x double>* %a81, align 32 + store <16 x float> %t91, <16 x float>* %a91, align 32 + store <8 x double> %t101, <8 x double>* %a101, align 32 + + ret void +} + +; This test checks that only a single ja gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; on the same condition. +; Contrary to my expectations, this doesn't exercise the code for +; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1. 
Instead the selects all +; get lowered into vector length number of selects, which all eventually turn +; into a huge number of CMOV_GR8, which are all contiguous, so the optimization +; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get +; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*1 +; pseudo-opcodes to be generated, this test should be replaced with one that +; tests those opcodes. +; +; CHECK-LABEL: foo9: +; CHECK: ja +; CHECK-NOT: ja +define void @foo9(i32 %v1, + <8 x i1> %v12, <8 x i1> %v13, + <16 x i1> %v22, <16 x i1> %v23, + <32 x i1> %v32, <32 x i1> %v33, + <64 x i1> %v42, <64 x i1> %v43, + i8 * %dst) nounwind { +entry: + %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0 + %a11 = bitcast i8* %add.ptr11 to <8 x i1>* + + %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4 + %a21 = bitcast i8* %add.ptr21 to <16 x i1>* + + %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8 + %a31 = bitcast i8* %add.ptr31 to <32 x i1>* + + %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16 + %a41 = bitcast i8* %add.ptr41 to <64 x i1>* + + ; These operations are necessary, because select of two single use loads + ; ends up getting optimized into a select of two leas, followed by a + ; single load of the selected address. + %t13 = xor <8 x i1> %v13, %v12 + %t23 = xor <16 x i1> %v23, %v22 + %t33 = xor <32 x i1> %v33, %v32 + %t43 = xor <64 x i1> %v43, %v42 + + %cmp = icmp ugt i32 %v1, 31 + %t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13 + %t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23 + %t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33 + %t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43 + + store <8 x i1> %t11, <8 x i1>* %a11, align 16 + store <16 x i1> %t21, <16 x i1>* %a21, align 4 + store <32 x i1> %t31, <32 x i1>* %a31, align 8 + store <64 x i1> %t41, <64 x i1>* %a41, align 16 + + ret void +} diff --git a/test/CodeGen/X86/pseudo_cmov_lower1.ll b/test/CodeGen/X86/pseudo_cmov_lower1.ll new file mode 100644 index 00000000000..4ce131bb864 --- /dev/null +++ b/test/CodeGen/X86/pseudo_cmov_lower1.ll @@ -0,0 +1,39 @@ +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s + +; This test checks that only a single jae gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +; CHECK-LABEL: foo1: +; CHECK: jae +; CHECK-NOT: jae +define double @foo1(float %p1, double %p2, double %p3) nounwind { +entry: + %c1 = fcmp oge float %p1, 0.000000e+00 + %d0 = fadd double %p2, 1.25e0 + %d1 = fadd double %p3, 1.25e0 + %d2 = select i1 %c1, double %d0, double %d1 + %d3 = select i1 %c1, double %d0, double %p2 + %d4 = select i1 %c1, double %p3, double %d1 + %d5 = fsub double %d2, %d3 + %d6 = fadd double %d5, %d4 + ret double %d6 +} + +; This test checks that only a single jae gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. 
+; CHECK-LABEL: foo2: +; CHECK: jae +; CHECK-NOT: jae +define float @foo2(float %p1, float %p2, float %p3) nounwind { +entry: + %c1 = fcmp oge float %p1, 0.000000e+00 + %d0 = fadd float %p2, 1.25e0 + %d1 = fadd float %p3, 1.25e0 + %d2 = select i1 %c1, float %d0, float %d1 + %d3 = select i1 %c1, float %d1, float %p2 + %d4 = select i1 %c1, float %d0, float %p3 + %d5 = fsub float %d2, %d3 + %d6 = fadd float %d5, %d4 + ret float %d6 +} + diff --git a/test/CodeGen/X86/pseudo_cmov_lower2.ll b/test/CodeGen/X86/pseudo_cmov_lower2.ll new file mode 100644 index 00000000000..0133963b36d --- /dev/null +++ b/test/CodeGen/X86/pseudo_cmov_lower2.ll @@ -0,0 +1,100 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s + +; This test checks that only a single jae gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. The tricky part +; of this test is that it tests the special PHI operand rewriting code in +; X86TargetLowering::EmitLoweredSelect. +; +; CHECK-LABEL: foo1: +; CHECK: jae +; CHECK-NOT: jae +define double @foo1(float %p1, double %p2, double %p3) nounwind { +entry: + %c1 = fcmp oge float %p1, 0.000000e+00 + %d0 = fadd double %p2, 1.25e0 + %d1 = fadd double %p3, 1.25e0 + %d2 = select i1 %c1, double %d0, double %d1 + %d3 = select i1 %c1, double %d2, double %p2 + %d4 = select i1 %c1, double %d3, double %p3 + %d5 = fsub double %d2, %d3 + %d6 = fadd double %d5, %d4 + ret double %d6 +} + +; This test checks that only a single jae gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. The tricky part +; of this test is that it tests the special PHI operand rewriting code in +; X86TargetLowering::EmitLoweredSelect. +; +; CHECK-LABEL: foo2: +; CHECK: jae +; CHECK-NOT: jae +define double @foo2(float %p1, double %p2, double %p3) nounwind { +entry: + %c1 = fcmp oge float %p1, 0.000000e+00 + %d0 = fadd double %p2, 1.25e0 + %d1 = fadd double %p3, 1.25e0 + %d2 = select i1 %c1, double %d0, double %d1 + %d3 = select i1 %c1, double %p2, double %d2 + %d4 = select i1 %c1, double %p3, double %d3 + %d5 = fsub double %d2, %d3 + %d6 = fadd double %d5, %d4 + ret double %d6 +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. The tricky part +; of this test is that it tests the special PHI operand rewriting code in +; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all +; the operands of the resulting instructions are from the proper places. +; +; CHECK-LABEL: foo3: +; CHECK: js +; CHECK-NOT: js +; CHECK-LABEL: # BB#1: +; CHECK-DAG: movapd %xmm2, %xmm1 +; CHECK-DAG: movapd %xmm2, %xmm0 +; CHECK-LABEL:.LBB2_2: +; CHECK: divsd %xmm1, %xmm0 +; CHECK: ret +define double @foo3(i32 %p1, double %p2, double %p3, + double %p4, double %p5) nounwind { +entry: + %c1 = icmp slt i32 %p1, 0 + %d2 = select i1 %c1, double %p2, double %p3 + %d3 = select i1 %c1, double %p3, double %p4 + %d4 = select i1 %c1, double %d2, double %d3 + %d5 = fdiv double %d4, %d3 + ret double %d5 +} + +; This test checks that only a single js gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. The tricky part +; of this test is that it tests the special PHI operand rewriting code in +; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all +; the operands of the resulting instructions are from the proper places +; when the "opposite condition" handling code in the compiler is used. 
+; This should be the same code as foo3 above, because we use the opposite +; condition code in the second two selects, but we also swap the operands +; of the selects to give the same actual computation. +; +; CHECK-LABEL: foo4: +; CHECK: js +; CHECK-NOT: js +; CHECK-LABEL: # BB#1: +; CHECK-DAG: movapd %xmm2, %xmm1 +; CHECK-DAG: movapd %xmm2, %xmm0 +; CHECK-LABEL:.LBB3_2: +; CHECK: divsd %xmm1, %xmm0 +; CHECK: ret +define double @foo4(i32 %p1, double %p2, double %p3, + double %p4, double %p5) nounwind { +entry: + %c1 = icmp slt i32 %p1, 0 + %d2 = select i1 %c1, double %p2, double %p3 + %c2 = icmp sge i32 %p1, 0 + %d3 = select i1 %c2, double %p4, double %p3 + %d4 = select i1 %c2, double %d3, double %d2 + %d5 = fdiv double %d4, %d3 + ret double %d5 +} -- 2.34.1
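As a quick way to see the new lowering in action outside the committed tests, here is a minimal sketch (not part of the patch; the function name, RUN line, and CHECK lines are illustrative and simply mirror the style of pseudo_cmov_lower.ll and pseudo_cmov_lower2.ll). It feeds the result of one select into the next select on the same condition, so the two CMOV_GR32 pseudos are contiguous and the second PHI must be rewritten to use the first select's true input rather than the first PHI's result:

; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s
;
; Two contiguous CMOV_GR32 pseudos on the same condition should lower to a
; single conditional jump plus two PHIs in the join block; on the
; fall-through path the second PHI must not refer to the first PHI's result.
; CHECK-LABEL: sketch:
; CHECK: js
; CHECK-NOT: js
define i32 @sketch(i32 %v1, i32 %v2, i32 %v3) nounwind {
entry:
  %cmp = icmp slt i32 %v1, 0
  %a = select i1 %cmp, i32 %v2, i32 %v3
  ; %b uses %a, exercising the PHI operand rewriting described in case 1
  %b = select i1 %cmp, i32 %a, i32 %v1
  %sum = add i32 %a, %b
  ret i32 %sum
}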