From: Dan Gohman Date: Sat, 15 Aug 2009 01:38:56 +0000 (+0000) Subject: On x86-64, for a varargs function, don't store the xmm registers to X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=d6708eade079c30b0790789a00a8d737d84f52b7;p=oota-llvm.git On x86-64, for a varargs function, don't store the xmm registers to the register save area if %al is 0. This avoids touching xmm registers when they aren't actually used. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@79061 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e8f4f43dc3d..15436310207 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1527,37 +1527,44 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(VarArgsGPOffset)); + unsigned Offset = VarArgsGPOffset; for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, + DAG.getIntPtrConstant(Offset)); unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], X86::GR64RegisterClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); + PseudoSourceValue::getFixedStack(RegSaveFrameIndex), + Offset); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getIntPtrConstant(8)); + Offset += 8; } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + // Now store the XMM (fp + vector) parameter registers. 
- FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(VarArgsFPOffset)); + SmallVector SaveXMMOps; + SaveXMMOps.push_back(Chain); + + unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); + SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); + SaveXMMOps.push_back(ALVal); + + SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex)); + SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset)); + for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], X86::VR128RegisterClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getIntPtrConstant(16)); + SaveXMMOps.push_back(Val); } - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, + &SaveXMMOps[0], SaveXMMOps.size()); } } @@ -7090,6 +7097,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; } } @@ -7513,7 +7521,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, F->insert(MBBIter, newMBB); F->insert(MBBIter, nextMBB); - // Move all successors to thisMBB to nextMBB + // Move all successors of thisMBB to nextMBB nextMBB->transferSuccessors(thisMBB); // Update thisMBB to fall through to newMBB @@ -7585,6 +7593,73 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, return nextMBB; } +MachineBasicBlock * +X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr 
*MI, + MachineBasicBlock *MBB) const { + // Emit code to save XMM registers to the stack. The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *F = MBB->getParent(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, XMMSaveMBB); + F->insert(MBBIter, EndMBB); + + // Set up the CFG. + // Move any original successors of MBB to the end block. + EndMBB->transferSuccessors(MBB); + // The original block will now fall through to the XMM save block. + MBB->addSuccessor(XMMSaveMBB); + // The XMMSaveMBB will fall through to the end block. + XMMSaveMBB->addSuccessor(EndMBB); + + // Now add the instructions. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned CountReg = MI->getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + + if (!Subtarget->isTargetWin64()) { + // If %al is 0, branch around the XMM save block. + BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); + BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + } + + // In the XMM save block, save all the XMM argument registers. 
+ for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { + int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; + BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI->getOperand(i).getReg()) + .addMemOperand(MachineMemOperand( + PseudoSourceValue::getFixedStack(RegSaveFrameIndex), + MachineMemOperand::MOStore, Offset, + /*Size=*/16, /*Align=*/16)); + } + + F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + + return EndMBB; +} MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, @@ -7888,6 +7963,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::MOV32rr, X86::MOV32rr, X86::MOV32ri, X86::MOV32ri, false); + case X86::VASTART_SAVE_XMM_REGS: + return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9e6cd819e16..3ac6e51bbb0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -243,7 +243,12 @@ namespace llvm { MUL_IMM, // PTEST - Vector bitwise comparisons - PTEST + PTEST, + + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, + // according to %al. An operator is needed so that this can be expanded + // with control flow. + VASTART_SAVE_XMM_REGS }; } @@ -715,6 +720,11 @@ namespace llvm { MachineBasicBlock *BB, unsigned cmovOpc) const; + /// Utility function to emit the xmm reg save portion of va_start. + MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB) const; + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. 
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index ecb1b208f20..f13102640a9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -56,6 +56,10 @@ def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; def SDTX86RdTsc : SDTypeProfile<0, 0, []>; @@ -114,6 +118,11 @@ def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInFlag]>; +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain]>; + def X86callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, [SDNPHasChain, SDNPOutFlag]>; @@ -511,6 +520,18 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), Requires<[In32BitMode]>; } +// x86-64 va_start lowering magic. +let usesCustomDAGSchedInserter = 1 in +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset)]>; + // Nop let neverHasSideEffects = 1 in { def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; diff --git a/test/CodeGen/X86/stdarg.ll b/test/CodeGen/X86/stdarg.ll new file mode 100644 index 00000000000..7207057729f --- /dev/null +++ b/test/CodeGen/X86/stdarg.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as < %s | llc -march=x86-64 | grep {testb \[%\]al, \[%\]al} + +%struct.__va_list_tag = type { i32, i32, i8*, i8* } + +define void @foo(i32 %x, ...) 
nounwind { +entry: + %ap = alloca [1 x %struct.__va_list_tag], align 8; <[1 x %struct.__va_list_tag]*> [#uses=2] + %ap12 = bitcast [1 x %struct.__va_list_tag]* %ap to i8*; [#uses=2] + call void @llvm.va_start(i8* %ap12) + %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1] + call void @bar(%struct.__va_list_tag* %ap3) nounwind + call void @llvm.va_end(i8* %ap12) + ret void +} + +declare void @llvm.va_start(i8*) nounwind + +declare void @bar(%struct.__va_list_tag*) + +declare void @llvm.va_end(i8*) nounwind