From 27744344f8f1fb924b1ee7f00f008702c0f07b86 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 12 May 2015 14:18:14 +0000 Subject: [PATCH] R600/SI: Remove explicit m0 operand from s_sendmsg Instead add m0 as an implicit operand. This allows us to avoid using the M0Reg register class and eliminates a number of unnecessary spills when using s_sendmsg instructions. This impacts one shader in the shader-db: SGPRS: 48 -> 40 (-16.67 %) VGPRS: 112 -> 108 (-3.57 %) Code Size: 40132 -> 38796 (-3.33 %) bytes LDS: 0 -> 0 (0.00 %) blocks Scratch: 2048 -> 0 (-100.00 %) bytes per wave git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@237133 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelLowering.cpp | 1 + lib/Target/R600/AMDGPUISelLowering.h | 1 + lib/Target/R600/AMDGPUInstrInfo.td | 4 ++++ lib/Target/R600/SIISelLowering.cpp | 25 ++++++++++++++++++++++++- lib/Target/R600/SIISelLowering.h | 1 + lib/Target/R600/SIInstructions.td | 12 +++++------- 6 files changed, 36 insertions(+), 8 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 9c987c28d7f..ceb7e151959 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -2677,6 +2677,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index fe657d5d731..a6bfbbd1572 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -288,6 +288,7 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, + SENDMSG, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index d72cb1d7f8c..e251cb48e0d 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -219,6 +219,10 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + //===----------------------------------------------------------------------===// // Flow Control Profile Types //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 6ff2a9689e0..56214cd07dc 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -838,6 +838,23 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); } +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + SDValue V) const { + // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, + // so we will end up with redundant moves to m0. + // + // We can't use S_MOV_B32, because there is no way to specify m0 as the + // destination register. + // + // We have to use them both. Machine cse will combine all the S_MOV_B32 + // instructions and the register coalescer eliminate the extra copies. + SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue + // A Null SDValue creates + // a glue result. +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -940,12 +957,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } case AMDGPUIntrinsic::SI_tbuffer_store: { - SDLoc DL(Op); SDValue Ops[] = { Chain, Op.getOperand(2), diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index a6bc7c607fa..a95354c3881 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -117,6 +117,7 @@ public: std::pair getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, const std::string &Constraint, MVT VT) const override; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 838b645f2ba..0449ba8cd18 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -488,13 +488,11 @@ def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; -let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16", - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] - > { - let DisableEncoding = "$m0"; - } -} // End Uses = [EXEC] +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; -- 2.34.1