From: Matt Arsenault Date: Tue, 10 Jun 2014 19:18:24 +0000 (+0000) Subject: R600/SI: Implement i64 ctpop X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=69891c0115542d191a43023f60eca4e0dfd8dbcb;p=oota-llvm.git R600/SI: Implement i64 ctpop git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@210568 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 48ce7d8b7ea..d05d71add82 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -212,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } setOperationAction(ISD::CTPOP, MVT::i32, Legal); + setOperationAction(ISD::CTPOP, MVT::i64, Legal); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index fdebb2ffb4d..5436fc0e846 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -1178,6 +1178,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->eraseFromParent(); continue; + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFE_I64: case AMDGPU::S_BFM_B64: @@ -1419,6 +1424,46 @@ void SIInstrInfo::splitScalar64BitBinaryOp( Worklist.push_back(HiHalf); } +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, MachineInstr *Inst) const { // Add the implict and explicit register definitions. diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index a9b014f8805..a9de2d7635d 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -50,6 +50,9 @@ private: void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, MachineInstr *Inst, unsigned Opcode) const; + void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 3368d49ab4d..77ef19084dd 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -187,6 +187,12 @@ class SOP1_64 op, string opName, list pattern> : SOP1 < opName#" $dst, $src0", pattern >; +// 64-bit input, 32-bit output. +class SOP1_32_64 op, string opName, list pattern> : SOP1 < + op, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + class SOP2_32 op, string opName, list pattern> : SOP2 < op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 6525b49f3a5..06b09fedfc4 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -110,7 +110,8 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32", [(set i32:$dst, (ctpop i32:$src0))] >; -////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>; + ////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; ////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; ////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; @@ -2515,6 +2516,13 @@ def : Pat < (V_BCNT_U32_B32_e32 $popcnt, $val) >; +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_BCNT1_I32_B64 $src), sub0), + (S_MOV_B32 0), sub1) +>; + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll new file mode 100644 index 00000000000..798d8f55ecc --- /dev/null +++ b/test/CodeGen/R600/ctpop64.ll @@ -0,0 +1,91 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone +declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone + +; FUNC-LABEL: @s_ctpop_i64: +; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]], +; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]] +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: BUFFER_STORE_DWORD [[VRESULT]], +; SI: S_ENDPGM +define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %truncctpop = trunc i64 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_ctpop_i64: +; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0 +; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]] +; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; SI: BUFFER_STORE_DWORD [[RESULT]], +; SI: S_ENDPGM +define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %val = load i64 addrspace(1)* %in, align 8 + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %truncctpop = trunc i64 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @s_ctpop_v2i64: +; SI: S_BCNT1_I32_B64 +; SI: S_BCNT1_I32_B64 +; SI: S_ENDPGM +define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @s_ctpop_v4i64: +; SI: S_BCNT1_I32_B64 +; SI: S_BCNT1_I32_B64 +; SI: S_BCNT1_I32_B64 +; SI: S_BCNT1_I32_B64 +; SI: S_ENDPGM +define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @v_ctpop_v2i64: +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: S_ENDPGM +define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { + %val = load <2 x i64> addrspace(1)* %in, align 16 + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @v_ctpop_v4i64: +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: V_BCNT_U32_B32 +; SI: S_ENDPGM +define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { + %val = load <4 x i64> addrspace(1)* %in, align 32 + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +}