From: Tom Stellard Date: Fri, 28 Jun 2013 15:46:59 +0000 (+0000) Subject: R600: Add support for GROUP_BARRIER instruction X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=cedcfee405a22b245e869abe8609f094df34085a;p=oota-llvm.git R600: Add support for GROUP_BARRIER instruction Reviewed-by: Vincent Lejeune git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185161 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index eecb25b04f7..9f975bf9bf4 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -50,6 +50,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; } let TargetPrefix = "TGSI", isTarget = 1 in { diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index ff5ce5a2dd7..0aea2d7c030 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -177,7 +177,14 @@ private: AluInstCount ++; continue; } - if (I->getOpcode() == AMDGPU::KILLGT) { + // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: + // + // * KILL or INTERP instructions + // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits + // * Uses waterfalling (i.e. INDEX_MODE = AR.X) + // + // XXX: These checks have not been implemented yet. 
+ if (TII->mustBeLastInClause(I->getOpcode())) { I++; break; } diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index f267ee92cd9..3b1a24015a8 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -163,6 +163,16 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { usesTextureCache(MI->getOpcode()); } +bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::KILLGT: + case AMDGPU::GROUP_BARRIER: + return true; + default: + return false; + } +} + SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr *MI) const { SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index f06abf6081c..3c2e50be662 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -72,6 +72,8 @@ namespace llvm { bool usesTextureCache(unsigned Opcode) const; bool usesTextureCache(const MachineInstr *MI) const; + bool mustBeLastInClause(unsigned Opcode) const; + /// \returns a pair for each src of an ALU instructions. /// The first member of a pair is the register id. /// If register is ALU_CONST, second member is SEL. 
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index b0a82ff628c..f42501a2fd0 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1499,6 +1499,36 @@ let hasSideEffects = 1 in { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; +def GROUP_BARRIER : InstR600 < + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>, + R600ALU_Word0, + R600ALU_Word1_OP2 <0x54> { + + let dst = 0; + let dst_rel = 0; + let src0 = 0; + let src0_rel = 0; + let src0_neg = 0; + let src0_abs = 0; + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let write = 0; + let omod = 0; + let clamp = 0; + let last = 1; + let bank_swizzle = 0; + let pred_sel = 0; + let update_exec_mask = 0; + let update_pred = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; +} + // TRUNC is used for the FLT_TO_INT instructions to work around a // perceived problem where the rounding modes are applied differently // depending on the instruction and the slot they are in. diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index a330d885743..acc1b4d6ee3 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -269,10 +269,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Does the instruction take a whole IG ? + // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? if(TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode())) + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { return AluT_XYZW; + } // Is the result already assigned to a channel ? 
unsigned DestSubReg = MI->getOperand(0).getSubReg(); diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 6024fd5c85f..4c72d229675 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -82,7 +82,11 @@ private: int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) continue; - unsigned Dst = BI->getOperand(0).getReg(); + int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + if (DstIdx == -1) { + continue; + } + unsigned Dst = BI->getOperand(DstIdx).getReg(); if (BI->getOpcode() == AMDGPU::DOT4_r600 || BI->getOpcode() == AMDGPU::DOT4_eg) { Result[Dst] = AMDGPU::PV_X; @@ -154,6 +158,8 @@ public: return true; if (TII->isTransOnly(MI)) return true; + if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + return true; return false; } diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll new file mode 100644 index 00000000000..8d3c9ca2230 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: GROUP_BARRIER + +define void @test(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.local() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32 addrspace(1)* %out, i32 %4 + %6 = load i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone }