From c9c70b1651ddeab278429889c7b625650eb3342e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 6 Aug 2014 00:29:43 +0000 Subject: [PATCH] R600/SI: Implement areLoadsFromSameBasePtr This currently has a noticeable effect on the kernel argument loads. LDS and global loads are more problematic, I think because of how copies are currently inserted to ensure that the address is a VGPR. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214942 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/SIInstrInfo.cpp | 98 +++++++++++++++++++ lib/Target/R600/SIInstrInfo.h | 4 + test/CodeGen/R600/address-space.ll | 6 +- test/CodeGen/R600/atomic_cmp_swap_local.ll | 4 +- test/CodeGen/R600/ctpop64.ll | 2 +- test/CodeGen/R600/cvt_f32_ubyte.ll | 6 +- test/CodeGen/R600/extract_vector_elt_i16.ll | 4 +- test/CodeGen/R600/fcopysign.f32.ll | 2 +- test/CodeGen/R600/fcopysign.f64.ll | 4 +- test/CodeGen/R600/llvm.memcpy.ll | 47 ++++----- test/CodeGen/R600/mubuf.ll | 4 +- test/CodeGen/R600/rotl.i64.ll | 2 +- test/CodeGen/R600/schedule-global-loads.ll | 26 +++++ .../CodeGen/R600/schedule-kernel-arg-loads.ll | 12 +++ test/CodeGen/R600/trunc.ll | 9 +- test/CodeGen/R600/wait.ll | 60 +++++++----- 16 files changed, 220 insertions(+), 70 deletions(-) create mode 100644 test/CodeGen/R600/schedule-global-loads.ll create mode 100644 test/CodeGen/R600/schedule-kernel-arg-loads.ll diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 5511fb74e42..28a60aaef1e 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -32,6 +32,104 @@ SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) // TargetInstrInfo callbacks //===----------------------------------------------------------------------===// +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue 
LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // TODO: Also shouldn't see read2st + assert(Opc0 != AMDGPU::DS_READ2_B32 && + Opc0 != AMDGPU::DS_READ2_B64 && + Opc1 != AMDGPU::DS_READ2_B32 && + Opc1 != AMDGPU::DS_READ2_B64); + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + // Skip if an SGPR offset is applied. I don't think we ever emit any of + // the variants that use this currently. Check both opcodes, since either + // load may be the one that carries an soffset operand. 
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::soffset) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::soffset) != -1) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index; hence the "- 1" applied to every index below. + // + // MUBUF and MTBUF have vaddr at different indices. + int VaddrIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::vaddr) - 1; + int VaddrIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::vaddr) - 1; + if (Load0->getOperand(VaddrIdx0) != Load1->getOperand(VaddrIdx1)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset) - 1; + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset) - 1; + Offset0 = cast<ConstantSDNode>(Load0->getOperand(OffIdx0))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(OffIdx1))->getZExtValue(); + return true; + } + + return false; +} + bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 9564d4e9a5e..f1060637a10 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -62,6 +62,10 @@ public: return RI; } + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const final; diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll index f75a8ac5e6a..7f52472c384 100644 --- a/test/CodeGen/R600/address-space.ll +++ b/test/CodeGen/R600/address-space.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck 
%s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s ; Test that codegenprepare understands address space sizes @@ -10,8 +10,8 @@ ; CHECK-LABEL: @do_as_ptr_calcs: ; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]], ; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]] -; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14 -; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc +; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0xc +; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0x14 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll index eb9539eec51..b04874bfcf1 100644 --- a/test/CodeGen/R600/atomic_cmp_swap_local.ll +++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset: -; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc ; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc ; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7 ; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] @@ -17,8 +17,8 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs } ; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset: -; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; SI: S_MOV_B64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7 ; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]] ; SI-DAG: V_MOV_B32_e32 
v[[HIVCMP:[0-9]+]], s[[HISCMP]] diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll index b36ecc68d89..37a174f2ec7 100644 --- a/test/CodeGen/R600/ctpop64.ll +++ b/test/CodeGen/R600/ctpop64.ll @@ -7,7 +7,7 @@ declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone ; FUNC-LABEL: @s_ctpop_i64: -; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]], +; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]] ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll index 06a601065c3..3e667b1822f 100644 --- a/test/CodeGen/R600/cvt_f32_ubyte.ll +++ b/test/CodeGen/R600/cvt_f32_ubyte.ll @@ -68,13 +68,13 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> ; SI-LABEL: @load_v4i8_to_v4f32_2_uses: ; SI: BUFFER_LOAD_UBYTE -; SI: V_CVT_F32_UBYTE0_e32 ; SI: BUFFER_LOAD_UBYTE -; SI: V_CVT_F32_UBYTE0_e32 ; SI: BUFFER_LOAD_UBYTE -; SI: V_CVT_F32_UBYTE0_e32 ; SI: BUFFER_LOAD_UBYTE ; SI: V_CVT_F32_UBYTE0_e32 +; SI: V_CVT_F32_UBYTE0_e32 +; SI: V_CVT_F32_UBYTE0_e32 +; SI: V_CVT_F32_UBYTE0_e32 ; XXX - replace with this when v4i8 loads aren't scalarized anymore. 
; XSI: BUFFER_LOAD_DWORD diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll index 5cd1b04bd1d..26b8f2c9320 100644 --- a/test/CodeGen/R600/extract_vector_elt_i16.ll +++ b/test/CodeGen/R600/extract_vector_elt_i16.ll @@ -2,9 +2,9 @@ ; FUNC-LABEL: @extract_vector_elt_v2i16 ; SI: BUFFER_LOAD_USHORT -; SI: BUFFER_STORE_SHORT ; SI: BUFFER_LOAD_USHORT ; SI: BUFFER_STORE_SHORT +; SI: BUFFER_STORE_SHORT define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { %p0 = extractelement <2 x i16> %foo, i32 0 %p1 = extractelement <2 x i16> %foo, i32 1 @@ -16,9 +16,9 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) no ; FUNC-LABEL: @extract_vector_elt_v4i16 ; SI: BUFFER_LOAD_USHORT -; SI: BUFFER_STORE_SHORT ; SI: BUFFER_LOAD_USHORT ; SI: BUFFER_STORE_SHORT +; SI: BUFFER_STORE_SHORT define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll index 7b4425bed72..26ddd0998ea 100644 --- a/test/CodeGen/R600/fcopysign.f32.ll +++ b/test/CodeGen/R600/fcopysign.f32.ll @@ -8,9 +8,9 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read ; Try to identify arg based on higher address. 
; FUNC-LABEL: @test_copysign_f32: +; SI: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb ; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc ; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb ; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]] ; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff ; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll index ea7a6db67f3..c72329ca012 100644 --- a/test/CodeGen/R600/fcopysign.f64.ll +++ b/test/CodeGen/R600/fcopysign.f64.ll @@ -5,9 +5,9 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone ; FUNC-LABEL: @test_copysign_f64: -; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] ; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] ; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] ; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff ; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll index 6e9a51ef4dd..cd8b532a792 100644 --- a/test/CodeGen/R600/llvm.memcpy.ll +++ b/test/CodeGen/R600/llvm.memcpy.ll @@ -40,37 +40,37 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace ; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: 
DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 - ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 -; SI: DS_WRITE_B8 ; SI: DS_READ_U8 + +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 +; SI: DS_WRITE_B8 ; SI: DS_WRITE_B8 ; SI: S_ENDPGM @@ -100,20 +100,21 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias % ; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 -; SI: DS_WRITE_B16 ; SI: DS_READ_U16 + +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 +; SI: DS_WRITE_B16 ; SI: DS_WRITE_B16 ; SI: S_ENDPGM diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll index 75b1b6d0dc0..b978b4dde88 100644 --- a/test/CodeGen/R600/mubuf.ll +++ b/test/CodeGen/R600/mubuf.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s +; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s ;;;==========================================================================;;; ;;; MUBUF LOAD TESTS @@ -28,7 +28,7 @@ entry: ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits ; CHECK-LABEL: @mubuf_load2 -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80 +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, 
v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32 addrspace(1)* %in, i64 1024 diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll index bda0b6694a8..cf4a40944a2 100644 --- a/test/CodeGen/R600/rotl.i64.ll +++ b/test/CodeGen/R600/rotl.i64.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: @s_rotl_i64: -; SI: S_LSHL_B64 ; SI: S_SUB_I32 ; SI: S_LSHR_B64 +; SI: S_LSHL_B64 ; SI: S_OR_B64 define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll new file mode 100644 index 00000000000..f73d3030564 --- /dev/null +++ b/test/CodeGen/R600/schedule-global-loads.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FIXME: This currently doesn't do a great job of clustering the +; loads, which end up with extra moves between them. Right now, it +; seems the only things areLoadsFromSameBasePtr is accomplishing is +; ordering the loads so that the lower address loads come first. 
+ +; FUNC-LABEL: @cluster_global_arg_loads +; SI: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4 +; SI: BUFFER_STORE_DWORD [[REG0]] +; SI: BUFFER_STORE_DWORD [[REG1]] +define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { + %load0 = load i32 addrspace(1)* %ptr, align 4 + %gep = getelementptr i32 addrspace(1)* %ptr, i32 1 + %load1 = load i32 addrspace(1)* %gep, align 4 + store i32 %load0, i32 addrspace(1)* %out0, align 4 + store i32 %load1, i32 addrspace(1)* %out1, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll new file mode 100644 index 00000000000..34b709810a9 --- /dev/null +++ b/test/CodeGen/R600/schedule-kernel-arg-loads.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +; FUNC-LABEL: @cluster_arg_loads +; SI: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-NEXT: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe +define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { + store i32 %x, i32 addrspace(1)* %out0, align 4 + store i32 %y, i32 addrspace(1)* %out1, align 4 + ret void +} diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll index 31cdfcd1a88..4a278fbcce0 100644 --- a/test/CodeGen/R600/trunc.ll +++ b/test/CodeGen/R600/trunc.ll @@ -30,10 +30,11 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { } ; SI-LABEL: @trunc_shl_i64: -; SI: S_LOAD_DWORDX2 
s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, -; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]], -; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2 -; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] +; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI: S_ADD_I32 s[[LO_SREG2:[0-9]+]], s[[LO_SREG]], +; SI: S_ADDC_U32 +; SI: S_LSHL_B64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG2]]:{{[0-9]+\]}}, 2 +; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SHL]] ; SI: BUFFER_STORE_DWORD v[[LO_VREG]], define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll index 2cf88fe9f73..17e0b82e8a7 100644 --- a/test/CodeGen/R600/wait.ll +++ b/test/CodeGen/R600/wait.ll @@ -1,37 +1,45 @@ -; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -;CHECK-LABEL: @main -;CHECK: S_WAITCNT lgkmcnt(0) -;CHECK: S_WAITCNT vmcnt(0) -;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0) - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 { +; CHECK-LABEL: @main +; CHECK: S_LOAD_DWORDX4 +; CHECK: S_LOAD_DWORDX4 +; CHECK: S_WAITCNT lgkmcnt(0) +; CHECK: S_WAITCNT vmcnt(0) +; CHECK: S_WAITCNT expcnt(0) lgkmcnt(0) +define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { main_body: - %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0 - %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0 - %12 = 
call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6) - %13 = extractelement <4 x float> %12, i32 0 - %14 = extractelement <4 x float> %12, i32 1 - %15 = extractelement <4 x float> %12, i32 2 - %16 = extractelement <4 x float> %12, i32 3 - %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1 - %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0 - %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6) - %20 = extractelement <4 x float> %19, i32 0 - %21 = extractelement <4 x float> %19, i32 1 - %22 = extractelement <4 x float> %19, i32 2 - %23 = extractelement <4 x float> %19, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16) + %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0 + %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) + %tmp12 = extractelement <4 x float> %tmp11, i32 0 + %tmp13 = extractelement <4 x float> %tmp11, i32 1 + call void @llvm.AMDGPU.barrier.global() #1 + %tmp14 = extractelement <4 x float> %tmp11, i32 2 +; %tmp15 = extractelement <4 x float> %tmp11, i32 3 + %tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1 + %tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 
%tmp12, float %tmp13, float %tmp14, float %tmp15) ret void } +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.global() #1 + ; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readnone } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind readnone } -!0 = metadata !{metadata !"const", null, i32 1} +!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1} +!1 = metadata !{metadata !"const", null} -- 2.34.1