From ce961477be78b3945e6ec4b7e22066f237a89846 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 7 Jun 2013 20:28:55 +0000 Subject: [PATCH] R600: Fix the fetch limits for R600 generation GPUs Reviewed-by: Vincent Lejeune https://bugs.freedesktop.org/show_bug.cgi?id=64257 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183560 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.td | 9 +++ lib/Target/R600/AMDGPUSubtarget.cpp | 11 +--- lib/Target/R600/AMDILDeviceInfo.h | 3 +- lib/Target/R600/Processors.td | 34 ++++++----- test/CodeGen/R600/fetch-limits.r600.ll | 48 +++++++++++++++ test/CodeGen/R600/fetch-limits.r700+.ll | 81 +++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 27 deletions(-) create mode 100644 test/CodeGen/R600/fetch-limits.r600.ll create mode 100644 test/CodeGen/R600/fetch-limits.r700+.ll diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 335b26cbf2c..099a4918474 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -75,7 +75,16 @@ def FeatureVertexCache : SubtargetFeature<"HasVertexCache", "true", "Specify use of dedicated vertex cache.">; +class SubtargetFeatureFetchLimit <string Value> : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value>; +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +//===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index da342892da5..158903020f6 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -34,19 +34,10 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : DefaultSize[1] = 1; DefaultSize[2] = 1; HasVertexCache = false; + TexVTXClauseSize = 0; 
ParseSubtargetFeatures(GPU, FS); DevName = GPU; Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); - - // FIXME: The code in the comment below was the original code. But the - // condition is always true, generating a warning when compiled with - // gcc. Vincent Lejeune indicated in a mail to llvm-commits 2013-05-23 that he - // will look into this. The code 'TexVTXClauseSize = 16' is just a temporary - // equivalent replacement, to get rid of the compiler warning. - - // TexVTXClauseSize = (Device->getGeneration() >= AMDGPUDeviceInfo::HD4XXX)?16:8; - - TexVTXClauseSize = 16; } AMDGPUSubtarget::~AMDGPUSubtarget() { diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h index 4b2c3a53c79..04530e6110e 100644 --- a/lib/Target/R600/AMDILDeviceInfo.h +++ b/lib/Target/R600/AMDILDeviceInfo.h @@ -71,7 +71,8 @@ namespace llvm { /// These have to be in order with the older generations /// having the lower number enumerations. enum Generation { - HD4XXX = 0, ///< 7XX based devices. + HD3XXX = 0, ///< 6XX based devices. + HD4XXX, ///< 7XX based devices. HD5XXX, ///< Evergreen based devices. HD6XXX, ///< NI/Evergreen+ based devices. HD7XXX, ///< Southern Islands based devices. 
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 0cbe919d810..4c377d05b9e 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -10,37 +10,39 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> : Processor<Name, itin, Features>; def : Proc<"", R600_VLIW5_Itin, - [FeatureR600ALUInst, FeatureVertexCache]>; + [FeatureR600ALUInst, FeatureVertexCache, FeatureFetchLimit8]>; def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600ALUInst , FeatureVertexCache]>; + [FeatureR600ALUInst , FeatureVertexCache, FeatureFetchLimit8]>; def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600ALUInst]>; + [FeatureR600ALUInst, FeatureFetchLimit8]>; def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>; + [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache, FeatureFetchLimit8]>; def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureVertexCache]>; + [FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureVertexCache]>; + [FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureFP64, FeatureVertexCache]>; + [FeatureFP64, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages]>; + [FeatureByteAddress, FeatureImages, FeatureFetchLimit16]>; def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureFP64, 
FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"barts", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"turks", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; + [FeatureByteAddress, FeatureImages, FeatureVertexCache, FeatureFetchLimit16]>; def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureByteAddress, FeatureImages]>; + [FeatureByteAddress, FeatureImages, FeatureFetchLimit16]>; def : Proc<"cayman", R600_VLIW4_Itin, - [FeatureByteAddress, FeatureImages, FeatureFP64]>;def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>; + [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureFetchLimit16]>; + +def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>; diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll new file mode 100644 index 00000000000..f78d1d968e5 --- /dev/null +++ b/test/CodeGen/R600/fetch-limits.r600.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s + +; R600 supports 8 fetches in a clause +; CHECK: @fetch_limits_r600 +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r600() #0 { +entry: + %0 = load <4 x float> addrspace(8)* null + %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + + %bcde = fadd <4 x float> %bc, %de + + call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll new file mode 100644 index 00000000000..1a8a43fccc7 --- /dev/null +++ b/test/CodeGen/R600/fetch-limits.r700+.ll @@ -0,0 +1,81 @@ +; 
RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; r700+ supports 16 fetches in a clause +; CHECK: @fetch_limits_r700 +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r700() #0 { +entry: + %0 = load <4 x float> addrspace(8)* null + %1 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %9 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %11 = load <4 x float> 
addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %13 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) + %14 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %15 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) + %16 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> 
%16, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %res9 + %f = fadd <4 x float> %res10, %res11 + %g = fadd <4 x float> %res12, %res13 + %h = fadd <4 x float> %res14, %res15 + %i = fadd <4 x float> %res16, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + %fg = fadd <4 x float> %f, %g + %hi = fadd <4 x float> %h, %i + + %bcde = fadd <4 x float> %bc, %de + %fghi = fadd <4 x float> %fg, %hi + + %bcdefghi = fadd <4 x float> %bcde, %fghi + call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -- 2.34.1