From bf0ce512c5ecaff0631b0c12ff85b0d3e2b5de12 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 6 Nov 2015 17:54:43 +0000 Subject: [PATCH] AMDGPU: Hack for VS_32 register pressure For some reason VS_32 ends up factoring into the pressure heuristics even though we should never see a virtual register with this class. When SGPRs are reserved for register spilling, this for some reason triggers reg-crit scheduling. Setting isAllocatable = 0 may help with this since that seems to remove it from the default implementation's generated table. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252321 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIRegisterInfo.cpp | 14 ++++++++++---- lib/Target/AMDGPU/SIRegisterInfo.h | 7 +++++++ test/CodeGen/AMDGPU/salu-to-valu.ll | 18 +++++++++--------- test/CodeGen/AMDGPU/valu-i1.ll | 4 ++-- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 429bb2fbd87..436808b5287 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -73,26 +73,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { if (Sets[i] == (int)Idx) diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 3458cec0923..b1389533ec3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -59,6 +59,13 @@ public: /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// returns true if this is a pseudoregister class combination of VGPRs and + /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on + /// them. + static bool isPseudoRegClass(const TargetRegisterClass *RC) { + return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll index 65b4a7dadba..b74f31bcdda 100644 --- a/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -185,18 +185,18 @@ entry: ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}} -; SI: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16 -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}} +; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16 +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}} -; CI: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x13490{{$}} -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}} +; CI-DAG: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x13490{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}} -; SI: s_add_i32 s[[OFFSET2:[0-9]+]], s[[OFFSET0]], 32 -; CI: s_mov_b32 s[[OFFSET2:[0-9]+]], 0x134a0 +; SI-DAG: s_add_i32 s[[OFFSET2:[0-9]+]], s[[OFFSET0]], 32 +; CI-DAG: s_mov_b32 s[[OFFSET2:[0-9]+]], 0x134a0 -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}} -; GCN: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16 -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}} +; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16 +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index c27702813a8..1cbefba60c9 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -78,8 +78,8 @@ exit: ; SI: BB2_3: ; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, +; SI-DAG: buffer_store_dword +; SI-DAG: v_cmp_eq_i32_e32 vcc, ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] ; SI: s_andn2_b64 exec, exec, [[OR_SREG]] ; SI: s_cbranch_execnz BB2_3 -- 2.34.1