From bb9c0afde53073f3fcd30c851420d58c950127fa Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 24 Sep 2015 07:16:37 +0000 Subject: [PATCH] AMDGPU: Reduce number of copies emitted Instead of always inserting a copy in case the super register is itself a subregister, only extract to the super reg class if this is actually the case. This shouldn't really change codegen, but makes looking at the output of SIFixSGPRCopies easier to read. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@248467 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInstrInfo.cpp | 14 +++++++++----- test/CodeGen/AMDGPU/and.ll | 3 ++- test/CodeGen/AMDGPU/ctpop64.ll | 5 ++--- test/CodeGen/AMDGPU/s_movk_i32.ll | 18 ++++++------------ 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index c7218dedddc..e2b6d3c1403 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1556,17 +1556,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index 5672d470bd7..8c7c1bc3626 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -147,9 +147,10 @@ endif: ret void } +; FIXME: and 0 should be replaced with copy ; FUNC-LABEL: {{^}}v_and_constant_i64: ; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index 82cdd52f2ce..dd2840bd851 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -36,15 +36,14 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali ret void } -; FIXME: We shouldn't emit the v_mov_b32 0 +; FIXME: or 0 should be replaced with copy ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, ; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_or_b32_e64 v[[RESULT_HI:[0-9]+]], 0, s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 
addrspace(1)* noalias %in, i64 %s.val) nounwind { diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll index 6b1a36c979c..47c7fbb6dd6 100644 --- a/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -3,10 +3,9 @@ ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -17,10 +16,9 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k1: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -31,10 +29,9 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: 
s_endpgm define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -45,10 +42,9 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k3: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -59,10 +55,9 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k4: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -87,10 +82,9 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-LABEL: {{^}}s_movk_i32_k6: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm define void 
@s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 -- 2.34.1