From 58f8a138a9db8ef45f1061cc81e9bf2ab541cbb0 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Wed, 24 Jun 2015 20:20:16 +0000
Subject: [PATCH] Add NVPTXPeephole pass to reduce unnecessary address cast

Summary:
This patch first changes the register that holds the local address for the
stack frame to %SPL. Then the new NVPTXPeephole pass will try to scan the
following pattern

  %vreg0 = LEA_ADDRi64 <fi#0>, 4
  %vreg1 = cvta_to_local %vreg0

and transform it into

  %vreg1 = LEA_ADDRi64 %VRFrameLocal, 4

Patched by Xuetian Weng

Test Plan: test/CodeGen/NVPTX/local-stack-frame.ll

Reviewers: jholewinski, jingyue

Reviewed By: jingyue

Subscribers: eliben, jholewinski, llvm-commits

Differential Revision: http://reviews.llvm.org/D10549

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240587 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/CMakeLists.txt               |   1 +
 lib/Target/NVPTX/NVPTX.h                      |   1 +
 lib/Target/NVPTX/NVPTXFrameLowering.cpp       |  37 +++--
 lib/Target/NVPTX/NVPTXPeephole.cpp            | 151 ++++++++++++++++++
 lib/Target/NVPTX/NVPTXRegisterInfo.td         |   2 +-
 lib/Target/NVPTX/NVPTXTargetMachine.cpp       |   2 +
 test/CodeGen/NVPTX/call-with-alloca-buffer.ll |   3 +-
 test/CodeGen/NVPTX/local-stack-frame.ll       |  48 +++++-
 8 files changed, 223 insertions(+), 22 deletions(-)
 create mode 100644 lib/Target/NVPTX/NVPTXPeephole.cpp

diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 99e950eba80..05fe06dbc07 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -22,6 +22,7 @@ set(NVPTXCodeGen_sources
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerKernelArgs.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXRegisterInfo.cpp
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 28ae3e8f393..fe28214e958 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -71,6 +71,7 @@ MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
+MachineFunctionPass *createNVPTXPeephole();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 5503494fc3c..0b0cb87e5bc 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -36,33 +36,40 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
   if (MF.getFrameInfo()->hasStackObjects()) {
     assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
     // Insert "mov.u32 %SP, %Depot"
-    MachineBasicBlock::iterator MBBI = MBB.begin();
+    MachineInstr *MI = MBB.begin();
+    MachineRegisterInfo &MR = MF.getRegInfo();
+
     // This instruction really occurs before first instruction
     // in the BB, so giving it no debug location.
     DebugLoc dl = DebugLoc();
 
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-
     // mov %SPL, %depot;
     // cvta.local %SP, %SPL;
     if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
-      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
-      MachineInstr *MI =
-          BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
-                                     NVPTX::cvta_local_yes_64),
-                  NVPTX::VRFrame).addReg(LocalReg);
+      // Check if %SP is actually used
+      if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
+        MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
+                                      NVPTX::cvta_local_yes_64),
+                     NVPTX::VRFrame)
+                 .addReg(NVPTX::VRFrameLocal);
+      }
+
       BuildMI(MBB, MI, dl,
               MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
-              LocalReg).addImm(MF.getFunctionNumber());
+              NVPTX::VRFrameLocal)
+          .addImm(MF.getFunctionNumber());
     } else {
-      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
-      MachineInstr *MI =
-          BuildMI(MBB, MBBI, dl,
-                  MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
-                  NVPTX::VRFrame).addReg(LocalReg);
+      // Check if %SP is actually used
+      if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
+        MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
+                                      NVPTX::cvta_local_yes),
+                     NVPTX::VRFrame)
+                 .addReg(NVPTX::VRFrameLocal);
+      }
       BuildMI(MBB, MI, dl,
               MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
-              LocalReg).addImm(MF.getFunctionNumber());
+              NVPTX::VRFrameLocal)
+          .addImm(MF.getFunctionNumber());
     }
   }
 }
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
new file mode 100644
index 00000000000..bdb463f9de5
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -0,0 +1,151 @@
+//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimizations ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// In NVPTX, NVPTXFrameLowering will emit the following instructions at the
+// beginning of a MachineFunction.
+//
+//   mov %SPL, %depot
+//   cvta.local %SP, %SPL
+//
+// Because Frame Index is a generic address and alloca can only return generic
+// pointer, without this pass the instructions producing alloca'ed address will
+// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
+// this address with their .local versions, but this may introduce a lot of
+// cvta.to.local instructions. Performance can be improved if we avoid casting
+// address back and forth and directly calculate local address based on %SPL.
+// This peephole pass optimizes these cases, for example
+//
+// It will transform the following pattern
+//    %vreg0 = LEA_ADDRi64 <fi#0>, 4
+//    %vreg1 = cvta_to_local_yes_64 %vreg0
+//
+// into
+//    %vreg1 = LEA_ADDRi64 %VRFrameLocal, 4
+//
+// %VRFrameLocal is the virtual register name of %SPL
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-peephole"
+
+namespace llvm {
+void initializeNVPTXPeepholePass(PassRegistry &);
+}
+
+namespace {
+struct NVPTXPeephole : public MachineFunctionPass {
+ public:
+  static char ID;
+  NVPTXPeephole() : MachineFunctionPass(ID) {
+    initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "NVPTX optimize redundant cvta.to.local instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char NVPTXPeephole::ID = 0;
+
+INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
+
+static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  // Check current instruction is cvta.to.local
+  if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 &&
+      Root.getOpcode() != NVPTX::cvta_to_local_yes)
+    return false;
+
+  auto &Op = Root.getOperand(1);
+  const auto &MRI = MF.getRegInfo();
+  MachineInstr *GenericAddrDef = nullptr;
+  if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+  }
+
+  // Check the register operand is uniquely defined by LEA_ADDRi instruction
+  if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
+      (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
+       GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
+    return false;
+  }
+
+  // Check the LEA_ADDRi operand is Frame index
+  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
+  if (BaseAddrOp.getType() == MachineOperand::MO_FrameIndex) {
+    return true;
+  }
+
+  return false;
+}
+
+static void CombineCVTAToLocal(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  const auto &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+
+  // Get the correct offset
+  int FrameIndex = Prev.getOperand(1).getIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               Prev.getOperand(2).getImm();
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
+              Root.getOperand(0).getReg())
+          .addReg(NVPTX::VRFrameLocal)
+          .addOperand(MachineOperand::CreateImm(Offset));
+
+  MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
+
+  // Check if the register defined by Prev has only one non-debug use (Root)
+  if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
+    Prev.eraseFromParentAndMarkDBGValuesForRemoval();
+  }
+  Root.eraseFromParentAndMarkDBGValuesForRemoval();
+}
+
+bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  // Loop over all of the basic blocks.
+  for (auto &MBB : MF) {
+    // Traverse the basic block.
+    auto BlockIter = MBB.begin();
+
+    while (BlockIter != MBB.end()) {
+      auto &MI = *BlockIter++;
+      if (isCVTAToLocalCombinationCandidate(MI)) {
+        CombineCVTAToLocal(MI);
+        Changed = true;
+      }
+    }  // Instruction
+  }  // Basic Block
+
+  return Changed;
+}
+
+MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index efcee6b6f2b..ff6ccc457db 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -65,5 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
 def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
 
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
                                             (sequence "ENVREG%u", 0, 31))>;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index c071ee82abc..dc3e34f9647 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -205,6 +205,8 @@ bool NVPTXPassConfig::addInstSelector() {
 
   if (!ST.hasImageHandles())
     addPass(createNVPTXReplaceImageHandlesPass());
 
+  addPass(createNVPTXPeephole());
+
   return false;
 }
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 8ff762aa7c4..7ca31bbbaf2 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,8 +20,7 @@ entry:
   %buf = alloca [16 x i8], align 4
 
 ; CHECK: .local .align 4 .b8 __local_depot0[16]
-; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
+; CHECK: mov.u64 %SPL
 
 ; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll
index 377eee9170e..1058e4b0a57 100644
--- a/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -3,12 +3,12 @@
 
 ; Ensure we access the local stack properly
 
-; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
 ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
 ; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
-; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
 ; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
 ; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
 define void @foo(i32 %a) {
@@ -16,3 +16,43 @@ define void @foo(i32 %a) {
   store volatile i32 %a, i32* %local
   ret void
 }
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX32: add.u32 %r[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX32: st.local.u32 [%r[[SP_REG]]], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX64: add.u64 %rd[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX64: st.local.u32 [%rd[[SP_REG]]], %r{{[0-9]+}};
+define void @foo2(i32 %a) {
+  %local = alloca i32, align 4
+  store i32 %a, i32* %local
+  call void @bar(i32* %local)
+  ret void
+}
+
+declare void @bar(i32* %a)
+
+!nvvm.annotations = !{!0}
+!0 = !{void (i32)* @foo2, !"kernel", i32 1}
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32-NOT: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX32: add.u32 %r{{[0-9]+}}, %SPL, 0;
+; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64-NOT: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX64: add.u64 %rd{{[0-9]+}}, %SPL, 0;
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+define void @foo3(i32 %a) {
+  %local = alloca [3 x i32], align 4
+  %1 = bitcast [3 x i32]* %local to i32*
+  %2 = getelementptr inbounds i32, i32* %1, i32 %a
+  store i32 %a, i32* %2
+  ret void
+}
-- 
2.34.1