NVPTXLowerAggrCopies.cpp
NVPTXLowerKernelArgs.cpp
NVPTXLowerAlloca.cpp
+ NVPTXPeephole.cpp
NVPTXMCExpr.cpp
NVPTXPrologEpilogPass.cpp
NVPTXRegisterInfo.cpp
FunctionPass *createNVPTXImageOptimizerPass();
FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
BasicBlockPass *createNVPTXLowerAllocaPass();
+MachineFunctionPass *createNVPTXPeephole();
bool isImageOrSamplerVal(const Value *, const Module *);
if (MF.getFrameInfo()->hasStackObjects()) {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
// Insert "mov.u32 %SP, %Depot"
- MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineInstr *MI = MBB.begin();
+ MachineRegisterInfo &MR = MF.getRegInfo();
+
    // This instruction really occurs before the first instruction
    // in the BB, so we give it no debug location.
DebugLoc dl = DebugLoc();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
// mov %SPL, %depot;
// cvta.local %SP, %SPL;
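+    // %SPL (VRFrameLocal) always receives the depot address; %SP (VRFrame)
+    // is only materialized below when something still uses the generic
+    // stack pointer.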
if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
- unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
- MachineInstr *MI =
- BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
- NVPTX::cvta_local_yes_64),
- NVPTX::VRFrame).addReg(LocalReg);
+      // Only emit "cvta.local %SP, %SPL" if %SP is actually used.
+      if (!MR.use_empty(NVPTX::VRFrame)) {
+ MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
+ NVPTX::cvta_local_yes_64),
+ NVPTX::VRFrame)
+ .addReg(NVPTX::VRFrameLocal);
+ }
+
BuildMI(MBB, MI, dl,
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
- LocalReg).addImm(MF.getFunctionNumber());
+ NVPTX::VRFrameLocal)
+ .addImm(MF.getFunctionNumber());
} else {
- unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
- MachineInstr *MI =
- BuildMI(MBB, MBBI, dl,
- MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
- NVPTX::VRFrame).addReg(LocalReg);
+      // Only emit "cvta.local %SP, %SPL" if %SP is actually used.
+      if (!MR.use_empty(NVPTX::VRFrame)) {
+ MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
+ NVPTX::cvta_local_yes),
+ NVPTX::VRFrame)
+ .addReg(NVPTX::VRFrameLocal);
+ }
BuildMI(MBB, MI, dl,
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
- LocalReg).addImm(MF.getFunctionNumber());
+ NVPTX::VRFrameLocal)
+ .addImm(MF.getFunctionNumber());
}
}
}
--- /dev/null
+//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimizations ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// In NVPTX, NVPTXFrameLowering will emit the following instructions at the
+// beginning of a MachineFunction.
+//
+// mov %SPL, %depot
+// cvta.local %SP, %SPL
+//
+// Because a frame index is a generic address and alloca can only return a
+// generic pointer, without this pass the instructions producing the alloca'ed
+// address are based on %SP. NVPTXLowerAlloca helps replace stores and loads
+// on this address with their .local versions, but that may introduce a lot of
+// cvta.to.local instructions. Performance improves if we avoid casting the
+// address back and forth and instead compute the local address directly from
+// %SPL. This peephole pass optimizes these cases.
+//
+// It will transform the following pattern
+// %vreg0<def> = LEA_ADDRi64 <fi#0>, 4
+// %vreg1<def> = cvta_to_local_yes_64 %vreg0
+//
+// into
+// %vreg1<def> = LEA_ADDRi64 %VRFrameLocal, 4
+//
+// %VRFrameLocal is the virtual register name of %SPL
+//
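+// At the PTX level this looks like the following sketch (illustrative only;
+// actual register numbering and offsets will differ):
+//
+//   add.u64           %rd1, %SP, 0;
+//   cvta.to.local.u64 %rd2, %rd1;
+//   st.local.u32      [%rd2], %r1;
+//
+// becomes
+//
+//   add.u64      %rd2, %SPL, 0;
+//   st.local.u32 [%rd2], %r1;
+//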
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-peephole"
+
+namespace llvm {
+void initializeNVPTXPeepholePass(PassRegistry &);
+}
+
+namespace {
+struct NVPTXPeephole : public MachineFunctionPass {
+ public:
+ static char ID;
+ NVPTXPeephole() : MachineFunctionPass(ID) {
+ initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "NVPTX optimize redundant cvta.to.local instruction";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char NVPTXPeephole::ID = 0;
+
+INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
+
+static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+ auto &MBB = *Root.getParent();
+ auto &MF = *MBB.getParent();
+  // Check that the current instruction is a cvta.to.local.
+ if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 &&
+ Root.getOpcode() != NVPTX::cvta_to_local_yes)
+ return false;
+
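+  // Trace the generic-address operand back to its unique (SSA) definition;
+  // a physical register has no unique vreg def, so it is skipped below.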
+ auto &Op = Root.getOperand(1);
+ const auto &MRI = MF.getRegInfo();
+ MachineInstr *GenericAddrDef = nullptr;
+ if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+ }
+
+  // Check that the register operand is uniquely defined by an LEA_ADDRi
+  // instruction in the same basic block.
+ if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
+ (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
+ GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
+ return false;
+ }
+
+  // Check that the base operand of the LEA_ADDRi is a frame index.
+  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
+  if (BaseAddrOp.isFI()) {
+ return true;
+ }
+
+ return false;
+}
+
+static void CombineCVTAToLocal(MachineInstr &Root) {
+ auto &MBB = *Root.getParent();
+ auto &MF = *MBB.getParent();
+ const auto &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+
+  // Fold the frame object's offset into the LEA immediate so the new
+  // address is computed directly relative to %SPL.
+ int FrameIndex = Prev.getOperand(1).getIndex();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ Prev.getOperand(2).getImm();
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
+ Root.getOperand(0).getReg())
+ .addReg(NVPTX::VRFrameLocal)
+          .addImm(Offset);
+
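+  // Insert the new LEA where Root was, so that the replacement def of
+  // Root's destination register reaches all of Root's former uses.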
+ MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
+
+  // If Root was the only non-debug use of Prev's def, Prev is now dead.
+ if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
+ Prev.eraseFromParentAndMarkDBGValuesForRemoval();
+ }
+ Root.eraseFromParentAndMarkDBGValuesForRemoval();
+}
+
+bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ // Loop over all of the basic blocks.
+ for (auto &MBB : MF) {
+ // Traverse the basic block.
+ auto BlockIter = MBB.begin();
+
+ while (BlockIter != MBB.end()) {
+ auto &MI = *BlockIter++;
+ if (isCVTAToLocalCombinationCandidate(MI)) {
+ CombineCVTAToLocal(MI);
+ Changed = true;
+ }
+ } // Instruction
+ } // Basic Block
+ return Changed;
+}
+
+MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
(sequence "ENVREG%u", 0, 31))>;
if (!ST.hasImageHandles())
addPass(createNVPTXReplaceImageHandlesPass());
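+  // Run the peephole at the end of instruction selection, before
+  // NVPTXPrologEpilogPass: once the rebased LEAs leave %SP unused, the
+  // prologue can skip emitting "cvta.local %SP, %SPL" (see the foo3 test
+  // below).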
+ addPass(createNVPTXPeephole());
+
return false;
}
%buf = alloca [16 x i8], align 4
; CHECK: .local .align 4 .b8 __local_depot0[16]
-; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
+; CHECK: mov.u64 %SPL
; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
; Ensure we access the local stack properly
-; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
-; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
define void @foo(i32 %a) {
store volatile i32 %a, i32* %local
ret void
}
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX32: add.u32 %r[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX32: st.local.u32 [%r[[SP_REG]]], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
+; PTX64: add.u64 %rd[[SP_REG:[0-9]+]], %SPL, 0;
+; PTX64: st.local.u32 [%rd[[SP_REG]]], %r{{[0-9]+}};
+define void @foo2(i32 %a) {
+ %local = alloca i32, align 4
+ store i32 %a, i32* %local
+ call void @bar(i32* %local)
+ ret void
+}
+
+declare void @bar(i32* %a)
+
+!nvvm.annotations = !{!0}
+!0 = !{void (i32)* @foo2, !"kernel", i32 1}
+
+; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
+; PTX32-NOT: cvta.local.u32 %SP, %SPL;
+; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX32: add.u32 %r{{[0-9]+}}, %SPL, 0;
+; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
+; PTX64-NOT: cvta.local.u64 %SP, %SPL;
+; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
+; PTX64: add.u64 %rd{{[0-9]+}}, %SPL, 0;
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+define void @foo3(i32 %a) {
+ %local = alloca [3 x i32], align 4
+ %1 = bitcast [3 x i32]* %local to i32*
+ %2 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %a, i32* %2
+ ret void
+}