From ae6db4bdd734d141c614220d3d560d0754b4ff7f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 1 Oct 2015 21:43:15 +0000 Subject: [PATCH] AMDGPU: Make SIInsertWaits about a factor of 4 faster This was the slowest target custom pass and was spending 80% of the time in getMinimalPhysRegClass which was called for every register operand. Try to use the statically known register class when possible from the instruction's MCOperandInfo. There are a few pseudo instructions which are not well behaved with unknown register classes which still require the expensive physical register class search. There are a few other possibilities for making this even faster, such as not inspecting implicit operands. For now those are checked because it is technically possible to have a scalar load into exec or vcc which can be implicitly used. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@249079 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInsertWaits.cpp | 45 ++++++++++++++-------------- lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 ++ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 7ab4fd68932..e0cf84abefc 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -91,7 +91,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. - RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, @@ -142,8 +143,7 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + uint64_t TSFlags = MI.getDesc().TSFlags; Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -161,10 +161,9 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { MachineOperand &Op = MI.getOperand(0); assert(Op.isReg() && "First LGKM operand must be a register!"); - unsigned Reg = Op.getReg(); - // XXX - What if this is a write into a super register? - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); Result.Named.LGKM = Size > 4 ? 2 : 1; } else { // s_dcache_inv etc. do not have a a destination register. Assume we @@ -185,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -236,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -305,12 +298,12 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define @@ -405,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index a3976f5e591..26cfa203cf7 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -326,6 +326,8 @@ unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); -- 2.34.1