lib/Target/AMDGPU/R600Packetizer.cpp

   1 //===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
   /// This pass implements instruction packetization for R600. It clears the
   /// isLast bit of instructions inside a bundle and substitutes source
   /// registers with PreviousVector registers when applicable.
  14 //
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "llvm/Support/Debug.h"
  18 #include "AMDGPU.h"
  19 #include "AMDGPUSubtarget.h"
  20 #include "R600InstrInfo.h"
  21 #include "llvm/CodeGen/DFAPacketizer.h"
  22 #include "llvm/CodeGen/MachineDominators.h"
  23 #include "llvm/CodeGen/MachineFunctionPass.h"
  24 #include "llvm/CodeGen/MachineLoopInfo.h"
  25 #include "llvm/CodeGen/Passes.h"
  26 #include "llvm/CodeGen/ScheduleDAG.h"
  27 #include "llvm/Support/raw_ostream.h"
  28
  29 using namespace llvm;
  30
  31 #define DEBUG_TYPE "packets"
  32
  33 namespace {
  34
  35 class R600Packetizer : public MachineFunctionPass {
  36
  37 public:
  38   static char ID;
  39   R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
  40
  41   void getAnalysisUsage(AnalysisUsage &AU) const override {
  42     AU.setPreservesCFG();
  43     AU.addRequired<MachineDominatorTree>();
  44     AU.addPreserved<MachineDominatorTree>();
  45     AU.addRequired<MachineLoopInfo>();
  46     AU.addPreserved<MachineLoopInfo>();
  47     MachineFunctionPass::getAnalysisUsage(AU);
  48   }
  49
  50   const char *getPassName() const override {
  51     return "R600 Packetizer";
  52   }
  53
  54   bool runOnMachineFunction(MachineFunction &Fn) override;
  55 };
  56 char R600Packetizer::ID = 0;
  57
  58 class R600PacketizerList : public VLIWPacketizerList {
  59
  60 private:
  61   const R600InstrInfo *TII;
  62   const R600RegisterInfo &TRI;
  63   bool VLIW5;
  64   bool ConsideredInstUsesAlreadyWrittenVectorElement;
  65
  66   unsigned getSlot(const MachineInstr *MI) const {
  67     return TRI.getHWRegChan(MI->getOperand(0).getReg());
  68   }
  69
  70   /// \returns register to PV chan mapping for bundle/single instructions that
  71   /// immediately precedes I.
  72   DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
  73       const {
  74     DenseMap<unsigned, unsigned> Result;
  75     I--;
  76     if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
  77       return Result;
  78     MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
  79     if (I->isBundle())
  80       BI++;
  81     int LastDstChan = -1;
  82     do {
  83       bool isTrans = false;
  84       int BISlot = getSlot(&*BI);
  85       if (LastDstChan >= BISlot)
  86         isTrans = true;
  87       LastDstChan = BISlot;
  88       if (TII->isPredicated(&*BI))
  89         continue;
  90       int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
  91       if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
  92         continue;
  93       int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
  94       if (DstIdx == -1) {
  95         continue;
  96       }
  97       unsigned Dst = BI->getOperand(DstIdx).getReg();
  98       if (isTrans || TII->isTransOnly(&*BI)) {
  99         Result[Dst] = AMDGPU::PS;
 100         continue;
 101       }
 102       if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
 103           BI->getOpcode() == AMDGPU::DOT4_eg) {
 104         Result[Dst] = AMDGPU::PV_X;
 105         continue;
 106       }
 107       if (Dst == AMDGPU::OQAP) {
 108         continue;
 109       }
 110       unsigned PVReg = 0;
 111       switch (TRI.getHWRegChan(Dst)) {
 112       case 0:
 113         PVReg = AMDGPU::PV_X;
 114         break;
 115       case 1:
 116         PVReg = AMDGPU::PV_Y;
 117         break;
 118       case 2:
 119         PVReg = AMDGPU::PV_Z;
 120         break;
 121       case 3:
 122         PVReg = AMDGPU::PV_W;
 123         break;
 124       default:
 125         llvm_unreachable("Invalid Chan");
 126       }
 127       Result[Dst] = PVReg;
 128     } while ((++BI)->isBundledWithPred());
 129     return Result;
 130   }
 131
 132   void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
 133       const {
 134     unsigned Ops[] = {
 135       AMDGPU::OpName::src0,
 136       AMDGPU::OpName::src1,
 137       AMDGPU::OpName::src2
 138     };
 139     for (unsigned i = 0; i < 3; i++) {
 140       int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
 141       if (OperandIdx < 0)
 142         continue;
 143       unsigned Src = MI->getOperand(OperandIdx).getReg();
 144       const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
 145       if (It != PVs.end())
 146         MI->getOperand(OperandIdx).setReg(It->second);
 147     }
 148   }
 149 public:
 150   // Ctor.
 151   R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
 152       : VLIWPacketizerList(MF, MLI, nullptr),
 153         TII(static_cast<const R600InstrInfo *>(
 154             MF.getSubtarget().getInstrInfo())),
 155         TRI(TII->getRegisterInfo()) {
 156     VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
 157   }
 158
 159   // initPacketizerState - initialize some internal flags.
 160   void initPacketizerState() override {
 161     ConsideredInstUsesAlreadyWrittenVectorElement = false;
 162   }
 163
 164   // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
 165   bool ignorePseudoInstruction(const MachineInstr *MI,
 166                                const MachineBasicBlock *MBB) override {
 167     return false;
 168   }
 169
 170   // isSoloInstruction - return true if instruction MI can not be packetized
 171   // with any other instruction, which means that MI itself is a packet.
 172   bool isSoloInstruction(const MachineInstr *MI) override {
 173     if (TII->isVector(*MI))
 174       return true;
 175     if (!TII->isALUInstr(MI->getOpcode()))
 176       return true;
 177     if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
 178       return true;
 179     // XXX: This can be removed once the packetizer properly handles all the
 180     // LDS instruction group restrictions.
 181     if (TII->isLDSInstr(MI->getOpcode()))
 182       return true;
 183     return false;
 184   }
 185
 186   // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
 187   // together.
 188   bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
 189     MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
 190     if (getSlot(MII) == getSlot(MIJ))
 191       ConsideredInstUsesAlreadyWrittenVectorElement = true;
 192     // Does MII and MIJ share the same pred_sel ?
 193     int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
 194         OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
 195     unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
 196         PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
 197     if (PredI != PredJ)
 198       return false;
 199     if (SUJ->isSucc(SUI)) {
 200       for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) {
 201         const SDep &Dep = SUJ->Succs[i];
 202         if (Dep.getSUnit() != SUI)
 203           continue;
 204         if (Dep.getKind() == SDep::Anti)
 205           continue;
 206         if (Dep.getKind() == SDep::Output)
 207           if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
 208             continue;
 209         return false;
 210       }
 211     }
 212
 213     bool ARDef = TII->definesAddressRegister(MII) ||
 214                  TII->definesAddressRegister(MIJ);
 215     bool ARUse = TII->usesAddressRegister(MII) ||
 216                  TII->usesAddressRegister(MIJ);
 217     if (ARDef && ARUse)
 218       return false;
 219
 220     return true;
 221   }
 222
 223   // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
 224   // and SUJ.
 225   bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
 226     return false;
 227   }
 228
 229   void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
 230     unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
 231     MI->getOperand(LastOp).setImm(Bit);
 232   }
 233
 234   bool isBundlableWithCurrentPMI(MachineInstr *MI,
 235                                  const DenseMap<unsigned, unsigned> &PV,
 236                                  std::vector<R600InstrInfo::BankSwizzle> &BS,
 237                                  bool &isTransSlot) {
 238     isTransSlot = TII->isTransOnly(MI);
 239     assert (!isTransSlot || VLIW5);
 240
 241     // Is the dst reg sequence legal ?
 242     if (!isTransSlot && !CurrentPacketMIs.empty()) {
 243       if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
 244         if (ConsideredInstUsesAlreadyWrittenVectorElement  &&
 245             !TII->isVectorOnly(MI) && VLIW5) {
 246           isTransSlot = true;
 247           DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
 248         }
 249         else
 250           return false;
 251       }
 252     }
 253
 254     // Are the Constants limitations met ?
 255     CurrentPacketMIs.push_back(MI);
 256     if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
 257       DEBUG(
 258         dbgs() << "Couldn't pack :\n";
 259         MI->dump();
 260         dbgs() << "with the following packets :\n";
 261         for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
 262           CurrentPacketMIs[i]->dump();
 263           dbgs() << "\n";
 264         }
 265         dbgs() << "because of Consts read limitations\n";
 266       );
 267       CurrentPacketMIs.pop_back();
 268       return false;
 269     }
 270
 271     // Is there a BankSwizzle set that meet Read Port limitations ?
 272     if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
 273             PV, BS, isTransSlot)) {
 274       DEBUG(
 275         dbgs() << "Couldn't pack :\n";
 276         MI->dump();
 277         dbgs() << "with the following packets :\n";
 278         for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
 279           CurrentPacketMIs[i]->dump();
 280           dbgs() << "\n";
 281         }
 282         dbgs() << "because of Read port limitations\n";
 283       );
 284       CurrentPacketMIs.pop_back();
 285       return false;
 286     }
 287
 288     // We cannot read LDS source registrs from the Trans slot.
 289     if (isTransSlot && TII->readsLDSSrcReg(MI))
 290       return false;
 291
 292     CurrentPacketMIs.pop_back();
 293     return true;
 294   }
 295
 296   MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
 297     MachineBasicBlock::iterator FirstInBundle =
 298         CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
 299     const DenseMap<unsigned, unsigned> &PV =
 300         getPreviousVector(FirstInBundle);
 301     std::vector<R600InstrInfo::BankSwizzle> BS;
 302     bool isTransSlot;
 303
 304     if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
 305       for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
 306         MachineInstr *MI = CurrentPacketMIs[i];
 307         unsigned Op = TII->getOperandIdx(MI->getOpcode(),
 308             AMDGPU::OpName::bank_swizzle);
 309         MI->getOperand(Op).setImm(BS[i]);
 310       }
 311       unsigned Op = TII->getOperandIdx(MI->getOpcode(),
 312           AMDGPU::OpName::bank_swizzle);
 313       MI->getOperand(Op).setImm(BS.back());
 314       if (!CurrentPacketMIs.empty())
 315         setIsLastBit(CurrentPacketMIs.back(), 0);
 316       substitutePV(MI, PV);
 317       MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
 318       if (isTransSlot) {
 319         endPacket(std::next(It)->getParent(), std::next(It));
 320       }
 321       return It;
 322     }
 323     endPacket(MI->getParent(), MI);
 324     if (TII->isTransOnly(MI))
 325       return MI;
 326     return VLIWPacketizerList::addToPacket(MI);
 327   }
 328 };
 329
 330 bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
 331   const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
 332   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
 333
 334   // Instantiate the packetizer.
 335   R600PacketizerList Packetizer(Fn, MLI);
 336
 337   // DFA state table should not be empty.
 338   assert(Packetizer.getResourceTracker() && "Empty DFA table!");
 339
 340   //
 341   // Loop over all basic blocks and remove KILL pseudo-instructions
 342   // These instructions confuse the dependence analysis. Consider:
 343   // D0 = ...   (Insn 0)
 344   // R0 = KILL R0, D0 (Insn 1)
 345   // R0 = ... (Insn 2)
 346   // Here, Insn 1 will result in the dependence graph not emitting an output
 347   // dependence between Insn 0 and Insn 2. This can lead to incorrect
 348   // packetization
 349   //
 350   for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
 351        MBB != MBBe; ++MBB) {
 352     MachineBasicBlock::iterator End = MBB->end();
 353     MachineBasicBlock::iterator MI = MBB->begin();
 354     while (MI != End) {
 355       if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
 356           (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
 357         MachineBasicBlock::iterator DeleteMI = MI;
 358         ++MI;
 359         MBB->erase(DeleteMI);
 360         End = MBB->end();
 361         continue;
 362       }
 363       ++MI;
 364     }
 365   }
 366
 367   // Loop over all of the basic blocks.
 368   for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
 369        MBB != MBBe; ++MBB) {
 370     // Find scheduling regions and schedule / packetize each region.
 371     unsigned RemainingCount = MBB->size();
 372     for(MachineBasicBlock::iterator RegionEnd = MBB->end();
 373         RegionEnd != MBB->begin();) {
 374       // The next region starts above the previous region. Look backward in the
 375       // instruction stream until we find the nearest boundary.
 376       MachineBasicBlock::iterator I = RegionEnd;
 377       for(;I != MBB->begin(); --I, --RemainingCount) {
 378         if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn))
 379           break;
 380       }
 381       I = MBB->begin();
 382
 383       // Skip empty scheduling regions.
 384       if (I == RegionEnd) {
 385         RegionEnd = std::prev(RegionEnd);
 386         --RemainingCount;
 387         continue;
 388       }
 389       // Skip regions with one instruction.
 390       if (I == std::prev(RegionEnd)) {
 391         RegionEnd = std::prev(RegionEnd);
 392         continue;
 393       }
 394
 395       Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd);
 396       RegionEnd = I;
 397     }
 398   }
 399
 400   return true;
 401
 402 }
 403
 404 } // end anonymous namespace
 405
 406 llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
 407   return new R600Packetizer(tm);
 408 }