lib/Target/AMDGPU/SIInsertWaits.cpp

    1 //===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "llvm/CodeGen/MachineFunction.h"
  25 #include "llvm/CodeGen/MachineFunctionPass.h"
  26 #include "llvm/CodeGen/MachineInstrBuilder.h"
  27 #include "llvm/CodeGen/MachineRegisterInfo.h"
  28
  29 using namespace llvm;
  30
  31 namespace {
  32
  33 /// \brief One variable for each of the hardware counters
  34 typedef union {
  35   struct {
  36     unsigned VM;
  37     unsigned EXP;
  38     unsigned LGKM;
  39   } Named;
  40   unsigned Array[3];
  41
  42 } Counters;
  43
  44 typedef enum {
  45   OTHER,
  46   SMEM,
  47   VMEM
  48 } InstType;
  49
  50 typedef Counters RegCounters[512];
  51 typedef std::pair<unsigned, unsigned> RegInterval;
  52
  53 class SIInsertWaits : public MachineFunctionPass {
  54
  55 private:
  56   static char ID;
  57   const SIInstrInfo *TII;
  58   const SIRegisterInfo *TRI;
  59   const MachineRegisterInfo *MRI;
  60
  61   /// \brief Constant hardware limits
  62   static const Counters WaitCounts;
  63
  64   /// \brief Constant zero value
  65   static const Counters ZeroCounts;
  66
  67   /// \brief Counter values we have already waited on.
  68   Counters WaitedOn;
  69
  70   /// \brief Counter values for last instruction issued.
  71   Counters LastIssued;
  72
  73   /// \brief Registers used by async instructions.
  74   RegCounters UsedRegs;
  75
  76   /// \brief Registers defined by async instructions.
  77   RegCounters DefinedRegs;
  78
  79   /// \brief Different export instruction types seen since last wait.
  80   unsigned ExpInstrTypesSeen;
  81
  82   /// \brief Type of the last opcode.
  83   InstType LastOpcodeType;
  84
  85   bool LastInstWritesM0;
  86
  87   /// \brief Get increment/decrement amount for this instruction.
  88   Counters getHwCounts(MachineInstr &MI);
  89
  90   /// \brief Is operand relevant for async execution?
  91   bool isOpRelevant(MachineOperand &Op);
  92
  93   /// \brief Get register interval an operand affects.
  94   RegInterval getRegInterval(const TargetRegisterClass *RC,
  95                              const MachineOperand &Reg) const;
  96
  97   /// \brief Handle instructions async components
  98   void pushInstruction(MachineBasicBlock &MBB,
  99                        MachineBasicBlock::iterator I);
 100
 101   /// \brief Insert the actual wait instruction
 102   bool insertWait(MachineBasicBlock &MBB,
 103                   MachineBasicBlock::iterator I,
 104                   const Counters &Counts);
 105
 106   /// \brief Do we need def2def checks?
 107   bool unorderedDefines(MachineInstr &MI);
 108
 109   /// \brief Resolve all operand dependencies to counter requirements
 110   Counters handleOperands(MachineInstr &MI);
 111
 112   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
 113   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 114
 115 public:
 116   SIInsertWaits(TargetMachine &tm) :
 117     MachineFunctionPass(ID),
 118     TII(nullptr),
 119     TRI(nullptr),
 120     ExpInstrTypesSeen(0) { }
 121
 122   bool runOnMachineFunction(MachineFunction &MF) override;
 123
 124   const char *getPassName() const override {
 125     return "SI insert wait instructions";
 126   }
 127
 128   void getAnalysisUsage(AnalysisUsage &AU) const override {
 129     AU.setPreservesCFG();
 130     MachineFunctionPass::getAnalysisUsage(AU);
 131   }
 132 };
 133
 134 } // End anonymous namespace
 135
 136 char SIInsertWaits::ID = 0;
 137
 138 const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
 139 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 140
 141 FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
 142   return new SIInsertWaits(tm);
 143 }
 144
 145 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 146   uint64_t TSFlags = MI.getDesc().TSFlags;
 147   Counters Result = { { 0, 0, 0 } };
 148
 149   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 150
 151   // Only consider stores or EXP for EXP_CNT
 152   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
 153       (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
 154
 155   // LGKM may uses larger values
 156   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 157
 158     if (TII->isSMRD(MI)) {
 159
 160       if (MI.getNumOperands() != 0) {
 161         assert(MI.getOperand(0).isReg() &&
 162                "First LGKM operand must be a register!");
 163
 164         // XXX - What if this is a write into a super register?
 165         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
 166         unsigned Size = RC->getSize();
 167         Result.Named.LGKM = Size > 4 ? 2 : 1;
 168       } else {
 169         // s_dcache_inv etc. do not have a a destination register. Assume we
 170         // want a wait on these.
 171         // XXX - What is the right value?
 172         Result.Named.LGKM = 1;
 173       }
 174     } else {
 175       // DS
 176       Result.Named.LGKM = 1;
 177     }
 178
 179   } else {
 180     Result.Named.LGKM = 0;
 181   }
 182
 183   return Result;
 184 }
 185
 186 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 187   // Constants are always irrelevant
 188   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 189     return false;
 190
 191   // Defines are always relevant
 192   if (Op.isDef())
 193     return true;
 194
 195   // For exports all registers are relevant
 196   MachineInstr &MI = *Op.getParent();
 197   if (MI.getOpcode() == AMDGPU::EXP)
 198     return true;
 199
 200   // For stores the stored value is also relevant
 201   if (!MI.getDesc().mayStore())
 202     return false;
 203
 204   // Check if this operand is the value being stored.
 205   // Special case for DS instructions, since the address
 206   // operand comes before the value operand and it may have
 207   // multiple data operands.
 208
 209   if (TII->isDS(MI)) {
 210     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
 211     if (Data && Op.isIdenticalTo(*Data))
 212       return true;
 213
 214     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
 215     if (Data0 && Op.isIdenticalTo(*Data0))
 216       return true;
 217
 218     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
 219     if (Data1 && Op.isIdenticalTo(*Data1))
 220       return true;
 221
 222     return false;
 223   }
 224
 225   // NOTE: This assumes that the value operand is before the
 226   // address operand, and that there is only one value operand.
 227   for (MachineInstr::mop_iterator I = MI.operands_begin(),
 228        E = MI.operands_end(); I != E; ++I) {
 229
 230     if (I->isReg() && I->isUse())
 231       return Op.isIdenticalTo(*I);
 232   }
 233
 234   return false;
 235 }
 236
 237 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 238                                           const MachineOperand &Reg) const {
 239   unsigned Size = RC->getSize();
 240   assert(Size >= 4);
 241
 242   RegInterval Result;
 243   Result.first = TRI->getEncodingValue(Reg.getReg());
 244   Result.second = Result.first + Size / 4;
 245
 246   return Result;
 247 }
 248
 249 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
 250                                     MachineBasicBlock::iterator I) {
 251
 252   // Get the hardware counter increments and sum them up
 253   Counters Increment = getHwCounts(*I);
 254   Counters Limit = ZeroCounts;
 255   unsigned Sum = 0;
 256
 257   for (unsigned i = 0; i < 3; ++i) {
 258     LastIssued.Array[i] += Increment.Array[i];
 259     if (Increment.Array[i])
 260       Limit.Array[i] = LastIssued.Array[i];
 261     Sum += Increment.Array[i];
 262   }
 263
 264   // If we don't increase anything then that's it
 265   if (Sum == 0) {
 266     LastOpcodeType = OTHER;
 267     return;
 268   }
 269
 270   if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
 271       AMDGPUSubtarget::VOLCANIC_ISLANDS) {
 272     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
 273     // or SMEM clause, respectively.
 274     //
 275     // The temporary workaround is to break the clauses with S_NOP.
 276     //
 277     // The proper solution would be to allocate registers such that all source
 278     // and destination registers don't overlap, e.g. this is illegal:
 279     //   r0 = load r2
 280     //   r2 = load r0
 281     if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
 282         (LastOpcodeType == VMEM && Increment.Named.VM)) {
 283       // Insert a NOP to break the clause.
 284       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
 285           .addImm(0);
 286       LastInstWritesM0 = false;
 287     }
 288
 289     if (TII->isSMRD(*I))
 290       LastOpcodeType = SMEM;
 291     else if (Increment.Named.VM)
 292       LastOpcodeType = VMEM;
 293   }
 294
 295   // Remember which export instructions we have seen
 296   if (Increment.Named.EXP) {
 297     ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
 298   }
 299
 300   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 301     MachineOperand &Op = I->getOperand(i);
 302     if (!isOpRelevant(Op))
 303       continue;
 304
 305     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
 306     RegInterval Interval = getRegInterval(RC, Op);
 307     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 308
 309       // Remember which registers we define
 310       if (Op.isDef())
 311         DefinedRegs[j] = Limit;
 312
 313       // and which one we are using
 314       if (Op.isUse())
 315         UsedRegs[j] = Limit;
 316     }
 317   }
 318 }
 319
 320 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 321                                MachineBasicBlock::iterator I,
 322                                const Counters &Required) {
 323
 324   // End of program? No need to wait on anything
 325   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
 326     return false;
 327
 328   // Figure out if the async instructions execute in order
 329   bool Ordered[3];
 330
 331   // VM_CNT is always ordered
 332   Ordered[0] = true;
 333
 334   // EXP_CNT is unordered if we have both EXP & VM-writes
 335   Ordered[1] = ExpInstrTypesSeen == 3;
 336
 337   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
 338   Ordered[2] = false;
 339
 340   // The values we are going to put into the S_WAITCNT instruction
 341   Counters Counts = WaitCounts;
 342
 343   // Do we really need to wait?
 344   bool NeedWait = false;
 345
 346   for (unsigned i = 0; i < 3; ++i) {
 347
 348     if (Required.Array[i] <= WaitedOn.Array[i])
 349       continue;
 350
 351     NeedWait = true;
 352
 353     if (Ordered[i]) {
 354       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 355
 356       // Adjust the value to the real hardware possibilities.
 357       Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
 358
 359     } else
 360       Counts.Array[i] = 0;
 361
 362     // Remember on what we have waited on.
 363     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 364   }
 365
 366   if (!NeedWait)
 367     return false;
 368
 369   // Reset EXP_CNT instruction types
 370   if (Counts.Named.EXP == 0)
 371     ExpInstrTypesSeen = 0;
 372
 373   // Build the wait instruction
 374   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 375           .addImm((Counts.Named.VM & 0xF) |
 376                   ((Counts.Named.EXP & 0x7) << 4) |
 377                   ((Counts.Named.LGKM & 0x7) << 8));
 378
 379   LastOpcodeType = OTHER;
 380   LastInstWritesM0 = false;
 381   return true;
 382 }
 383
 384 /// \brief helper function for handleOperands
 385 static void increaseCounters(Counters &Dst, const Counters &Src) {
 386
 387   for (unsigned i = 0; i < 3; ++i)
 388     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 389 }
 390
 391 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 392
 393   Counters Result = ZeroCounts;
 394
 395   // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 396   // but we also want to wait for any other outstanding transfers before
 397   // signalling other hardware blocks
 398   if (MI.getOpcode() == AMDGPU::S_SENDMSG)
 399     return LastIssued;
 400
 401   // For each register affected by this instruction increase the result
 402   // sequence.
 403   //
 404   // TODO: We could probably just look at explicit operands if we removed VCC /
 405   // EXEC from SMRD dest reg classes.
 406   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 407     MachineOperand &Op = MI.getOperand(i);
 408     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 409       continue;
 410
 411     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
 412     RegInterval Interval = getRegInterval(RC, Op);
 413     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 414
 415       if (Op.isDef()) {
 416         increaseCounters(Result, UsedRegs[j]);
 417         increaseCounters(Result, DefinedRegs[j]);
 418       }
 419
 420       if (Op.isUse())
 421         increaseCounters(Result, DefinedRegs[j]);
 422     }
 423   }
 424
 425   return Result;
 426 }
 427
 428 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
 429                                   MachineBasicBlock::iterator I) {
 430   if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <
 431       AMDGPUSubtarget::VOLCANIC_ISLANDS)
 432     return;
 433
 434   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
 435   if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
 436     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
 437     LastInstWritesM0 = false;
 438     return;
 439   }
 440
 441   // Set whether this instruction sets M0
 442   LastInstWritesM0 = false;
 443
 444   unsigned NumOperands = I->getNumOperands();
 445   for (unsigned i = 0; i < NumOperands; i++) {
 446     const MachineOperand &Op = I->getOperand(i);
 447
 448     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
 449       LastInstWritesM0 = true;
 450   }
 451 }
 452
 453 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 454 // around other non-memory instructions.
 455 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 456   bool Changes = false;
 457
 458   TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
 459   TRI =
 460       static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 461
 462   MRI = &MF.getRegInfo();
 463
 464   WaitedOn = ZeroCounts;
 465   LastIssued = ZeroCounts;
 466   LastOpcodeType = OTHER;
 467   LastInstWritesM0 = false;
 468
 469   memset(&UsedRegs, 0, sizeof(UsedRegs));
 470   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 471
 472   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 473        BI != BE; ++BI) {
 474
 475     MachineBasicBlock &MBB = *BI;
 476     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 477          I != E; ++I) {
 478
 479       // Wait for everything before a barrier.
 480       if (I->getOpcode() == AMDGPU::S_BARRIER)
 481         Changes |= insertWait(MBB, I, LastIssued);
 482       else
 483         Changes |= insertWait(MBB, I, handleOperands(*I));
 484
 485       pushInstruction(MBB, I);
 486       handleSendMsg(MBB, I);
 487     }
 488
 489     // Wait for everything at the end of the MBB
 490     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
 491   }
 492
 493   return Changes;
 494 }