lib/Target/R600/AMDGPUAsmPrinter.cpp

   1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 ///
  12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
  13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
  14 /// an MCObjectStreamer it outputs binary code.
  15 //
  16 //===----------------------------------------------------------------------===//
  17 //
  18
  19 #include "AMDGPUAsmPrinter.h"
  20 #include "AMDGPU.h"
  21 #include "AMDGPUSubtarget.h"
  22 #include "R600Defines.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "R600RegisterInfo.h"
  25 #include "SIDefines.h"
  26 #include "SIMachineFunctionInfo.h"
  27 #include "SIRegisterInfo.h"
  28 #include "llvm/CodeGen/MachineFrameInfo.h"
  29 #include "llvm/MC/MCContext.h"
  30 #include "llvm/MC/MCSectionELF.h"
  31 #include "llvm/MC/MCStreamer.h"
  32 #include "llvm/Support/ELF.h"
  33 #include "llvm/Support/MathExtras.h"
  34 #include "llvm/Support/TargetRegistry.h"
  35 #include "llvm/Target/TargetLoweringObjectFile.h"
  36
  37 using namespace llvm;
  38
  39 // TODO: This should get the default rounding mode from the kernel. We just set
  40 // the default here, but this could change if the OpenCL rounding mode pragmas
  41 // are used.
  42 //
  43 // The denormal mode here should match what is reported by the OpenCL runtime
  44 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
  45 // can also be overridden to flush via the -cl-denorms-are-zero compiler flag.
  46 //
  47 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
  48 // precision, and leaves single precision to flush all and does not report
  49 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
  50 // CL_FP_DENORM for both.
  51 //
  52 // FIXME: It seems some instructions do not support single precision denormals
  53 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
  54 // and sin_f32, cos_f32 on most parts).
  55
  56 // We want to use these instructions, and using fp32 denormals also causes
  57 // instructions to run at the double precision rate for the device so it's
  58 // probably best to just report no single precision denormals.
  59 static uint32_t getFPMode(const MachineFunction &F) {
  60   const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
  61   // TODO: Is there any real use for the flush in only / flush out only modes?
  62
  63   uint32_t FP32Denormals =
  64     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  65
  66   uint32_t FP64Denormals =
  67     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  68
  69   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
  70          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
  71          FP_DENORM_MODE_SP(FP32Denormals) |
  72          FP_DENORM_MODE_DP(FP64Denormals);
  73 }
  74
  75 static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
  76                                               MCStreamer &Streamer) {
  77   return new AMDGPUAsmPrinter(tm, Streamer);
  78 }
  79
  80 extern "C" void LLVMInitializeR600AsmPrinter() {
  81   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  82 }
  83
  84 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
  85     : AsmPrinter(TM, Streamer) {
  86   DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
  87 }
  88
  89 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  90
  91   // This label is used to mark the end of the .text section.
  92   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
  93   OutStreamer.SwitchSection(TLOF.getTextSection());
  94   MCSymbol *EndOfTextLabel =
  95       OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
  96   OutStreamer.EmitLabel(EndOfTextLabel);
  97 }
  98
  99 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 100   SetupMachineFunction(MF);
 101
 102   OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
 103
 104   MCContext &Context = getObjFileLowering().getContext();
 105   const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
 106                                               ELF::SHT_PROGBITS, 0,
 107                                               SectionKind::getReadOnly());
 108   OutStreamer.SwitchSection(ConfigSection);
 109
 110   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 111   SIProgramInfo KernelInfo;
 112   if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
 113     getSIProgramInfo(KernelInfo, MF);
 114     EmitProgramInfoSI(MF, KernelInfo);
 115   } else {
 116     EmitProgramInfoR600(MF);
 117   }
 118
 119   DisasmLines.clear();
 120   HexLines.clear();
 121   DisasmLineMaxLen = 0;
 122
 123   OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
 124   EmitFunctionBody();
 125
 126   if (isVerbose()) {
 127     const MCSectionELF *CommentSection
 128       = Context.getELFSection(".AMDGPU.csdata",
 129                               ELF::SHT_PROGBITS, 0,
 130                               SectionKind::getReadOnly());
 131     OutStreamer.SwitchSection(CommentSection);
 132
 133     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 134       OutStreamer.emitRawComment(" Kernel info:", false);
 135       OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
 136                                  false);
 137       OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
 138                                  false);
 139       OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
 140                                  false);
 141       OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
 142                                  false);
 143       OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
 144                                  false);
 145       OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
 146                                  false);
 147     } else {
 148       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 149       OutStreamer.emitRawComment(
 150         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
 151     }
 152   }
 153
 154   if (STM.dumpCode()) {
 155 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 156     MF.dump();
 157 #endif
 158
 159     if (DisasmEnabled) {
 160       OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
 161                                                   ELF::SHT_NOTE, 0,
 162                                                   SectionKind::getReadOnly()));
 163
 164       for (size_t i = 0; i < DisasmLines.size(); ++i) {
 165         std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
 166         Comment += " ; " + HexLines[i] + "\n";
 167
 168         OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
 169         OutStreamer.EmitBytes(StringRef(Comment));
 170       }
 171     }
 172   }
 173
 174   return false;
 175 }
 176
 177 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 178   unsigned MaxGPR = 0;
 179   bool killPixel = false;
 180   const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
 181       TM.getSubtargetImpl()->getRegisterInfo());
 182   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 183   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 184
 185   for (const MachineBasicBlock &MBB : MF) {
 186     for (const MachineInstr &MI : MBB) {
 187       if (MI.getOpcode() == AMDGPU::KILLGT)
 188         killPixel = true;
 189       unsigned numOperands = MI.getNumOperands();
 190       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 191         const MachineOperand &MO = MI.getOperand(op_idx);
 192         if (!MO.isReg())
 193           continue;
 194         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
 195
 196         // Register with value > 127 aren't GPR
 197         if (HWReg > 127)
 198           continue;
 199         MaxGPR = std::max(MaxGPR, HWReg);
 200       }
 201     }
 202   }
 203
 204   unsigned RsrcReg;
 205   if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
 206     // Evergreen / Northern Islands
 207     switch (MFI->getShaderType()) {
 208     default: // Fall through
 209     case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
 210     case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
 211     case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
 212     case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
 213     }
 214   } else {
 215     // R600 / R700
 216     switch (MFI->getShaderType()) {
 217     default: // Fall through
 218     case ShaderType::GEOMETRY: // Fall through
 219     case ShaderType::COMPUTE:  // Fall through
 220     case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
 221     case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
 222     }
 223   }
 224
 225   OutStreamer.EmitIntValue(RsrcReg, 4);
 226   OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
 227                            S_STACK_SIZE(MFI->StackSize), 4);
 228   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
 229   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 230
 231   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 232     OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
 233     OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
 234   }
 235 }
 236
 237 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 238                                         const MachineFunction &MF) const {
 239   uint64_t CodeSize = 0;
 240   unsigned MaxSGPR = 0;
 241   unsigned MaxVGPR = 0;
 242   bool VCCUsed = false;
 243   const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
 244       TM.getSubtargetImpl()->getRegisterInfo());
 245
 246   for (const MachineBasicBlock &MBB : MF) {
 247     for (const MachineInstr &MI : MBB) {
 248       // TODO: CodeSize should account for multiple functions.
 249       CodeSize += MI.getDesc().Size;
 250
 251       unsigned numOperands = MI.getNumOperands();
 252       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 253         const MachineOperand &MO = MI.getOperand(op_idx);
 254         unsigned width = 0;
 255         bool isSGPR = false;
 256
 257         if (!MO.isReg()) {
 258           continue;
 259         }
 260         unsigned reg = MO.getReg();
 261         if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
 262             reg == AMDGPU::VCC_HI) {
 263           VCCUsed = true;
 264           continue;
 265         }
 266
 267         switch (reg) {
 268         default: break;
 269         case AMDGPU::SCC:
 270         case AMDGPU::EXEC:
 271         case AMDGPU::M0:
 272           continue;
 273         }
 274
 275         if (AMDGPU::SReg_32RegClass.contains(reg)) {
 276           isSGPR = true;
 277           width = 1;
 278         } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
 279           isSGPR = false;
 280           width = 1;
 281         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 282           isSGPR = true;
 283           width = 2;
 284         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 285           isSGPR = false;
 286           width = 2;
 287         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
 288           isSGPR = false;
 289           width = 3;
 290         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 291           isSGPR = true;
 292           width = 4;
 293         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 294           isSGPR = false;
 295           width = 4;
 296         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 297           isSGPR = true;
 298           width = 8;
 299         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
 300           isSGPR = false;
 301           width = 8;
 302         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
 303           isSGPR = true;
 304           width = 16;
 305         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
 306           isSGPR = false;
 307           width = 16;
 308         } else {
 309           llvm_unreachable("Unknown register class");
 310         }
 311         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
 312         unsigned maxUsed = hwReg + width - 1;
 313         if (isSGPR) {
 314           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 315         } else {
 316           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 317         }
 318       }
 319     }
 320   }
 321
 322   if (VCCUsed)
 323     MaxSGPR += 2;
 324
 325   ProgInfo.NumVGPR = MaxVGPR;
 326   ProgInfo.NumSGPR = MaxSGPR;
 327
 328   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
 329   // register.
 330   ProgInfo.FloatMode = getFPMode(MF);
 331
 332   // XXX: Not quite sure what this does, but sc seems to unset this.
 333   ProgInfo.IEEEMode = 0;
 334
 335   // Do not clamp NAN to 0.
 336   ProgInfo.DX10Clamp = 0;
 337
 338   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 339   ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
 340
 341   ProgInfo.CodeLen = CodeSize;
 342 }
 343
 344 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 345                                          const SIProgramInfo &KernelInfo) {
 346   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 347   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 348
 349   unsigned RsrcReg;
 350   switch (MFI->getShaderType()) {
 351   default: // Fall through
 352   case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
 353   case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
 354   case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
 355   case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
 356   }
 357
 358   unsigned LDSAlignShift;
 359   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 360     // LDS is allocated in 64 dword blocks.
 361     LDSAlignShift = 8;
 362   } else {
 363     // LDS is allocated in 128 dword blocks.
 364     LDSAlignShift = 9;
 365   }
 366
 367   unsigned LDSBlocks =
 368     RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
 369
 370   // Scratch is allocated in 256 dword blocks.
 371   unsigned ScratchAlignShift = 10;
 372   // We need to program the hardware with the amount of scratch memory that
 373   // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
 374   // scratch memory used per thread.
 375   unsigned ScratchBlocks =
 376     RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
 377                        1 << ScratchAlignShift) >> ScratchAlignShift;
 378
 379   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 380     OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 381
 382     const uint32_t ComputePGMRSrc1 =
 383       S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
 384       S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
 385       S_00B848_PRIORITY(KernelInfo.Priority) |
 386       S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
 387       S_00B848_PRIV(KernelInfo.Priv) |
 388       S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
 389       S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
 390       S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
 391
 392     OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
 393
 394     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
 395     const uint32_t ComputePGMRSrc2 =
 396       S_00B84C_LDS_SIZE(LDSBlocks) |
 397       S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
 398
 399     OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
 400
 401     OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
 402     OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
 403   } else {
 404     OutStreamer.EmitIntValue(RsrcReg, 4);
 405     OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
 406                              S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
 407   }
 408
 409   if (MFI->getShaderType() == ShaderType::PIXEL) {
 410     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
 411     OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
 412     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 413     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
 414   }
 415 }