S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
- S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+ // 0 = X, 1 = XY, 2 = XYZ
+ unsigned TIDIGCompCnt = 0;
+ if (MFI->hasWorkItemIDZ())
+ TIDIGCompCnt = 2;
+ else if (MFI->hasWorkItemIDY())
+ TIDIGCompCnt = 1;
+
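+  // Only request the workgroup ID / workgroup info inputs and workitem ID
+  // components the kernel actually uses.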
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
- S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
- S_00B84C_TGID_X_EN(1) |
- S_00B84C_TGID_Y_EN(1) |
- S_00B84C_TGID_Z_EN(1) |
- S_00B84C_TG_SIZE_EN(1) |
- S_00B84C_TIDIG_COMP_CNT(2) |
- S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+ S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+ S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+ S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+ S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+ S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+ S_00B84C_EXCP_EN_MSB(0) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+ S_00B84C_EXCP_EN(0);
}
static unsigned getRsrcReg(unsigned ShaderType) {
header.compute_pgm_resource_registers =
KernelInfo.ComputePGMRSrc1 |
(KernelInfo.ComputePGMRSrc2 << 32);
- header.code_properties =
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
- AMD_CODE_PROPERTY_IS_PTR64;
+ header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (MFI->hasPrivateSegmentBuffer()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+
+ if (MFI->hasDispatchPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+ if (MFI->hasQueuePtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+ if (MFI->hasKernargSegmentPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+ if (MFI->hasDispatchID())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+ if (MFI->hasFlatScratchInit())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ // TODO: Private segment size
+
+ if (MFI->hasGridWorkgroupCountX()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+ }
+
+ if (MFI->hasGridWorkgroupCountY()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+ }
+
+ if (MFI->hasGridWorkgroupCountZ()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+ }
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned ScratchOffsetReg = TRI->getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
+ SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
return true;
}
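+// Views over the full SReg_128 and SGPR_32 register sequences; the prologue
+// scans these to find an unallocated register for the scratch setup.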
+static ArrayRef<MCPhysReg> getAllSGPR128() {
+ return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
+ AMDGPU::SReg_128RegClass.getNumRegs());
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+ AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
if (!MF.getFrameInfo()->hasStackObjects())
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If we only have SGPR spills, we won't actually be using scratch memory
// since these spill to VGPRs.
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
assert(ScratchRsrcReg != AMDGPU::NoRegister);
- uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+ unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+ if (ST.isAmdHsaOS()) {
+ PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ }
+
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+ if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
+ // We should always reserve these 5 registers at the same time.
+ assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
+ "scratch wave offset and private segment buffer inconsistent");
+ return;
+ }
+
+ // We added live-ins during argument lowering, but since they were not used
+ // they were deleted. We're adding the uses now, so add them back.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+
+ if (ST.isAmdHsaOS()) {
+ MRI.addLiveIn(PreloadedPrivateBufferReg);
+ MBB.addLiveIn(PreloadedPrivateBufferReg);
+ }
+
+  // We reserved the last registers for this. Shift them down to the end of
+  // those which were actually used.
+  //
+  // FIXME: It might be safer to use a pseudoregister before replacement.
+
+  // FIXME: We should be able to eliminate unused input registers. The only
+  // ones we cannot eliminate are the resources required for scratch access.
+  // For now we skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ ScratchRsrcReg = Reg;
+ MFI->setScratchRSrcReg(ScratchRsrcReg);
+ break;
+ }
+ }
+ }
+
+ if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+      // scratch descriptor, since we haven't added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ ScratchWaveOffsetReg = Reg;
+ MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ break;
+ }
+ }
+ }
+
+ assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+    // Make sure we emit the copy for the offset first. We may have chosen to
+    // copy the buffer resource into a register that aliases the input offset
+    // register.
+    BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+  }
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+ if (ST.isAmdHsaOS()) {
+ // Insert copies from argument register.
+ assert(
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
+
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
+
+ unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
+ unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
+
+ const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+
+ BuildMI(MBB, I, DL, SMovB64, Rsrc01)
+ .addReg(Lo, RegState::Kill);
+ BuildMI(MBB, I, DL, SMovB64, Rsrc23)
+ .addReg(Hi, RegState::Kill);
+ } else {
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+    // Use relocations to get the pointer, and set up the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
- .addImm(Rsrc23 & 0xffffffff);
+  // Make the selected registers live throughout the function.
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB == &MBB)
+ continue;
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
- .addImm(Rsrc23 >> 32);
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+ }
}
void SIFrameLowering::processFunctionBeforeFrameFinalized(
Align); // Alignment
}
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+ AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- // The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
- if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers.
- else
- Info->NumUserSGPRs += 4;
-
- unsigned InputPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- unsigned InputPtrRegLo =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned InputPtrRegHi =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- CCInfo.AllocateReg(InputPtrRegLo);
- CCInfo.AllocateReg(InputPtrRegHi);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+ Splits);
+ }
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
- if (MFI->hasDispatchPtr()) {
- unsigned DispatchPtrReg
- = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
- MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
- }
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info->getShaderType() == ShaderType::COMPUTE) {
- getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
- Splits);
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
}
AnalyzeFormalArguments(CCInfo, Splits);
InVals.push_back(Val);
}
- if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
- AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
- Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("work group id x is always enabled");
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
- if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
- Info->setScratchRSrcReg(TRI);
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg
+ = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+  // Now that we've figured out where the scratch register inputs are, see if
+  // we should reserve the arguments and use them directly.
+
+ bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+ if (ST.isAmdHsaOS()) {
+ // TODO: Assume we will spill without optimizations.
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the HSA ABI, this will be the first 4 user SGPR
+ // inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+      // We tentatively reserve the last registers (skipping the last two
+      // which may contain VCC). After register allocation, we'll replace
+      // these with the ones immediately after those which were really
+      // allocated. In the prologue, copies will be inserted from the argument
+      // registers to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
+
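+  // Workitem IDs are passed in VGPR0-2; the X component is always enabled.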
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("workitem id x should always be enabled");
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
if (Chains.empty())
return Chain;
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
- *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg) // src
.addFrameIndex(FrameIndex) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(ScratchOffsetPreloadReg) // scratch_offset
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
.addMemOperand(MMO);
}
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
- *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // frame_idx
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(ScratchOffsetPreloadReg) // scratch_offset
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
.addMemOperand(MMO);
}
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
ScratchRSrcReg(AMDGPU::NoRegister),
+ ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+ DispatchPtrUserSGPR(AMDGPU::NoRegister),
+ QueuePtrUserSGPR(AMDGPU::NoRegister),
+ KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+ DispatchIDUserSGPR(AMDGPU::NoRegister),
+ FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+ PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+ WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+ PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
LDSWaveSpillSize(0),
PSInputAddr(0),
NumUserSGPRs(0),
+ NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
+ PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
DispatchID(false),
- KernargSegmentPtr(true),
+ KernargSegmentPtr(false),
FlatScratchInit(false),
GridWorkgroupCountX(false),
GridWorkgroupCountY(false),
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
+ PrivateSegmentWaveByteOffset(false),
WorkItemIDX(true),
WorkItemIDY(false),
WorkItemIDZ(false) {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
const Function *F = MF.getFunction();
- if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
- DispatchPtr = true;
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+ if (getShaderType() == ShaderType::COMPUTE)
+ KernargSegmentPtr = true;
if (F->hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
if (F->hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(this);
+ bool HasStackObjects = FrameInfo->hasStackObjects();
+
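+  // The scratch inputs are only needed if there are stack objects or if VGPR
+  // spilling may be introduced later.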
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentWaveByteOffset = true;
+
+ if (ST.isAmdHsaOS()) {
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentBuffer = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+ }
+
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+ const SIRegisterInfo &TRI) {
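+  // The buffer resource is an SReg_128, so it takes the next four consecutive
+  // user SGPRs.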
+ PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ NumUserSGPRs += 4;
+ return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+ DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+ QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return QueuePtrUserSGPR;
}
-void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
- // We need to round up to next multiple of 4.
- unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
- unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
- ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
- &AMDGPU::SReg_128RegClass);
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+ KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return KernargSegmentPtrUserSGPR;
}
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+ // FIXME: This should be removed and getPreloadedValue moved here.
+ friend struct SIRegisterInfo;
void anchor() override;
unsigned TIDReg;
+
+ // Registers that may be reserved for spilling purposes. These may be the same
+ // as the input registers.
unsigned ScratchRSrcReg;
+ unsigned ScratchWaveOffsetReg;
+
+  // Input registers set up for the HSA ABI.
+ // User SGPRs in allocation order.
+ unsigned PrivateSegmentBufferUserSGPR;
+ unsigned DispatchPtrUserSGPR;
+ unsigned QueuePtrUserSGPR;
+ unsigned KernargSegmentPtrUserSGPR;
+ unsigned DispatchIDUserSGPR;
+ unsigned FlatScratchInitUserSGPR;
+ unsigned PrivateSegmentSizeUserSGPR;
+ unsigned GridWorkGroupCountXUserSGPR;
+ unsigned GridWorkGroupCountYUserSGPR;
+ unsigned GridWorkGroupCountZUserSGPR;
+
+ // System SGPRs in allocation order.
+ unsigned WorkGroupIDXSystemSGPR;
+ unsigned WorkGroupIDYSystemSGPR;
+ unsigned WorkGroupIDZSystemSGPR;
+ unsigned WorkGroupInfoSystemSGPR;
+ unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
public:
// FIXME: Make private
std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
+ unsigned NumSystemSGPRs;
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
- // Feature bits required for inputs passed in user / system SGPRs.
+ // Feature bits required for inputs passed in user SGPRs.
+ bool PrivateSegmentBuffer : 1;
bool DispatchPtr : 1;
bool QueuePtr : 1;
bool DispatchID : 1;
bool GridWorkgroupCountY : 1;
bool GridWorkgroupCountZ : 1;
+ // Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
bool WorkGroupIDY : 1;
bool WorkGroupIDZ : 1;
bool WorkGroupInfo : 1;
+ bool PrivateSegmentWaveByteOffset : 1;
bool WorkItemIDX : 1; // Always initialized.
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
+
+ MCPhysReg getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+ }
+
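+  // System SGPRs are allocated contiguously after the user SGPRs.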
+ MCPhysReg getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+ }
+
public:
struct SpilledReg {
unsigned VGPR;
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ // Add user SGPRs.
+ unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+ unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+ unsigned addQueuePtr(const SIRegisterInfo &TRI);
+ unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+
+ // Add system SGPRs.
+ unsigned addWorkGroupIDX() {
+ WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDXSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDY() {
+ WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDYSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDZ() {
+ WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDZSystemSGPR;
+ }
+
+ unsigned addWorkGroupInfo() {
+ WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupInfoSystemSGPR;
+ }
+
+ unsigned addPrivateSegmentWaveByteOffset() {
+ PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
+ bool hasPrivateSegmentBuffer() const {
+ return PrivateSegmentBuffer;
+ }
+
bool hasDispatchPtr() const {
return DispatchPtr;
}
return WorkGroupInfo;
}
+ bool hasPrivateSegmentWaveByteOffset() const {
+ return PrivateSegmentWaveByteOffset;
+ }
+
bool hasWorkItemIDX() const {
return WorkItemIDX;
}
return WorkItemIDZ;
}
+ unsigned getNumUserSGPRs() const {
+ return NumUserSGPRs;
+ }
+
+ unsigned getNumPreloadedSGPRs() const {
+ return NumUserSGPRs + NumSystemSGPRs;
+ }
+
+ unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
/// \brief Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
unsigned getScratchRSrcReg() const {
return ScratchRSrcReg;
}
- void setScratchRSrcReg(const SIRegisterInfo *TRI);
+ void setScratchRSrcReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchRSrcReg = Reg;
+ }
+
+ unsigned getScratchWaveOffsetReg() const {
+ return ScratchWaveOffsetReg;
+ }
+
+ void setScratchWaveOffsetReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchWaveOffsetReg = Reg;
+ }
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
Reserved.set(*R);
}
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
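+    // With the SGPR init bug the number of usable SGPRs is fixed, so count
+    // back from that limit rather than from the end of the register file.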
+ unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+ // next sgpr128 down.
+ return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+ }
+
+ return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Next register before reservations for flat_scr and vcc.
+ return AMDGPU::SGPR97;
+ }
+
+ return AMDGPU::SGPR95;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+ // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+ reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+ }
+
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
- unsigned ScratchOffsetPreloadReg
- = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- // We will need to use this user SGPR argument for spilling, and thus never
- // want it to be spilled.
- reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
-
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);
- assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
return Reserved;
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
- bool IsKill = (i == e - 1);
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.addReg(SubReg, getDefRegState(IsLoad))
- .addReg(ScratchRsrcReg, getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg)
.addReg(SOffset)
.addImm(Offset)
.addImm(0) // glc
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Value) {
case SIRegisterInfo::WORKGROUP_ID_X:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+ assert(MFI->hasWorkGroupIDX());
+ return MFI->WorkGroupIDXSystemSGPR;
case SIRegisterInfo::WORKGROUP_ID_Y:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+ assert(MFI->hasWorkGroupIDY());
+ return MFI->WorkGroupIDYSystemSGPR;
case SIRegisterInfo::WORKGROUP_ID_Z:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+ assert(MFI->hasWorkGroupIDZ());
+ return MFI->WorkGroupIDZSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
- if (MFI->getShaderType() != ShaderType::COMPUTE)
- return MFI->ScratchOffsetReg;
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+ return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- llvm_unreachable("currently unused");
+ assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
- return ST.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
+ assert(MFI->hasKernargSegmentPtr());
+ return MFI->KernargSegmentPtrUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
- return AMDGPU::SGPR0_SGPR1;
+ return MFI->DispatchPtrUserSGPR;
case SIRegisterInfo::QUEUE_PTR:
llvm_unreachable("not implemented");
case SIRegisterInfo::WORKITEM_ID_X:
+ assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
case SIRegisterInfo::WORKITEM_ID_Y:
+ assert(MFI->hasWorkItemIDY());
return AMDGPU::VGPR1;
case SIRegisterInfo::WORKITEM_ID_Z:
+ assert(MFI->hasWorkItemIDZ());
return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");
public:
SIRegisterInfo();
+ /// Return the end register initially reserved for the scratch buffer in case
+ /// spilling is needed.
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+ /// Return the end register initially reserved for the scratch wave offset in
+ /// case spilling is needed.
+ unsigned reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureSetLimit(const MachineFunction &MF,
; HSA: .amdgpu_hsa_kernel simple
; HSA: {{^}}simple:
; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: .end_amd_kernel_code_t
-; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; Make sure we are setting the ATC bit:
; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
; ALL-LABEL: {{^}}large_alloca_compute_shader:
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s14, -1
-; CI: s_mov_b32 s15, 0x80f000
-; VI: s_mov_b32 s15, 0x800000
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
; GCNHSA: .amd_kernel_code_t
+
+; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
+; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
+; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
+; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0
+; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+
+; GCNHSA: enable_sgpr_private_segment_buffer = 1
+; GCNHSA: enable_sgpr_dispatch_ptr = 0
+; GCNHSA: enable_sgpr_queue_ptr = 0
+; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
+; GCNHSA: enable_sgpr_dispatch_id = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_private_segment_size = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCNHSA: workitem_private_segment_byte_size = 0
; GCNHSA: private_segment_alignment = 4
; GCNHSA: .end_amd_kernel_code_t
-; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCNHSA: s_mov_b32 s10, -1
-; CIHSA: s_mov_b32 s11, 0x180f000
-; VIHSA: s_mov_b32 s11, 0x11800000
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
; Scratch size = alloca size + emergency stack slot
; ALL: ; ScratchSize: 32772
; CI: s_mov_b32 s11, 0x80f000
; VI: s_mov_b32 s11, 0x800000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; ALL: ; ScratchSize: 32772
define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
; CI: s_mov_b32 s11, 0x80f000
; VI: s_mov_b32 s11, 0x800000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; ALL: ; ScratchSize: 32772
define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+define void @read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}read_workdim_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOT: 0xff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ %shl = shl i32 %dim, 24
+ %shr = lshr i32 %shl, 24
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { readnone }
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
-; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
define void @test(i32 addrspace(1)* %out) {
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_debug_value:
-; CHECK: s_load_dwordx2
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1
+; CHECK: s_load_dwordx2 s[4:5]
+; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
; CHECK: buffer_store_dword
; CHECK: s_endpgm
define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
+
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xy:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xy(i32 addrspace(1)* %out) {
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %val = mul i32 %x, %y
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xz:
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xz(i32 addrspace(1)* %out) {
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %val = mul i32 %x, %z
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_yz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_yz(i32 addrspace(1)* %out) {
+entry:
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %val = mul i32 %y, %z
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xyz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xyz(i32 addrspace(1)* %out) {
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %xy = mul i32 %x, %y
+ %xyz = add i32 %xy, %z
+ store i32 %xyz, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_x_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %size = call i32 @llvm.r600.read.local.size.x() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %size = call i32 @llvm.r600.read.local.size.y() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %size = call i32 @llvm.r600.read.local.size.z() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+attributes #0 = { nounwind readnone }
; EG: .long 166120
; EG-NEXT: .long 8
; GCN: .long 47180
-; GCN-NEXT: .long 38792
+; GCN-NEXT: .long 32900
; EG: {{^}}local_memory_two_objects:
; EG: .long 166120
; EG-NEXT: .long 128
; SI: .long 47180
-; SI-NEXT: .long 71560
+; SI-NEXT: .long 65668
; CI: .long 47180
-; CI-NEXT: .long 38792
+; CI-NEXT: .long 32900
; FUNC-LABEL: {{^}}local_memory:
; GCN-LABEL: {{^}}spill_vgpr_compute:
+; GCN: s_mov_b32 s16, s3
; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_mov_b32 s15, 0x80f000
; VI-NEXT: s_mov_b32 s15, 0x800000
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s11, 0x80f000
-; VI-NEXT: s_mov_b32 s11, 0x800000
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
; s11 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].X
-; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; HSA: .amd_kernel_code_t
+
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; HSA: .end_amd_kernel_code_t
+
+
+; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
define void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].Y
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].Z
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].W
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].X
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].Y
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
ret void
}
-; FUNC-LABEL: {{^}}local_size_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].Z
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_x (i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.local.size.x() #0
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].W
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_y (i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.local.size.y() #0
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].X
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z (i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.local.size.z() #0
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}get_work_dim:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @get_work_dim (i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.AMDGPU.read.workdim() #0
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; The tgid values are stored in sgprs offset by the number of user sgprs.
-; Currently we always use exactly 2 user sgprs for the pointer to the
-; kernel arguments, but this may change in the future.
+; The tgid values are stored in sgprs offset by the number of user
+; sgprs.
; FUNC-LABEL: {{^}}tgid_x:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; HSA: .amd_kernel_code_t
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; HSA: .end_amd_kernel_code_t
+
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_x (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}tgid_y:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 1
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_y (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}tgid_z:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 1
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_z (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 132{{$}}
+
; FUNC-LABEL: {{^}}tidig_x:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
; GCN: buffer_store_dword v0
-define void @tidig_x (i32 addrspace(1)* %out) {
+define void @tidig_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 2180{{$}}
+
; FUNC-LABEL: {{^}}tidig_y:
+
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
; GCN: buffer_store_dword v1
-define void @tidig_y (i32 addrspace(1)* %out) {
+define void @tidig_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 4228{{$}}
+
; FUNC-LABEL: {{^}}tidig_z:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
; GCN: buffer_store_dword v2
-define void @tidig_z (i32 addrspace(1)* %out) {
+define void @tidig_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}local_size_x_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
-entry:
- %size = call i32 @llvm.r600.read.local.size.x() #0
- %shl = shl i32 %size, 16
- %shr = lshr i32 %shl, 16
- store i32 %shr, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_y_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
-entry:
- %size = call i32 @llvm.r600.read.local.size.y() #0
- %shl = shl i32 %size, 16
- %shr = lshr i32 %shl, 16
- store i32 %shr, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_z_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
-entry:
- %size = call i32 @llvm.r600.read.local.size.z() #0
- %shl = shl i32 %size, 16
- %shr = lshr i32 %shl, 16
- store i32 %shr, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}get_work_dim_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN-NOT: 0xff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @get_work_dim_known_bits(i32 addrspace(1)* %out) {
-entry:
- %dim = call i32 @llvm.AMDGPU.read.workdim() #0
- %shl = shl i32 %dim, 24
- %shr = lshr i32 %shl, 24
- store i32 %shr, i32 addrspace(1)* %out
- ret void
-}
-
declare i32 @llvm.r600.read.ngroups.x() #0
declare i32 @llvm.r600.read.ngroups.y() #0
declare i32 @llvm.r600.read.ngroups.z() #0
declare i32 @llvm.r600.read.global.size.y() #0
declare i32 @llvm.r600.read.global.size.z() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-declare i32 @llvm.r600.read.local.size.y() #0
-declare i32 @llvm.r600.read.local.size.z() #0
-
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
declare i32 @llvm.r600.read.tgid.z() #0