X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FAMDGPU%2FSIISelLowering.cpp;h=0c678c4b73b9072f82683d1581c553f9d8d5bf16;hb=7207597936bb29378fdc9cf5f5705fb715aa7ef3;hp=b4b439c2a96a68dc0c896e0d9cbd28653db1f08c;hpb=bc68baa6944a6fd884fab3f92bedb7d730afb274;p=oota-llvm.git diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index b4b439c2a96..0c678c4b73b 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -215,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -449,6 +492,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa(Ptr) || isa(Ptr) || isa(Ptr) || + isa(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -471,7 +540,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -509,8 +578,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } - assert(CallConv == CallingConv::C); + // FIXME: We currently assume all calling conventions are kernels. SmallVector Splits; BitVector Skipped(Ins.size()); @@ -520,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); + !Arg.Flags.isByVal() && PSInputNum <= 15) { - if (!Arg.Used) { + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -561,41 +640,18 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + (Info->getPSInputAddr() & 0x7F) == 0) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -603,6 +659,25 @@ SDValue SITargetLowering::LowerFormalArguments( Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector Chains; @@ -690,10 +765,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. + if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. + Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -702,30 +880,103 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + SmallVector Splits; + SmallVector SplitVals; + + // Split vectors into their elements. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -956,20 +1207,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -989,6 +1228,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1003,6 +1254,17 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // TODO: Should this propagate fast-math-flags? switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1022,37 +1284,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. + return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1091,6 +1352,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, @@ -1106,6 +1371,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1166,16 +1444,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1397,7 +1688,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1840,7 +2131,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -1922,7 +2213,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: @@ -2144,9 +2434,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2252,15 +2547,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2278,13 +2564,41 @@ std::pair SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2305,3 +2619,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +}