X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FAMDGPU%2FSIISelLowering.cpp;h=544867513d9c541720560cae67e978e4eca156e6;hb=1c18e5b909377e768aef8039d47d8a7a92a15323;hp=ab93bceb96ec5ce277cae372773177628b949a86;hpb=76eb0395686d266a7416d578867fc62ef5e4ee82;p=oota-llvm.git diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index ab93bceb96e..544867513d9 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -107,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -258,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -492,6 +492,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa(Ptr) || isa(Ptr) || isa(Ptr) || + isa(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -571,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); + !Arg.Flags.isByVal() && PSInputNum <= 15) { - if (!Arg.Used) { + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -612,11 +640,25 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -845,6 +887,97 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector Splits; + SmallVector SplitVals; + + // Split vectors into their elements. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { @@ -1083,20 +1216,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -1143,6 +1264,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); @@ -1252,6 +1380,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1315,6 +1456,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: if (NumElements >= 8) @@ -1991,7 +2140,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -2073,7 +2222,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: @@ -2425,13 +2573,41 @@ std::pair SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2452,3 +2628,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +}