setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::UMAX);
- setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
return MVT::Other;
}
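+// Returns true for address spaces that FLAT instructions can address
+// directly: global, flat and constant memory.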
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
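+// Casts between the flat, global and constant address spaces are free,
+// since all three use the same pointer width and refer to the same
+// underlying memory; only the access instruction differs.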
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+}
+
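+// A memory operation is uniform when its pointer provably has the same
+// value in every lane of the wavefront; uniform constant loads can then be
+// selected to scalar (SMRD) instructions instead of vector memory ops.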
+bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
+ isa<GlobalValue>(Ptr))
+ return true;
+
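+ // The AMDGPUAnnotateUniformValues pass tags pointers it can prove are
+ // uniform with "amdgpu.uniform" metadata.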
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
// First check if it's a PS input addr
if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal()) {
-
- assert((PSInputNum <= 15) && "Too many PS inputs!");
+ !Arg.Flags.isByVal() && PSInputNum <= 15) {
- if (!Arg.Used) {
+ if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
// We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
}
- Info->PSInputAddr |= 1 << PSInputNum++;
+ Info->markPSInputAllocated(PSInputNum);
+ if (Arg.Used)
+ Info->PSInputEna |= 1 << PSInputNum;
+
+ ++PSInputNum;
}
// Second split vertices into their elements
*DAG.getContext());
// At least one interpolation mode must be enabled or else the GPU will hang.
+ //
+ // Check PSInputAddr instead of PSInputEna. The idea is that if the user
+ // set PSInputAddr, they want to enable some bits after compilation based
+ // on run-time states. Since we can't know what the final PSInputEna will
+ // look like, we shouldn't do anything here and the user should take
+ // responsibility for the correct programming.
if (Info->getShaderType() == ShaderType::PIXEL &&
- (Info->PSInputAddr & 0x7F) == 0) {
- Info->PSInputAddr |= 1;
+ (Info->getPSInputAddr() & 0x7F) == 0) {
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->PSInputEna |= 1;
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
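+// Graphics shaders return their outputs in the registers assigned by the
+// calling convention below; compute kernels have no return values and use
+// the generic AMDGPU lowering.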
+SDValue SITargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (Info->getShaderType() == ShaderType::COMPUTE)
+ return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
+ OutVals, DL, DAG);
+
+ SmallVector<ISD::OutputArg, 48> Splits;
+ SmallVector<SDValue, 48> SplitVals;
+
+ // Split vectors into their elements.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ const ISD::OutputArg &Out = Outs[i];
+
+ if (Out.VT.isVector()) {
+ MVT VT = Out.VT.getVectorElementType();
+ ISD::OutputArg NewOut = Out;
+ NewOut.Flags.setSplit();
+ NewOut.VT = VT;
+
+ // We want the original number of vector elements here, e.g.
+ // three or five, not four or eight.
+ unsigned NumElements = Out.ArgVT.getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
+ DAG.getConstant(j, DL, MVT::i32));
+ SplitVals.push_back(Elem);
+ Splits.push_back(NewOut);
+ NewOut.PartOffset += NewOut.VT.getStoreSize();
+ }
+ } else {
+ SplitVals.push_back(OutVals[i]);
+ Splits.push_back(Out);
+ }
+ }
+
+ // CCValAssign - represents the assignment of the return value to a location.
+ SmallVector<CCValAssign, 48> RVLocs;
+
+ // CCState - Info about the registers and stack slots.
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Analyze outgoing return values.
+ AnalyzeReturn(CCInfo, Splits);
+
+ SDValue Flag;
+ SmallVector<SDValue, 48> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, realRVLocIdx = 0;
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = SplitVals[realRVLocIdx];
+
+ // Copied from other backends.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // Update chain and glue.
+ RetOps[0] = Chain;
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
const GlobalValue *GV = GSD->getGlobal();
MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
- SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
-
- SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(1, DL, MVT::i32));
-
- SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrLo, GA);
- SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrHi, DAG.getConstant(0, DL, MVT::i32),
- SDValue(Lo.getNode(), 1));
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
switch (IntrinsicID) {
case Intrinsic::amdgcn_dispatch_ptr:
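+ // The dispatch packet pointer is only preloaded into SGPRs under the HSA
+ // ABI, so reject this intrinsic on non-HSA targets rather than miscompile.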
+ if (!Subtarget->isAmdHsaOS()) {
+ DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(),
+ "hsa intrinsic without hsa target");
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+ }
+
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
Op.getOperand(1), Op.getOperand(2), Glue);
}
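+ // The V_INTERP_* instructions read attribute data relative to M0, so the
+ // intrinsic's last operand is copied into M0 and glued to the interp node.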
+ case Intrinsic::amdgcn_interp_p1: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+ SDValue Glue = M0.getValue(1);
+ return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Glue);
+ }
+ case Intrinsic::amdgcn_interp_p2: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+ SDValue Glue = M0.getValue(1);
+ return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Glue);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
switch (Load->getAddressSpace()) {
default: break;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ break;
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ // Fall-through
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
if (NumElements >= 8)
case ISD::UINT_TO_FP: {
return performUCharToFloatCombine(N, DCI);
-
+ }
case ISD::FADD: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
break;
}
- }
case ISD::LOAD:
case ISD::STORE:
case ISD::ATOMIC_LOAD:
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (Constraint == "r") {
- switch(VT.SimpleTy) {
- default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
- case MVT::i64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
- case MVT::i32:
+
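+ // 's' constrains an operand to scalar (SGPR) registers and 'v' to vector
+ // (VGPR) registers; choose a register class of the matching width.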
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 's':
+ case 'r':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ }
+
+ case 'v':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ case 96:
+ return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ }
}
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+
+SITargetLowering::ConstraintType
+SITargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 's':
+ case 'v':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}