X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FAMDGPUISelLowering.cpp;h=948533bc09f07cae300cad99533223c25703636f;hb=8bd9405026b50394e173a4b3159aacd841efe564;hp=9891ad32fa7c29c38a849d8ec8e12f3e60297db7;hpb=68e132866236f5d59271d2c7ffb77a9c8e743752;p=oota-llvm.git

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 9891ad32fa7..948533bc09f 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -15,24 +15,80 @@
 #include "AMDGPUISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDILIntrinsicInfo.h"
+#include "R600MachineFunctionInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 
 using namespace llvm;
+
+namespace {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  const Twine &Description;
+  const Function &Fn;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+                            DiagnosticSeverity Severity = DS_Error)
+    : DiagnosticInfo(getKindID(), Severity),
+      Description(Desc),
+      Fn(Fn) { }
+
+  const Function &getFunction() const { return Fn; }
+  const Twine &getDescription() const { return Description; }
+
+  void print(DiagnosticPrinter &DP) const override {
+    DP << "unsupported " << getDescription() << " in " << Fn.getName();
+  }
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+}
+
+
+static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
+                          CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
+                                        ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+  return true;
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   TargetLowering(TM, new TargetLoweringObjectFileELF()) {
 
+  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
+
   // Initialize target lowering borrowed from AMDIL
   InitAMDILLowering();
 
@@ -48,6 +104,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FABS, MVT::f32, Legal);
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT, MVT::f32, Legal);
+  setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 
   // The hardware supports ROTR, but not ROTL
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -57,48 +115,218 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  // Custom lowering of vector stores is required for local address space
+  // stores.
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  // XXX: Native v2i32 local address space stores are possible, but not
+  // currently implemented.
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+
+  // XXX: This can be changed to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
   setOperationAction(ISD::LOAD, MVT::f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+
+  setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
+
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
 
+  setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
+  setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
 
-  int types[] = {
-    (int)MVT::v2i32,
-    (int)MVT::v4i32
+  static const MVT::SimpleValueType IntTypes[] = {
+    MVT::v2i32, MVT::v4i32
   };
-  size_t NumTypes = sizeof(types) / sizeof(*types);
+  const size_t NumIntTypes = array_lengthof(IntTypes);
 
-  for (unsigned int x = 0; x < NumTypes; ++x) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+  for (unsigned int x = 0; x < NumIntTypes; ++x) {
+    MVT::SimpleValueType VT = IntTypes[x];
     //Expand the following operations for the current type by default
     setOperationAction(ISD::ADD, VT, Expand);
     setOperationAction(ISD::AND, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
     setOperationAction(ISD::MUL, VT, Expand);
     setOperationAction(ISD::OR, VT, Expand);
     setOperationAction(ISD::SHL, VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
     setOperationAction(ISD::SRL, VT, Expand);
     setOperationAction(ISD::SRA, VT, Expand);
     setOperationAction(ISD::SUB, VT, Expand);
     setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::XOR, VT, Expand);
   }
+
+  static const MVT::SimpleValueType FloatTypes[] = {
+    MVT::v2f32, MVT::v4f32
+  };
+  const size_t NumFloatTypes = array_lengthof(FloatTypes);
+
+  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+    MVT::SimpleValueType VT = FloatTypes[x];
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+  }
+
+  setTargetDAGCombine(ISD::MUL);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Information
+//===----------------------------------------------------------------------===//
+
+MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+  return MVT::i32;
+}
+
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
+
+//===---------------------------------------------------------------------===//
+// Target Properties
+//===---------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
+}
+
+bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+  // Truncate is just accessing a subregister.
+  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
+         (Dest->getPrimitiveSizeInBits() % 32 == 0);
+}
+
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+  const DataLayout *DL = getDataLayout();
+  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
+  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+
+  return SrcSize == 32 && DestSize == 64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
+  // practical purposes, the extra mov 0 to load a 64-bit value is free. As used,
+  // this will enable reducing 64-bit operations to 32-bit, which is always
+  // good.
+  return Src == MVT::i32 && Dest == MVT::i64;
+}
+
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+  // limited number of native 64-bit operations. Shrinking an operation to fit
+  // in a single 32-bit register should always be helpful. As currently used,
+  // this is much less general than the name suggests, and is only used in
+  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
+  // not profitable, and may actually be harmful.
+  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 }
 
 //===---------------------------------------------------------------------===//
@@ -125,13 +353,30 @@ SDValue AMDGPUTargetLowering::LowerReturn(
 // Target specific lowering
 //===---------------------------------------------------------------------===//
 
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const {
+  SDValue Callee = CLI.Callee;
+  SelectionDAG &DAG = CLI.DAG;
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  StringRef FuncName("");
+
+  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    FuncName = G->getGlobal()->getName();
+
+  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+  DAG.getContext()->diagnose(NoCalls);
+  return SDValue();
+}
+
 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
     const {
   switch (Op.getOpcode()) {
   default:
     Op.getNode()->dump();
-    assert(0 && "Custom lowering code for this"
-        "instruction is not implemented yet!");
+    llvm_unreachable("Custom lowering code for this"
+                     "instruction is not implemented yet!");
     break;
   // AMDIL DAG lowering
   case ISD::SDIV: return LowerSDIV(Op, DAG);
@@ -139,30 +384,176 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   // AMDGPU DAG lowering
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   }
   return Op;
 }
 
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  case ISD::SIGN_EXTEND_INREG:
+    // Different parts of legalization seem to interpret which type of
+    // sign_extend_inreg is the one to check for custom lowering. The extended
+    // from type is what really matters, but some places check for custom
+    // lowering of the result type. This results in trying to use
+    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
+    // nothing here and let the illegal result integer be handled normally.
+    return;
+
+  default:
+    return;
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
+                                                       const GlobalValue *GV,
+                                                       const SDValue &InitPtr,
+                                                       SDValue Chain,
+                                                       SelectionDAG &DAG) const {
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  SDLoc DL(InitPtr);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
+    EVT VT = EVT::getEVT(CI->getType());
+    PointerType *PtrTy = PointerType::get(CI->getType(), 0);
+    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+                 TD->getPrefTypeAlignment(CI->getType()));
+  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+    EVT VT = EVT::getEVT(CFP->getType());
+    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
+    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
+                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+                 TD->getPrefTypeAlignment(CFP->getType()));
+  } else if (Init->getType()->isAggregateType()) {
+    EVT PtrVT = InitPtr.getValueType();
+    unsigned NumElements = Init->getType()->getArrayNumElements();
+    SmallVector<SDValue, 8> Chains;
+    for (unsigned i = 0; i < NumElements; ++i) {
+      SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
+          Init->getType()->getArrayElementType()), PtrVT);
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+      Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
+                       GV, Ptr, Chain, DAG));
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                       Chains.data(), Chains.size());
+  } else {
+    Init->dump();
+    llvm_unreachable("Unhandled constant initializer");
+  }
+}
+
 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                  SDValue Op,
                                                  SelectionDAG &DAG) const {
 
   const DataLayout *TD = getTargetMachine().getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
-  // XXX: What does the value of G->getOffset() mean?
-  assert(G->getOffset() == 0 &&
+  const GlobalValue *GV = G->getGlobal();
+
+  switch (G->getAddressSpace()) {
+  default: llvm_unreachable("Global Address lowering not implemented for this "
+                            "address space");
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    // XXX: What does the value of G->getOffset() mean?
+    assert(G->getOffset() == 0 &&
          "Do not know what to do with an non-zero offset");
-  unsigned Offset = MFI->LDSSize;
-  const GlobalValue *GV = G->getGlobal();
-  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+    unsigned Offset;
+    if (MFI->LocalMemoryObjects.count(GV) == 0) {
+      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+      Offset = MFI->LDSSize;
+      MFI->LocalMemoryObjects[GV] = Offset;
+      // XXX: Account for alignment?
+      MFI->LDSSize += Size;
+    } else {
+      Offset = MFI->LocalMemoryObjects[GV];
+    }
+
+    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+    Type *EltType = GV->getType()->getElementType();
+    unsigned Size = TD->getTypeAllocSize(EltType);
+    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+
+    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+    const Constant *Init = Var->getInitializer();
+    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+    SDValue InitPtr = DAG.getFrameIndex(FI,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    SmallVector<SDNode*, 8> WorkList;
+
+    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
+         E = DAG.getEntryNode()->use_end(); I != E; ++I) {
+      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
+        continue;
+      WorkList.push_back(*I);
+    }
+    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
+    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
+         E = WorkList.end(); I != E; ++I) {
+      SmallVector<SDValue, 8> Ops;
+      Ops.push_back(Chain);
+      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
+        Ops.push_back((*I)->getOperand(i));
+      }
+      DAG.UpdateNodeOperands(*I, Ops.data(), Ops.size());
+    }
+    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
+        getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+  }
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  DAG.ExtractVectorElements(A, Args);
+  DAG.ExtractVectorElements(B, Args);
 
-  // XXX: Account for alignment?
-  MFI->LDSSize += Size;
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     Args.data(), Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  EVT VT = Op.getValueType();
+  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
+                            VT.getVectorNumElements());
 
-  return DAG.getConstant(Offset, TD->getPointerSize() == 8 ? MVT::i64 : MVT::i32);
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     Args.data(), Args.size());
+}
 
+SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL =
+      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+  assert(FIN);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
+                         Op.getValueType());
 }
 
 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -199,6 +590,30 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     case AMDGPUIntrinsic::AMDGPU_umin:
       return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfi:
+      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfm:
+      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2));
+
     case AMDGPUIntrinsic::AMDIL_round_nearest:
       return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
   }
@@ -263,7 +678,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   case ISD::SETTRUE2:
   case ISD::SETUO:
   case ISD::SETO:
-    assert(0 && "Operation should already be optimised !");
+    llvm_unreachable("Operation should already be optimised!");
   case ISD::SETULE:
   case ISD::SETULT:
   case ISD::SETOLE:
@@ -287,12 +702,249 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
     return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
   }
   case ISD::SETCC_INVALID:
-    assert(0 && "Invalid setcc condcode !");
+    llvm_unreachable("Invalid setcc condcode!");
   }
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
+                                              SelectionDAG &DAG) const {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  EVT PtrVT = Load->getBasePtr().getValueType();
+  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
+  SmallVector<SDValue, 8> Loads;
+  SDLoc SL(Op);
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
+                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+    Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+                    Load->getChain(), Ptr,
+                    MachinePointerInfo(Load->getMemOperand()->getValue()),
+                    MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+                    Load->getAlignment()));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
+                     Loads.data(), Loads.size());
+}
+
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+
+  // Byte stores are really expensive, so if possible, try to pack a 32-bit vector
+  // truncating store into an i32 store.
+  // XXX: We could also optimize other vector bitwidths.
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  EVT ElemVT = VT.getVectorElementType();
+  SDValue Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  unsigned PackedSize = MemVT.getStoreSizeInBits();
+  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
+
+  assert(Value.getValueType().getScalarSizeInBits() >= 32);
+
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
+    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
+
+    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
+    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
+
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
+    }
+  }
+
+  if (PackedSize < 32) {
+    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
+    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
+                             Store->getMemOperand()->getPointerInfo(),
+                             PackedVT,
+                             Store->isNonTemporal(), Store->isVolatile(),
+                             Store->getAlignment());
+  }
+
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      Store->getMemOperand()->getPointerInfo(),
+                      Store->isVolatile(), Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+  EVT PtrVT = Store->getBasePtr().getValueType();
+  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+  SDLoc SL(Op);
+
+  SmallVector<SDValue, 8> Chains;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Store->getValue(), DAG.getConstant(i, MVT::i32));
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+                              Store->getBasePtr(),
+                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+                                    PtrVT));
+    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                     MachinePointerInfo(Store->getMemOperand()->getValue()),
+                     MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
+                     Store->getAlignment()));
+  }
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains.data(), NumElts);
+}
+
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+  EVT VT = Op.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+
+  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
+    // We can do the extload to 32-bits, and then need to separately extend to
+    // 64-bits.
+
+    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
+                                       Load->getChain(),
+                                       Load->getBasePtr(),
+                                       MemVT,
+                                       Load->getMemOperand());
+    return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+  }
+
+  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
+    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
+    // FIXME: Copied from PPC
+    // First, load into 32 bits, then truncate to 1 bit.
+
+    SDValue Chain = Load->getChain();
+    SDValue BasePtr = Load->getBasePtr();
+    MachineMemOperand *MMO = Load->getMemOperand();
+
+    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                   BasePtr, MVT::i8, MMO);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+  }
+
+  // Lower loads of constant address space global variables
+  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(
+          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
+
+    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+        DAG.getConstant(2, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                       Load->getChain(), Ptr,
+                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+  }
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+    return SDValue();
+
+
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, MVT::i32));
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, MVT::i32));
+
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+  EVT MemEltVT = MemVT.getScalarType();
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+  }
+
+  return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+}
+
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Chain = Store->getChain();
+  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      Store->getValue().getValueType().isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+
+  EVT MemVT = Store->getMemoryVT();
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+      MemVT.bitsLT(MVT::i32)) {
+    unsigned Mask = 0;
+    if (Store->getMemoryVT() == MVT::i8) {
+      Mask = 0xff;
+    } else if (Store->getMemoryVT() == MVT::i16) {
+      Mask = 0xffff;
+    }
+    SDValue BasePtr = Store->getBasePtr();
+    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+                              DAG.getConstant(2, MVT::i32));
+    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+
+    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+                                  DAG.getConstant(0x3, MVT::i32));
+
+    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                   DAG.getConstant(3, MVT::i32));
+
+    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                    Store->getValue());
+
+    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                       MaskedValue, ShiftAmt);
+
+    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
+                                  ShiftAmt);
+    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                          DAG.getConstant(0xffffffff, MVT::i32));
+    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+  }
+  return SDValue();
+}
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
     SelectionDAG &DAG) const {
@@ -349,13 +1001,13 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                                  DAG.getConstant(-1, VT),
                                                  DAG.getConstant(0, VT),
-                                                 ISD::SETGE);
-  // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
-  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
-                                                  DAG.getConstant(0, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+                                                  Num_S_Remainder,
                                                   DAG.getConstant(-1, VT),
                                                   DAG.getConstant(0, VT),
-                                                  ISD::SETGE);
+                                                  ISD::SETUGE);
   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                                                Remainder_GE_Zero);
@@ -393,16 +1045,187 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                   Remainder_A_Den, Rem, ISD::SETEQ);
-  SDValue Ops[2];
-  Ops[0] = Div;
-  Ops[1] = Rem;
+  SDValue Ops[2] = {
+    Div,
+    Rem
+  };
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue S0 = Op.getOperand(0);
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+    return SDValue();
+
+  // f32 uint_to_fp i64
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(0, MVT::i32));
+  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(1, MVT::i32));
+  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
+                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
+  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+
+}
+
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                                      unsigned BitsDiff,
+                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue Shift = DAG.getConstant(BitsDiff, VT);
+  // Shift left by 'Shift' bits.
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+  // Signed shift Right by 'Shift' bits.
+  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue Src = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // TODO: Don't scalarize on Evergreen?
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> Args;
+  DAG.ExtractVectorElements(Src, Args, 0, NElts);
+
+  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+  for (unsigned I = 0; I < NElts; ++I)
+    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+  APInt KnownZero, KnownOne;
+  EVT VT = Op.getValueType();
+  DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);
+
+  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+}
+
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // In order for this to be a signed 24-bit value, bit 23 must
+  // be a sign bit.
+  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
+                                     // as unsigned 24-bit values.
+    (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+}
+
+static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = Op.getValueType();
+
+  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    DCI.CommitTargetLoweringOpt(TLO);
+}
+
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch(N->getOpcode()) {
+  default: break;
+  case ISD::MUL: {
+    EVT VT = N->getValueType(0);
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue Mul;
+
+    // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
+    if (VT.isVector() || VT.getSizeInBits() > 32)
+      break;
+
+    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+      Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+    } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+      Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+    } else {
+      break;
+    }
+
+    // We need to use sext even for MUL_U24, because MUL_U24 is used
+    // for signed multiply of 8 and 16-bit types.
+    SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
+
+    return Reg;
+  }
+  case AMDGPUISD::MUL_I24:
+  case AMDGPUISD::MUL_U24: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    simplifyI24(N0, DCI);
+    simplifyI24(N1, DCI);
+    return SDValue();
+  }
+  }
+  return SDValue();
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
 
+void AMDGPUTargetLowering::getOriginalFunctionArgs(
+                               SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    if (Ins[i].ArgVT == Ins[i].VT) {
+      OrigIns.push_back(Ins[i]);
+      continue;
+    }
+
+    EVT VT;
+    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
+      // Vector has been split into scalars.
+      VT = Ins[i].ArgVT.getVectorElementType();
+    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
+               Ins[i].ArgVT.getVectorElementType() !=
+               Ins[i].VT.getVectorElementType()) {
+      // Vector elements have been promoted
+      VT = Ins[i].ArgVT;
+    } else {
+      // Vector has been split into smaller vectors.
+      VT = Ins[i].VT;
+    }
+
+    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
+                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
+    OrigIns.push_back(Arg);
+  }
+}
+
 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     return CFP->isExactlyValue(1.0);
@@ -459,10 +1282,78 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMIN)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
+  NODE_NAME_CASE(BFI)
+  NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(MUL_U24)
+  NODE_NAME_CASE(MUL_I24)
   NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  }
+}
+
+static void computeMaskedBitsForMinMax(const SDValue Op0,
+                                       const SDValue Op1,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth) {
+  APInt Op0Zero, Op0One;
+  APInt Op1Zero, Op1One;
+  DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
+  DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
+
+  KnownZero = Op0Zero & Op1Zero;
+  KnownOne = Op0One & Op1One;
+}
+
+void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::INTRINSIC_WO_CHAIN: {
+    // FIXME: The intrinsic should just use the node.
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    case AMDGPUIntrinsic::AMDGPU_imax:
+    case AMDGPUIntrinsic::AMDGPU_umax:
+    case AMDGPUIntrinsic::AMDGPU_imin:
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+                                 KnownZero, KnownOne, DAG, Depth);
+      break;
+    default:
+      break;
+    }
+
+    break;
+  }
+  case AMDGPUISD::SMAX:
+  case AMDGPUISD::UMAX:
+  case AMDGPUISD::SMIN:
+  case AMDGPUISD::UMIN:
+    computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+                               KnownZero, KnownOne, DAG, Depth);
+    break;
+  default:
+    break;
   }
 }