X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FAMDGPUISelLowering.cpp;h=948533bc09f07cae300cad99533223c25703636f;hb=8bd9405026b50394e173a4b3159aacd841efe564;hp=bde6894cec39c6a5aa0721548e74816942677d82;hpb=f54a8409f9fcf025999d8a53fac989c6d0c82e72;p=oota-llvm.git diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index bde6894cec3..948533bc09f 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -15,24 +15,69 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDILIntrinsicInfo.h" #include "R600MachineFunctionInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; } @@ -42,6 +87,8 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { + Subtarget = &TM.getSubtarget(); + // Initialize target lowering borrowed from AMDIL InitAMDILLowering(); @@ -57,6 +104,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FABS, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); // The hardware supports ROTR, but not ROTL setOperationAction(ISD::ROTL, MVT::i32, Expand); @@ -91,10 +140,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, 
Custom); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + // XXX: This can be change to Custom, once ExpandVectorStores can // handle 64-bit stores. setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + + setOperationAction(ISD::LOAD, MVT::f32, Promote); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); @@ -115,8 +170,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand); @@ -131,9 +192,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); setOperationAction(ISD::FNEG, MVT::v4f32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); @@ -164,6 +229,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::XOR, VT, Expand); } @@ -175,14 +241,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : for (unsigned int x = 0; x < NumFloatTypes; ++x) { MVT::SimpleValueType VT = FloatTypes[x]; + setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); } + + setTargetDAGCombine(ISD::MUL); } //===----------------------------------------------------------------------===// @@ -193,6 +265,18 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const { return MVT::i32; } +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, + EVT CastTy) const { + if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) + return true; + + unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + + return ((LScalarSize <= CastScalarSize) || + (CastScalarSize >= 32) || + (LScalarSize < 32)); +} 
//===---------------------------------------------------------------------===// // Target Properties @@ -208,6 +292,43 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { return VT == MVT::f32; } +bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { + // Truncate is just accessing a subregister. + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { + // Truncate is just accessing a subregister. + return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { + const DataLayout *DL = getDataLayout(); + unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); + unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); + + return SrcSize == 32 && DestSize == 64; +} + +bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { + // Any register load of a 64-bit value really requires 2 32-bit moves. For all + // practical purposes, the extra mov 0 to load a 64-bit is free. As used, + // this will enable reducing 64-bit operations the 32-bit, which is always + // good. + return Src == MVT::i32 && Dest == MVT::i64; +} + +bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a + // limited number of native 64-bit operations. Shrinking an operation to fit + // in a single 32-bit register should always be helpful. As currently used, + // this is much less general than the name suggests, and is only used in + // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is + // not profitable, and may actually be harmful. 
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; +} + //===---------------------------------------------------------------------===// // TargetLowering Callbacks //===---------------------------------------------------------------------===// @@ -232,13 +353,30 @@ SDValue AMDGPUTargetLowering::LowerReturn( // Target specific lowering //===---------------------------------------------------------------------===// +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName(""); + + if (const GlobalAddressSDNode *G = dyn_cast(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: Op.getNode()->dump(); - assert(0 && "Custom lowering code for this" - "instruction is not implemented yet!"); + llvm_unreachable("Custom lowering code for this" + "instruction is not implemented yet!"); break; // AMDIL DAG lowering case ISD::SDIV: return LowerSDIV(Op, DAG); @@ -248,50 +386,131 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) // AMDGPU DAG lowering case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); } return Op; } +void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Different parts of legalization seem to interpret which type of + // sign_extend_inreg is the one to check for custom lowering. The extended + // from type is what really matters, but some places check for custom + // lowering of the result type. This results in trying to use + // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do + // nothing here and let the illegal result integer be handled normally. 
+ return; + + default: + return; + } +} + +SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, + const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const { + const DataLayout *TD = getTargetMachine().getDataLayout(); + SDLoc DL(InitPtr); + if (const ConstantInt *CI = dyn_cast(Init)) { + EVT VT = EVT::getEVT(CI->getType()); + PointerType *PtrTy = PointerType::get(CI->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(CI->getType())); + } else if (const ConstantFP *CFP = dyn_cast(Init)) { + EVT VT = EVT::getEVT(CFP->getType()); + PointerType *PtrTy = PointerType::get(CFP->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(CFP->getType())); + } else if (Init->getType()->isAggregateType()) { + EVT PtrVT = InitPtr.getValueType(); + unsigned NumElements = Init->getType()->getArrayNumElements(); + SmallVector Chains; + for (unsigned i = 0; i < NumElements; ++i) { + SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize( + Init->getType()->getArrayElementType()), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i), + GV, Ptr, Chain, DAG)); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + Chains.data(), Chains.size()); + } else { + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); + } +} + SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, SDValue Op, SelectionDAG &DAG) const { const DataLayout *TD = getTargetMachine().getDataLayout(); GlobalAddressSDNode *G = cast(Op); + const GlobalValue *GV = G->getGlobal(); - assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS); - // XXX: What does the value of G->getOffset() mean? - assert(G->getOffset() == 0 && + switch (G->getAddressSpace()) { + default: llvm_unreachable("Global Address lowering not implemented for this " + "address space"); + case AMDGPUAS::LOCAL_ADDRESS: { + // XXX: What does the value of G->getOffset() mean? + assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); - const GlobalValue *GV = G->getGlobal(); + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? + MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } - unsigned Offset; - if (MFI->LocalMemoryObjects.count(GV) == 0) { - uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); - Offset = MFI->LDSSize; - MFI->LocalMemoryObjects[GV] = Offset; - // XXX: Account for alignment? 
- MFI->LDSSize += Size; - } else { - Offset = MFI->LocalMemoryObjects[GV]; + return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + Type *EltType = GV->getType()->getElementType(); + unsigned Size = TD->getTypeAllocSize(EltType); + unsigned Alignment = TD->getPrefTypeAlignment(EltType); + + const GlobalVariable *Var = dyn_cast(GV); + const Constant *Init = Var->getInitializer(); + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, + getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + SmallVector WorkList; + + for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), + E = DAG.getEntryNode()->use_end(); I != E; ++I) { + if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) + continue; + WorkList.push_back(*I); + } + SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); + for (SmallVector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + SmallVector Ops; + Ops.push_back(Chain); + for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { + Ops.push_back((*I)->getOperand(i)); + } + DAG.UpdateNodeOperands(*I, Ops.data(), Ops.size()); + } + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), + getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); } - - return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); -} - -void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl &Args, - unsigned Start, - unsigned Count) const { - EVT VT = Op.getValueType(); - for (unsigned i = Start, e = Start + Count; i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), - VT.getVectorElementType(), - Op, DAG.getConstant(i, MVT::i32))); } } @@ -301,28 +520,41 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - ExtractVectorElements(A, DAG, Args, 0, - A.getValueType().getVectorNumElements()); - ExtractVectorElements(B, DAG, Args, 0, - B.getValueType().getVectorNumElements()); + DAG.ExtractVectorElements(A, Args); + DAG.ExtractVectorElements(B, Args); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + Args.data(), Args.size()); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector Args; - EVT VT = Op.getValueType(); unsigned Start = cast(Op.getOperand(1))->getZExtValue(); - ExtractVectorElements(Op.getOperand(0), DAG, Args, Start, - VT.getVectorNumElements()); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + Args.data(), Args.size()); } +SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), + Op.getValueType()); +} SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -358,6 +590,30 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue 
Op, case AMDGPUIntrinsic::AMDGPU_umin: return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfi: + return DAG.getNode(AMDGPUISD::BFI, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfm: + return DAG.getNode(AMDGPUISD::BFM, DL, VT, + Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_round_nearest: return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); } @@ -422,7 +678,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETTRUE2: case ISD::SETUO: case ISD::SETO: - assert(0 && "Operation should already be optimised !"); + llvm_unreachable("Operation should already be optimised!"); case ISD::SETULE: case ISD::SETULT: case ISD::SETOLE: @@ -446,7 +702,7 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: - assert(0 && "Invalid setcc condcode !"); + llvm_unreachable("Invalid setcc condcode!"); } return Op; } @@ -470,8 +726,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, MemEltVT, Load->isVolatile(), Load->isNonTemporal(), Load->getAlignment())); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0], - Loads.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), + Loads.data(), Loads.size()); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, @@ -480,49 +736,54 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, EVT MemVT = Store->getMemoryVT(); unsigned MemBits = MemVT.getSizeInBits(); - // Byte stores are really expensive, so if possible, try to pack - // 32-bit vector truncatating store into an i32 store. - // XXX: We could also handle optimize other vector bitwidths + // Byte stores are really expensive, so if possible, try to pack 32-bit vector + // truncating store into an i32 store. + // XXX: We could also handle optimize other vector bitwidths. 
if (!MemVT.isVector() || MemBits > 32) { return SDValue(); } SDLoc DL(Op); - const SDValue &Value = Store->getValue(); + SDValue Value = Store->getValue(); EVT VT = Value.getValueType(); - const SDValue &Ptr = Store->getBasePtr(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); EVT MemEltVT = MemVT.getVectorElementType(); unsigned MemEltBits = MemEltVT.getSizeInBits(); unsigned MemNumElements = MemVT.getVectorNumElements(); - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); - SDValue Mask; - switch(MemEltBits) { - case 8: - Mask = DAG.getConstant(0xFF, PackedVT); - break; - case 16: - Mask = DAG.getConstant(0xFFFF, PackedVT); - break; - default: - llvm_unreachable("Cannot lower this vector store"); - } + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); + SDValue PackedValue; for (unsigned i = 0; i < MemNumElements; ++i) { - EVT ElemVT = VT.getVectorElementType(); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, DAG.getConstant(i, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT); - Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask); - SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT); - Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + if (i == 0) { PackedValue = Elt; } else { - PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt); + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); } } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), + Store->getMemOperand()->getPointerInfo(), Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment()); } @@ -550,20 +811,138 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, MemEltVT, Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment())); } - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts); + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains.data(), NumElts); +} + +SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT VT = Op.getValueType(); + EVT MemVT = Load->getMemoryVT(); + + if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) { + // We can do the extload to 32-bits, and then need to separately extend to + // 64-bits. + + SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32, + Load->getChain(), + Load->getBasePtr(), + MemVT, + Load->getMemOperand()); + return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); + } + + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. 
+ + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + } + + // Lower loads constant address space global variable loads + if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa( + GetUnderlyingObject(Load->getMemOperand()->getValue()))) { + + SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, + getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(2, MVT::i32)); + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); + } + + if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) + return SDValue(); + + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, MVT::i32)); + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, MVT::i32)); + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, MVT::i32)); + + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + EVT MemEltVT = MemVT.getScalarType(); + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode); + } + + return DAG.getZeroExtendInReg(Ret, DL, MemEltVT); } SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); if (Result.getNode()) { return Result; } StoreSDNode *Store = cast(Op); - if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + SDValue Chain = Store->getChain(); + if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { return SplitVectorStore(Op, DAG); } + + EVT MemVT = Store->getMemoryVT(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + MemVT.bitsLT(MVT::i32)) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue BasePtr = Store->getBasePtr(); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, DAG.getTargetConstant(0, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, 
Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); + } return SDValue(); } @@ -622,13 +1001,13 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, DAG.getConstant(-1, VT), DAG.getConstant(0, VT), - ISD::SETGE); - // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0) - SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder, - DAG.getConstant(0, VT), + ISD::SETUGE); + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, + Num_S_Remainder, DAG.getConstant(-1, VT), DAG.getConstant(0, VT), - ISD::SETGE); + ISD::SETUGE); // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, Remainder_GE_Zero); @@ -666,12 +1045,150 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2]; - Ops[0] = Div; - Ops[1] = Rem; + SDValue Ops[2] = { + Div, + Rem + }; return DAG.getMergeValues(Ops, 2, DL); } +SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue S0 = Op.getOperand(0); + SDLoc DL(Op); + if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64) + return SDValue(); + + // f32 uint_to_fp i64 + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(0, MVT::i32)); + SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(1, MVT::i32)); + SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, + DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 + return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + +} + +SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, + unsigned BitsDiff, + SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue Shift = DAG.getConstant(BitsDiff, VT); + // Shift left by 'Shift' bits. + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift); + // Signed shift Right by 'Shift' bits. + return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift); +} + +SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + EVT ExtraVT = cast(Op.getOperand(1))->getVT(); + MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); + + if (!VT.isVector()) + return SDValue(); + + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Don't scalarize on Evergreen? 
+ unsigned NElts = VT.getVectorNumElements(); + SmallVector Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); + + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size()); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + // FIXME: Add support for 24-bit multiply with 64-bit output on SI. + if (VT.isVector() || VT.getSizeInBits() > 32) + break; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + break; + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. 
+ SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); + + return Reg; + } + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + } + return SDValue(); +} //===----------------------------------------------------------------------===// // Helper functions @@ -765,7 +1282,14 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(BFE_U32) + NODE_NAME_CASE(BFE_I32) + NODE_NAME_CASE(BFI) + NODE_NAME_CASE(BFM) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -780,3 +1304,56 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } } + +static void computeMaskedBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { + APInt Op0Zero, Op0One; + APInt Op1Zero, Op1One; + DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth); + DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth); + + KnownZero = Op0Zero & Op1Zero; + KnownOne = Op0One & Op1One; +} + +void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::INTRINSIC_WO_CHAIN: { + // FIXME: The intrinsic should just use the node. + switch (cast(Op.getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::AMDGPU_imax: + case AMDGPUIntrinsic::AMDGPU_umax: + case AMDGPUIntrinsic::AMDGPU_imin: + case AMDGPUIntrinsic::AMDGPU_umin: + computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } + + break; + } + case AMDGPUISD::SMAX: + case AMDGPUISD::UMAX: + case AMDGPUISD::SMIN: + case AMDGPUISD::UMIN: + computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } +}
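
Note on the new sub-dword private store path: LowerSTORE now emits a read-modify-write of the containing 32-bit register for i8/i16 stores to the private address space, computing the dword index and byte offset from the pointer, loading the dword, clearing the target field, and ORing in the shifted value. Below is a minimal host-side sketch of that sequence, assuming a little-endian, dword-addressable register file; storeSubDword and RegFile are illustrative names, not identifiers from the patch.

#include <cstdint>
#include <cstdio>

// Scalar model of the read-modify-write sequence the new LowerSTORE path emits
// for i8/i16 stores to the private address space, where memory is only
// dword-addressable. RegFile and storeSubDword are illustrative names.
static void storeSubDword(uint32_t *RegFile, uint32_t ByteAddr,
                          uint32_t Value, unsigned BitWidth /* 8 or 16 */) {
  const uint32_t Mask = (BitWidth == 8) ? 0xff : 0xffff;
  const uint32_t DwordIdx = ByteAddr >> 2;   // Ptr      = BasePtr >> 2
  const uint32_t ByteIdx  = ByteAddr & 0x3;  // ByteIdx  = BasePtr & 3
  const uint32_t ShiftAmt = ByteIdx << 3;    // ShiftAmt = ByteIdx * 8

  uint32_t Dst = RegFile[DwordIdx];          // REGISTER_LOAD of the whole dword
  Dst &= ~(Mask << ShiftAmt);                // clear the destination field (DstMask)
  Dst |= (Value & Mask) << ShiftAmt;         // OR in the masked, shifted value
  RegFile[DwordIdx] = Dst;                   // REGISTER_STORE of the merged dword
}

int main() {
  uint32_t RegFile[2] = {0xaaaaaaaa, 0xbbbbbbbb};
  storeSubDword(RegFile, 1, 0x5c, 8);        // byte store at offset 1
  storeSubDword(RegFile, 6, 0x1234, 16);     // short store at offset 6
  printf("%08x %08x\n", RegFile[0], RegFile[1]);  // aaaa5caa 1234bbbb
  return 0;
}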
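
Note on LowerUINT_TO_FP: the patch handles f32 <- i64 by converting the two 32-bit halves separately and recombining them as FloatLo + FloatHi * 2^32 (the 4294967296.0f constant). A minimal host-side sketch of the same arithmetic follows; the helper name is illustrative, and, like the emitted DAG sequence, the result can differ from a single correctly rounded conversion by a ULP due to double rounding.

#include <cstdint>
#include <cstdio>

// Host-side model of the FloatLo + FloatHi * 2^32 recombination emitted by
// LowerUINT_TO_FP for f32 <- i64. The function name is illustrative only.
static float uint64ToFloatViaHalves(uint64_t X) {
  float Lo = static_cast<float>(static_cast<uint32_t>(X));        // UINT_TO_FP(lo32)
  float Hi = static_cast<float>(static_cast<uint32_t>(X >> 32));  // UINT_TO_FP(hi32)
  return Lo + Hi * 4294967296.0f;                                 // FADD(Lo, FMUL(Hi, 2^32))
}

int main() {
  const uint64_t Samples[] = {0u, 1u, 0xffffffffu, 0x100000000ull,
                              0x123456789abcdef0ull};
  for (uint64_t S : Samples)
    printf("%llu -> %g (direct cast: %g)\n",
           (unsigned long long)S, uint64ToFloatViaHalves(S), (float)S);
  return 0;
}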
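
Note on the new ISD::MUL combine: MUL is rewritten to MUL_U24/MUL_I24 only when known-bits analysis proves both operands fit in 24 bits. For a 32-bit operand, isU24 asks ComputeMaskedBits for at least 8 leading known-zero bits, and isI24 asks ComputeNumSignBits for at least 9 sign bits. The sketch below applies the same eligibility tests to concrete 32-bit values; the helper names are illustrative, not part of the patch.

#include <cstdint>
#include <cassert>

// Unsigned case: the top 8 bits must be zero, mirroring the known-zero-bits
// test in isU24(). Helper names are not from the patch.
static bool fitsInU24(uint32_t V) {
  return (V >> 24) == 0;
}

// Signed case: the value must survive sign-extension from 24 bits, i.e. lie in
// [-2^23, 2^23), mirroring the ComputeNumSignBits test in isI24().
static bool fitsInI24(int32_t V) {
  return V >= -(1 << 23) && V < (1 << 23);
}

int main() {
  assert(fitsInU24(0x00ffffffu) && !fitsInU24(0x01000000u));
  assert(fitsInI24(-(1 << 23)) && !fitsInI24(1 << 23));
  // When both operands pass, the combine emits MUL_U24/MUL_I24 and then
  // sign-extends the 32-bit result back to the original type.
  return 0;
}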