X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FCodeGen%2FBasicTargetTransformInfo.cpp;h=7f31b1a982fc107996a7d0baae40c85c264783bb;hb=33b37c7b12314d292c9074806f38aeaa6a707085;hp=3892cc4dd551fdf71258013c5ce6b17acc911050;hpb=14925e6b885f8bd8cf448627386d412831f4bf1b;p=oota-llvm.git

diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 3892cc4dd55..7f31b1a982f 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -15,41 +15,46 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "basictti"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <utility>
-
 using namespace llvm;
 
+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
+#define DEBUG_TYPE "basictti"
+
 namespace {
 
-class BasicTTI : public ImmutablePass, public TargetTransformInfo {
-  const TargetLowering *TLI;
+class BasicTTI final : public ImmutablePass, public TargetTransformInfo {
+  const TargetMachine *TM;
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
+
 public:
-  BasicTTI() : ImmutablePass(ID), TLI(0) {
+  BasicTTI() : ImmutablePass(ID), TM(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
-  BasicTTI(const TargetLowering *TLI) : ImmutablePass(ID), TLI(TLI) {
+  BasicTTI(const TargetMachine *TM) : ImmutablePass(ID), TM(TM) {
     initializeBasicTTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() {
+  void initializePass() override {
     pushTTIStack(this);
   }
 
-  virtual void finalizePass() {
-    popTTIStack();
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
@@ -57,50 +62,61 @@ public:
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo*)this;
     return this;
   }
 
+  bool hasBranchDivergence() const override;
+
   /// \name Scalar TTI Implementations
   /// @{
 
-  virtual bool isLegalAddImmediate(int64_t imm) const;
-  virtual bool isLegalICmpImmediate(int64_t imm) const;
-  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                                     int64_t BaseOffset, bool HasBaseReg,
-                                     int64_t Scale) const;
-  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
-  virtual bool isTypeLegal(Type *Ty) const;
-  virtual unsigned getJumpBufAlignment() const;
-  virtual unsigned getJumpBufSize() const;
-  virtual bool shouldBuildLookupTables() const;
+  bool isLegalAddImmediate(int64_t imm) const override;
+  bool isLegalICmpImmediate(int64_t imm) const override;
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+                             int64_t BaseOffset, bool HasBaseReg,
+                             int64_t Scale) const override;
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                           int64_t BaseOffset, bool HasBaseReg,
+                           int64_t Scale) const override;
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTypeLegal(Type *Ty) const override;
+  unsigned getJumpBufAlignment() const override;
+  unsigned getJumpBufSize() const override;
+  bool shouldBuildLookupTables() const override;
+  bool haveFastSqrt(Type *Ty) const override;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
   /// \name Vector TTI Implementations
   /// @{
 
-  virtual unsigned getNumberOfRegisters(bool Vector) const;
-  virtual unsigned getMaximumUnrollFactor() const;
-  virtual unsigned getRegisterBitWidth(bool Vector) const;
-  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
-  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                                  int Index, Type *SubTp) const;
-  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const;
-  virtual unsigned getCFInstrCost(unsigned Opcode) const;
-  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const;
-  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const;
-  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const;
-  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
-                                         ArrayRef<Type*> Tys) const;
-  virtual unsigned getNumberOfParts(Type *Tp) const;
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getMaximumUnrollFactor() const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+                                  OperandValueKind) const override;
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const override;
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const override;
+  unsigned getCFInstrCost(unsigned Opcode) const override;
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) const override;
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) const override;
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+  unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
+                                 ArrayRef<Type*> Tys) const override;
+  unsigned getNumberOfParts(Type *Tp) const override;
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+  unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                            bool IsPairwise) const override;
 
   /// @}
 };
 
@@ -112,53 +128,128 @@ INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti",
 char BasicTTI::ID = 0;
 
 ImmutablePass *
-llvm::createBasicTargetTransformInfoPass(const TargetLowering *TLI) {
-  return new BasicTTI(TLI);
+llvm::createBasicTargetTransformInfoPass(const TargetMachine *TM) {
+  return new BasicTTI(TM);
 }
 
+bool BasicTTI::hasBranchDivergence() const { return false; }
+
 bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
-  return TLI->isLegalAddImmediate(imm);
+  return getTLI()->isLegalAddImmediate(imm);
 }
 
 bool BasicTTI::isLegalICmpImmediate(int64_t imm) const {
-  return TLI->isLegalICmpImmediate(imm);
+  return getTLI()->isLegalICmpImmediate(imm);
 }
 
 bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale) const {
-  TargetLowering::AddrMode AM;
+  TargetLoweringBase::AddrMode AM;
   AM.BaseGV = BaseGV;
   AM.BaseOffs = BaseOffset;
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Scale;
-  return TLI->isLegalAddressingMode(AM, Ty);
+  return getTLI()->isLegalAddressingMode(AM, Ty);
+}
+
+int BasicTTI::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                   int64_t BaseOffset, bool HasBaseReg,
+                                   int64_t Scale) const {
+  TargetLoweringBase::AddrMode AM;
+  AM.BaseGV = BaseGV;
+  AM.BaseOffs = BaseOffset;
+  AM.HasBaseReg = HasBaseReg;
+  AM.Scale = Scale;
+  return getTLI()->getScalingFactorCost(AM, Ty);
 }
 
 bool BasicTTI::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  return TLI->isTruncateFree(Ty1, Ty2);
+  return getTLI()->isTruncateFree(Ty1, Ty2);
 }
 
 bool BasicTTI::isTypeLegal(Type *Ty) const {
-  EVT T = TLI->getValueType(Ty);
-  return TLI->isTypeLegal(T);
+  EVT T = getTLI()->getValueType(Ty);
+  return getTLI()->isTypeLegal(T);
 }
 
 unsigned BasicTTI::getJumpBufAlignment() const {
-  return TLI->getJumpBufAlignment();
+  return getTLI()->getJumpBufAlignment();
 }
 
 unsigned BasicTTI::getJumpBufSize() const {
-  return TLI->getJumpBufSize();
+  return getTLI()->getJumpBufSize();
 }
 
 bool BasicTTI::shouldBuildLookupTables() const {
+  const TargetLoweringBase *TLI = getTLI();
   return TLI->supportJumpTables() &&
       (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
        TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
 }
 
+bool BasicTTI::haveFastSqrt(Type *Ty) const {
+  const TargetLoweringBase *TLI = getTLI();
+  EVT VT = TLI->getValueType(Ty);
+  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
+}
+
+void BasicTTI::getUnrollingPreferences(Loop *L,
+                                       UnrollingPreferences &UP) const {
+  // This unrolling functionality is target independent, but to provide some
+  // motivation for its intended use, for x86:
+
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  // The number of taken branches in a loop is hard to estimate here, and
+  // benchmarking has revealed that it is better not to be conservative when
+  // estimating the branch count. As a result, we'll ignore the branch limits
+  // until someone finds a case where it matters in practice.
+
+  unsigned MaxOps;
+  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+    MaxOps = PartialUnrollingThreshold;
+  else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+    MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+  else
+    return;
+
+  // Scan the loop: don't unroll loops with calls.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+        ImmutableCallSite CS(J);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!TopTTI->isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Calls used by the vectorizers.
 //
 //===----------------------------------------------------------------------===//
@@ -192,27 +283,35 @@ unsigned BasicTTI::getMaximumUnrollFactor() const {
   return 1;
 }
 
-unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
+unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const {
   // Check if any of the operands are vector operands.
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
+  bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
+  // Assume that floating point arithmetic operations cost twice as much as
+  // integer operations.
+  unsigned OpCost = (IsFloat ? 2 : 1);
+
   if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
     // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that thre is some
+    // If the type is split to multiple registers, assume that there is some
    // overhead to this.
     // TODO: Once we have extract/insert subvector cost we need to use them.
     if (LT.first > 1)
-      return LT.first * 2;
-    return LT.first * 1;
+      return LT.first * 2 * OpCost;
+    return LT.first * 1 * OpCost;
   }
 
   if (!TLI->isOperationExpand(ISD, LT.second)) {
     // If the operation is custom lowered then assume
     // that the code is twice as expensive.
-    return LT.first * 2;
+    return LT.first * 2 * OpCost;
   }
 
   // Else, assume that we need to scalarize this op.
@@ -225,7 +324,7 @@ unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
   }
 
   // We don't know anything about this scalar instruction.
-  return 1;
+  return OpCost;
 }
 
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
@@ -235,12 +334,35 @@ unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
 unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                     Type *Src) const {
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
   std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
   std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
 
+  // Check for NOOP conversions.
+  if (SrcLT.first == DstLT.first &&
+      SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+    // Bitcast between types that are legalized to the same type are free.
+    if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+      return 0;
+  }
+
+  if (Opcode == Instruction::Trunc &&
+      TLI->isTruncateFree(SrcLT.second, DstLT.second))
+    return 0;
+
+  if (Opcode == Instruction::ZExt &&
+      TLI->isZExtFree(SrcLT.second, DstLT.second))
+    return 0;
+
+  // If the cast is marked as legal (or promote) then assume low cost.
+  if (SrcLT.first == DstLT.first &&
+      TLI->isOperationLegalOrPromote(ISD, DstLT.second))
+    return 1;
+
   // Handle scalar conversions.
   if (!Src->isVectorTy() && !Dst->isVectorTy()) {
 
@@ -248,14 +370,6 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (Opcode == Instruction::BitCast)
       return 0;
 
-    if (Opcode == Instruction::Trunc &&
-        TLI->isTruncateFree(SrcLT.second, DstLT.second))
-      return 0;
-
-    if (Opcode == Instruction::ZExt &&
-        TLI->isZExtFree(SrcLT.second, DstLT.second))
-      return 0;
-
     // Just check the op cost. If the operation is legal then assume it costs 1.
     if (!TLI->isOperationExpand(ISD, DstLT.second))
       return 1;
@@ -271,10 +385,6 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (SrcLT.first == DstLT.first &&
       SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
 
-    // Bitcast between types that are legalized to the same type are free.
-    if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
-      return 0;
-
     // Assume that Zext is done using AND.
     if (Opcode == Instruction::ZExt)
       return 1;
@@ -319,6 +429,7 @@ unsigned BasicTTI::getCFInstrCost(unsigned Opcode) const {
 
 unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy) const {
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -356,38 +467,151 @@ unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 
 unsigned BasicTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                       unsigned Index) const {
-  return 1;
+  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Val->getScalarType());
+
+  return LT.first;
 }
 
 unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) const {
   assert(!Src->isVoidTy() && "Invalid type");
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
+
+  // Assuming that all loads of legal types cost 1.
+  unsigned Cost = LT.first;
+
+  if (Src->isVectorTy() &&
+      Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
+    // This is a vector load that legalizes to a larger type than the vector
+    // itself. Unless the corresponding extending load or truncating store is
+    // legal, this will scalarize.
+    TargetLowering::LegalizeAction LA = TargetLowering::Expand;
+    EVT MemVT = getTLI()->getValueType(Src, true);
+    if (MemVT.isSimple() && MemVT != MVT::Other) {
+      if (Opcode == Instruction::Store)
+        LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
+      else
+        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT());
+    }
 
-  // Assume that all loads of legal types cost 1.
-  return LT.first;
+    if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
+      // This is a vector load/store for some illegal type that is scalarized.
+      // We must account for the cost of building or decomposing the vector.
+      Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
+                                       Opcode == Instruction::Store);
+    }
+  }
+
+  return Cost;
 }
 
-unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
+unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                          ArrayRef<Type*> Tys) const {
-  // assume that we need to scalarize this intrinsic.
-  unsigned ScalarizationCost = 0;
-  unsigned ScalarCalls = 1;
-  if (RetTy->isVectorTy()) {
-    ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
-    ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-  }
-  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
-    if (Tys[i]->isVectorTy()) {
-      ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+  unsigned ISD = 0;
+  switch (IID) {
+  default: {
+    // Assume that we need to scalarize this intrinsic.
+    unsigned ScalarizationCost = 0;
+    unsigned ScalarCalls = 1;
+    if (RetTy->isVectorTy()) {
+      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
      ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
    }
+    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
+      if (Tys[i]->isVectorTy()) {
+        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+      }
+    }
+
+    return ScalarCalls + ScalarizationCost;
+  }
+  // Look for intrinsics that can be lowered directly or turned into a scalar
+  // intrinsic call.
+  case Intrinsic::sqrt:     ISD = ISD::FSQRT;  break;
+  case Intrinsic::sin:      ISD = ISD::FSIN;   break;
+  case Intrinsic::cos:      ISD = ISD::FCOS;   break;
+  case Intrinsic::exp:      ISD = ISD::FEXP;   break;
+  case Intrinsic::exp2:     ISD = ISD::FEXP2;  break;
+  case Intrinsic::log:      ISD = ISD::FLOG;   break;
+  case Intrinsic::log10:    ISD = ISD::FLOG10; break;
+  case Intrinsic::log2:     ISD = ISD::FLOG2;  break;
+  case Intrinsic::fabs:     ISD = ISD::FABS;   break;
+  case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
+  case Intrinsic::floor:    ISD = ISD::FFLOOR; break;
+  case Intrinsic::ceil:     ISD = ISD::FCEIL;  break;
+  case Intrinsic::trunc:    ISD = ISD::FTRUNC; break;
+  case Intrinsic::nearbyint:
+                            ISD = ISD::FNEARBYINT; break;
+  case Intrinsic::rint:     ISD = ISD::FRINT;  break;
+  case Intrinsic::round:    ISD = ISD::FROUND; break;
+  case Intrinsic::pow:      ISD = ISD::FPOW;   break;
+  case Intrinsic::fma:      ISD = ISD::FMA;    break;
+  case Intrinsic::fmuladd:  ISD = ISD::FMA;    break;
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+    return 0;
  }
-  return ScalarCalls + ScalarizationCost;
+
+  const TargetLoweringBase *TLI = getTLI();
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
+
+  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1.
+    // If the type is split to multiple registers, assume that there is some
+    // overhead to this.
+    // TODO: Once we have extract/insert subvector cost we need to use them.
+    if (LT.first > 1)
+      return LT.first * 2;
+    return LT.first * 1;
+  }
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // If the operation is custom lowered then assume
+    // that the code is twice as expensive.
+    return LT.first * 2;
+  }
+
+  // If we can't lower fmuladd into an FMA, estimate the cost as a floating
+  // point mul followed by an add.
+  if (IID == Intrinsic::fmuladd)
+    return TopTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
+           TopTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+
+  // Else, assume that we need to scalarize this intrinsic. For math builtins
+  // this will emit a costly libcall, adding call overhead and spills. Make it
+  // very expensive.
+  if (RetTy->isVectorTy()) {
+    unsigned Num = RetTy->getVectorNumElements();
+    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
+                                                  Tys);
+    return 10 * Cost * Num;
+  }
+
+  // This is going to be turned into a library call, make it expensive.
+  return 10;
 }
 
 unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
   return LT.first;
 }
+
+unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  return 0;
+}
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwise) const {
+  assert(Ty->isVectorTy() && "Expect a vector type");
+  unsigned NumVecElts = Ty->getVectorNumElements();
+  unsigned NumReduxLevels = Log2_32(NumVecElts);
+  unsigned ArithCost = NumReduxLevels *
+    TopTTI->getArithmeticInstrCost(Opcode, Ty);
+  // Assume the pairwise shuffles add a cost.
+  unsigned ShuffleCost =
+      NumReduxLevels * (IsPairwise + 1) *
+      TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+  return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
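
Editor's note on the new getUnrollingPreferences hook: the sketch below is not part of the diff. It is a minimal standalone model of the same decision, and every type in it (SubtargetModel, InstModel, UnrollPrefs) is a hypothetical stand-in for the corresponding LLVM abstraction. The idea it illustrates: take the threshold from the -partial-unrolling-threshold flag when given, otherwise from the subtarget's loop micro-op buffer size, refuse to unroll any loop containing an instruction that really lowers to a call, and otherwise enable partial and runtime unrolling with that threshold.

// Standalone model of BasicTTI::getUnrollingPreferences; every type here is a
// hypothetical stand-in for the corresponding LLVM abstraction.
#include <cstdio>
#include <vector>

struct SubtargetModel {
  unsigned LoopMicroOpBufferSize; // 0 means no usable loop buffer
};

struct InstModel {
  bool IsCall;        // call or invoke instruction
  bool LoweredToCall; // false for intrinsics that expand inline
};

struct UnrollPrefs {
  bool Partial = false;
  bool Runtime = false;
  unsigned PartialThreshold = 0;
};

void getUnrollingPreferences(const std::vector<InstModel> &LoopInsts,
                             const SubtargetModel &ST, unsigned FlagThreshold,
                             UnrollPrefs &UP) {
  // Prefer the explicit flag; otherwise fall back to the loop buffer size.
  unsigned MaxOps = FlagThreshold ? FlagThreshold : ST.LoopMicroOpBufferSize;
  if (MaxOps == 0)
    return; // nothing to base a threshold on

  // Don't unroll loops that contain real calls.
  for (const InstModel &I : LoopInsts)
    if (I.IsCall && I.LoweredToCall)
      return;

  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = MaxOps;
}

int main() {
  SubtargetModel ST = {28};                 // e.g. a 28-uop loop buffer
  std::vector<InstModel> Loop = {{false, false}, {true, false}};
  UnrollPrefs UP;
  getUnrollingPreferences(Loop, ST, 0, UP); // no flag override given
  std::printf("partial=%d threshold=%u\n", UP.Partial, UP.PartialThreshold);
  return 0;
}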
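
Likewise, the reduction cost added at the end of the diff can be modeled outside of LLVM: a vector reduction is charged log2(NumVecElts) levels, each paying one arithmetic operation plus one shuffle (two shuffles per level for pairwise reductions), plus the cost of extracting the final scalar. The unit costs in this sketch are hypothetical placeholders for what TopTTI and getScalarizationOverhead would report on a real target.

// Standalone model of the formula in BasicTTI::getReductionCost; the unit
// costs passed in are hypothetical placeholders, not real target numbers.
#include <cassert>
#include <cstdio>

static unsigned log2u(unsigned N) {
  unsigned L = 0;
  while (N >>= 1)
    ++L;
  return L;
}

// NumReduxLevels levels of vector arithmetic, one shuffle per level (two for
// pairwise reductions), plus the cost of extracting the final scalar result.
static unsigned reductionCost(unsigned NumVecElts, bool IsPairwise,
                              unsigned ArithOpCost, unsigned ShuffleOpCost,
                              unsigned ExtractCost) {
  assert(NumVecElts > 1 && "Expect a vector type");
  unsigned NumReduxLevels = log2u(NumVecElts);
  unsigned ArithCost = NumReduxLevels * ArithOpCost;
  unsigned ShuffleCost = NumReduxLevels * (IsPairwise + 1) * ShuffleOpCost;
  return ShuffleCost + ArithCost + ExtractCost;
}

int main() {
  // An 8-wide reduction with unit costs: 3 levels, so 3 arithmetic ops plus
  // 3 shuffles (split form) or 6 shuffles (pairwise form), plus 1 extract.
  std::printf("split:    %u\n", reductionCost(8, false, 1, 1, 1)); // prints 7
  std::printf("pairwise: %u\n", reductionCost(8, true, 1, 1, 1));  // prints 10
  return 0;
}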