X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMTargetTransformInfo.cpp;h=4635e4308f7251c6c5b063c52e5c4c1b792a51b7;hb=12af22e8cc217827cf4f118b0f5e4ebbda9925ae;hp=61b39e9a5011c3714bc9f158ac3ece7620bac52a;hpb=c0d8dc0eb6e1df872affadba01f60e42275e2863;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 61b39e9a501..4635e4308f7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -14,17 +14,18 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "armtti" #include "ARM.h" #include "ARMTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "armtti" + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializeARMTTIPass(PassRegistry &); @@ -32,7 +33,7 @@ void initializeARMTTIPass(PassRegistry &); namespace { -class ARMTTI : public ImmutablePass, public TargetTransformInfo { +class ARMTTI final : public ImmutablePass, public TargetTransformInfo { const ARMBaseTargetMachine *TM; const ARMSubtarget *ST; const ARMTargetLowering *TLI; @@ -42,25 +43,21 @@ class ARMTTI : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } ARMTTI(const ARMBaseTargetMachine *TM) : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { + TLI(TM->getSubtargetImpl()->getTargetLowering()) { initializeARMTTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() { + void initializePass() override { pushTTIStack(this); } - virtual void finalizePass() { - popTTIStack(); - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -68,7 +65,7 @@ public: static char ID; /// Provide necessary pointer adjustments for the two base classes. - virtual void *getAdjustedAnalysisPointer(const void *ID) { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) return (TargetTransformInfo*)this; return this; @@ -76,8 +73,8 @@ public: /// \name Scalar TTI Implementations /// @{ - - virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const; + using TargetTransformInfo::getIntImmCost; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; /// @} @@ -85,7 +82,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) const { + unsigned getNumberOfRegisters(bool Vector) const override { if (Vector) { if (ST->hasNEON()) return 16; @@ -94,10 +91,10 @@ public: if (ST->isThumb1Only()) return 8; - return 16; + return 13; } - unsigned getRegisterBitWidth(bool Vector) const { + unsigned getRegisterBitWidth(bool Vector) const override { if (Vector) { if (ST->hasNEON()) return 128; @@ -107,7 +104,7 @@ public: return 32; } - unsigned getMaximumUnrollFactor() const { + unsigned getMaximumUnrollFactor() const override { // These are out of order CPUs: if (ST->isCortexA15() || ST->isSwift()) return 2; @@ -115,16 +112,28 @@ public: } unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, - int Index, Type *SubTp) const; + int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const; + Type *Src) const override; + + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const override; - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const; + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const override; - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const; + unsigned getAddressComputationCost(Type *Val, + bool IsComplex) const override; - unsigned getAddressComputationCost(Type *Val) const; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue, + OperandValueKind Op2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; + + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const override; /// @} }; @@ -155,30 +164,30 @@ unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { (ARM_AM::getSOImmVal(~ZImmVal) != -1)) return 1; return ST->hasV6T2Ops() ? 2 : 3; - } else if (ST->isThumb2()) { + } + if (ST->isThumb2()) { if ((SImmVal >= 0 && SImmVal < 65536) || (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) return 1; return ST->hasV6T2Ops() ? 2 : 3; - } else /*Thumb1*/ { - if (SImmVal >= 0 && SImmVal < 256) - return 1; - if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) - return 2; - // Load from constantpool. - return 3; } - return 2; + // Thumb1. + if (SImmVal >= 0 && SImmVal < 256) + return 1; + if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) + return 2; + // Load from constantpool. + return 3; } unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { + Type *Src) const { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // Single to/from double precision conversions. - static const CostTblEntry NEONFltDblTbl[] = { + static const CostTblEntry NEONFltDblTbl[] = { // Vector fptrunc/fpext conversions. { ISD::FP_ROUND, MVT::v2f64, 2 }, { ISD::FP_EXTEND, MVT::v2f32, 2 }, @@ -188,8 +197,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { std::pair LT = TLI->getTypeLegalizationCost(Src); - int Idx = CostTableLookup(NEONFltDblTbl, array_lengthof(NEONFltDblTbl), - ISD, LT.second); + int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); if (Idx != -1) return LT.first * NEONFltDblTbl[Idx].Cost; } @@ -203,7 +211,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. - static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { + static const TypeConversionCostTblEntry + NEONVectorConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, @@ -211,37 +220,83 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, - // Operations that we legalize using load/stores to the stack. - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*4 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 16*2 + 4*3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*4 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 8*2 + 2*3 }, - { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 }, - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 }, + // The number of vmovl instructions for the extension. + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + + // Operations that we legalize using splitting. + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, // Vector float <-> i32 conversions. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, // Vector double <-> i32 conversions. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 } + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, + { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, + { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } }; if (SrcTy.isVector() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, - array_lengthof(NEONVectorConversionTbl), - ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT()); if (Idx != -1) return NEONVectorConversionTbl[Idx].Cost; } // Scalar float to integer conversions. - static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { + static const TypeConversionCostTblEntry + NEONFloatConversionTbl[] = { { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, @@ -264,16 +319,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } }; if (SrcTy.isFloatingPoint() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, - array_lengthof(NEONFloatConversionTbl), - ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); + int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT()); if (Idx != -1) return NEONFloatConversionTbl[Idx].Cost; } // Scalar integer to float conversions. - static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry + NEONIntegerConversionTbl[] = { { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, @@ -297,16 +351,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, }; if (SrcTy.isInteger() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, - array_lengthof(NEONIntegerConversionTbl), - ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); + int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT()); if (Idx != -1) return NEONIntegerConversionTbl[Idx].Cost; } // Scalar integer conversion costs. - static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry + ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, @@ -318,11 +371,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, }; if (SrcTy.isInteger()) { - int Idx = - ConvertCostTableLookup(ARMIntegerConversionTbl, - array_lengthof(ARMIntegerConversionTbl), - ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); + int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT()); if (Idx != -1) return ARMIntegerConversionTbl[Idx].Cost; } @@ -350,12 +400,8 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, // On NEON a a vector select gets lowered to vbsl. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { // Lowering of some vector selects is currently far from perfect. - static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { - { ISD::SELECT, MVT::v4i1, MVT::v4i8, 2*4 + 2*1 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i8, 2*8 + 1 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i8, 2*16 + 1 }, - { ISD::SELECT, MVT::v4i1, MVT::v4i16, 2*4 + 1 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i16, 2*8 + 1 }, + static const TypeConversionCostTblEntry + NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 }, { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 }, { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 }, @@ -366,12 +412,13 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(CondTy); EVT SelValTy = TLI->getValueType(ValTy); - int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, - array_lengthof(NEONVectorSelectTbl), - ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorSelectTbl[Idx].Cost; + if (SelCondTy.isSimple() && SelValTy.isSimple()) { + int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT()); + if (Idx != -1) + return NEONVectorSelectTbl[Idx].Cost; + } std::pair LT = TLI->getTypeLegalizationCost(ValTy); return LT.first; @@ -380,7 +427,16 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned ARMTTI::getAddressComputationCost(Type *Ty) const { +unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + // In many cases the address computation is not merged into the instruction // addressing mode. return 1; @@ -388,30 +444,145 @@ unsigned ARMTTI::getAddressComputationCost(Type *Ty) const { unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only handle costs of reverse shuffles for now. - if (Kind != SK_Reverse) + // We only handle costs of reverse and alternate shuffles for now. + if (Kind != SK_Reverse && Kind != SK_Alternate) return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - static const CostTblEntry NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a double - // word (vrev) or two if we shuffle a quad word (vrev, vext). - { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, - - { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 } + if (Kind == SK_Reverse) { + static const CostTblEntry NEONShuffleTbl[] = { + // Reverse shuffle cost one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; + + std::pair LT = TLI->getTypeLegalizationCost(Tp); + + int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + + return LT.first * NEONShuffleTbl[Idx].Cost; + } + if (Kind == SK_Alternate) { + static const CostTblEntry NEONAltShuffleTbl[] = { + // Alt shuffle cost table for ARM. Cost is the number of instructions + // required to create the shuffled vector. + + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + + std::pair LT = TLI->getTypeLegalizationCost(Tp); + int Idx = + CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return LT.first * NEONAltShuffleTbl[Idx].Cost; + } + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); +} + +unsigned ARMTTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { + + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); + std::pair LT = TLI->getTypeLegalizationCost(Ty); + + const unsigned FunctionCallDivCost = 20; + const unsigned ReciprocalDivCost = 10; + static const CostTblEntry CostTbl[] = { + // Division. + // These costs are somewhat random. Choose a cost of 20 to indicate that + // vectorizing devision (added function call) is going to be very expensive. + // Double registers types. + { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, + { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, + { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, + // Quad register types. + { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, + { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, + { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, + // Multiplication. }; - std::pair LT = TLI->getTypeLegalizationCost(Tp); + int Idx = -1; - int Idx = CostTableLookup(NEONShuffleTbl, array_lengthof(NEONShuffleTbl), - ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + if (ST->hasNEON()) + Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second); + + if (Idx != -1) + return LT.first * CostTbl[Idx].Cost; - return LT.first * NEONShuffleTbl[Idx].Cost; + unsigned Cost = TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); + + // This is somewhat of a hack. The problem that we are facing is that SROA + // creates a sequence of shift, and, or instructions to construct values. + // These sequences are recognized by the ISel and have zero-cost. Not so for + // the vectorized code. Because we have support for v2i64 but not i64 those + // sequences look particularly beneficial to vectorize. + // To work around this we increase the cost of v2i64 operations to make them + // seem less beneficial. + if (LT.second == MVT::v2i64 && + Op2Info == TargetTransformInfo::OK_UniformConstantValue) + Cost += 4; + + return Cost; +} + +unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const { + std::pair LT = TLI->getTypeLegalizationCost(Src); + + if (Src->isVectorTy() && Alignment != 16 && + Src->getVectorElementType()->isDoubleTy()) { + // Unaligned loads/stores are extremely inefficient. + // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. + return LT.first * 4; + } + return LT.first; }