From c0a11edba6ea46c782672ab3fb4e4ab3dc267a22 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Fri, 12 Jul 2013 19:16:02 +0000
Subject: [PATCH] TargetTransformInfo: address calculation parameter for
 gather/scatter

Address calculation for gather/scatter in vectorized code can incur a
significant cost, making vectorization unprofitable. Add the
infrastructure to model this cost. Tests and per-target cost models
will follow in later commits.

radar://14351991

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186187 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h |  6 ++-
 lib/Analysis/TargetTransformInfo.cpp        |  7 +--
 lib/CodeGen/BasicTargetTransformInfo.cpp    |  4 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp   |  4 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp  | 57 ++++++++++++++++++++-
 5 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index eb29e3483d8..b8a44b5b665 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -339,7 +339,11 @@ public:
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
   /// types and scalar types. Such targets should override this function.
-  virtual unsigned getAddressComputationCost(Type *Ty) const;
+  /// The 'IsComplex' parameter is a hint that the address computation is
+  /// likely to involve multiple instructions and, as such, is unlikely to be
+  /// merged into the addressing mode.
+  virtual unsigned getAddressComputationCost(Type *Ty,
+                                             bool IsComplex = false) const;
 
   /// @}
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 35ce794c7f1..4b2bf3c770b 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -206,8 +206,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return PrevTTI->getNumberOfParts(Tp);
 }
 
-unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
-  return PrevTTI->getAddressComputationCost(Tp);
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
+                                                        bool IsComplex) const {
+  return PrevTTI->getAddressComputationCost(Tp, IsComplex);
 }
 
 namespace {
@@ -559,7 +560,7 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return 0;
   }
 
-  unsigned getAddressComputationCost(Type *Tp) const {
+  unsigned getAddressComputationCost(Type *Tp, bool) const {
     return 0;
   }
 };
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 19ace642ef1..3755320ab15 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -108,7 +108,7 @@ public:
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
                                          ArrayRef<Type *> Tys) const;
   virtual unsigned getNumberOfParts(Type *Tp) const;
-  virtual unsigned getAddressComputationCost(Type *Ty) const;
+  virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
 
   /// @}
 };
@@ -489,6 +489,6 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
   return LT.first;
 }
 
-unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 0;
 }
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 53ece668c3f..79f56a4b68c 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -124,7 +124,7 @@ public:
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                               unsigned Index) const;
 
-  unsigned getAddressComputationCost(Type *Val) const;
+  unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Op1Info = OK_AnyValue,
@@ -425,7 +425,7 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   // In many cases the address computation is not merged into the instruction
   // addressing mode.
   return 1;
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 020eb615714..b35ed74497c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4403,6 +4403,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   return Cost;
 }
 
+/// \brief Check whether the address computation for a non-consecutive memory
+/// access looks like an unlikely candidate for being merged into the indexing
+/// mode.
+///
+/// We look for a GEP which has one index that is an induction variable and all
+/// other indices are loop invariant. If the stride of this access is also
+/// within a small bound we decide that this address computation can likely be
+/// merged into the addressing mode.
+/// In all other cases, we identify the address computation as complex.
+static bool isLikelyComplexAddressComputation(Value *Ptr,
+                                              LoopVectorizationLegality *Legal,
+                                              ScalarEvolution *SE,
+                                              const Loop *TheLoop) {
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return true;
+
+  // We are looking for a GEP with all loop-invariant indices except for one
+  // which should be an induction variable.
+  unsigned NumOperands = Gep->getNumOperands();
+  for (unsigned i = 1; i < NumOperands; ++i) {
+    Value *Opd = Gep->getOperand(i);
+    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+        !Legal->isInductionVariable(Opd))
+      return true;
+  }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
+  // can likely be merged into the address computation.
+  unsigned MaxMergeDistance = 64;
+
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+  if (!AddRec)
+    return true;
+
+  // Check that the step is a compile-time constant.
+  const SCEV *Step = AddRec->getStepRecurrence(*SE);
+  // Compute the constant stride and compare it against the merge distance.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C)
+    return true;
+
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return true;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  return StepVal > MaxMergeDistance;
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -4498,6 +4551,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
+      bool IsComplexComputation =
+        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -4513,7 +4568,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       }
 
       // The cost of the scalar loads/stores.
-      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
+      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                        Alignment, AS);
       return Cost;
-- 
2.34.1
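
For a concrete picture of what the heuristic distinguishes, consider the
hypothetical C++ fragment below (the function and its arguments are invented
for illustration; both accesses are non-consecutive, so the cost model above
scalarizes them and calls isLikelyComplexAddressComputation for each):

struct Pair { float x, y; };   // 8 bytes per element

void touch(Pair *P, const int *Idx, int n) {
  for (int i = 0; i < n; ++i) {
    // GEP indices: the induction variable plus a constant field offset. The
    // pointer's SCEV is an AddRec with a constant step of sizeof(Pair) = 8,
    // which is within MaxMergeDistance (64), so the address computation is
    // reported as NOT complex.
    P[i].x += 1.0f;

    // The index is loaded every iteration: it is neither loop invariant nor
    // an induction variable, so the heuristic returns true and the gather is
    // charged the higher address-computation cost.
    P[Idx[i]].y += 1.0f;
  }
}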
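
A target that wants to exploit the new hint would override the hook along
these lines. This is only a sketch: MyTargetTTI and the concrete cost values
are invented here, and this patch changes no in-tree target beyond threading
the parameter through.

// Hypothetical target override -- MyTargetTTI and the constants are
// illustrative, not part of this patch.
unsigned MyTargetTTI::getAddressComputationCost(Type *Ty,
                                                bool IsComplex) const {
  // A complex address computation will not fold into the load/store
  // addressing mode, so model it as a few extra instructions per access;
  // vector accesses pay more because each lane needs its own address once
  // the access is scalarized.
  if (IsComplex)
    return Ty->isVectorTy() ? 4 : 2;
  // Simple computations usually merge into the addressing mode.
  return 0;
}

For comparison, the ARM override in this patch keeps its flat cost of 1 and
ignores the hint for now.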