From 9f141640b58bef4eb157e92315c4720f6c513862 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sun, 26 Jul 2015 17:28:13 +0000 Subject: [PATCH] [TTI/CostModel] improve TTI::getGEPCost and use it in CostModel::getInstructionCost Summary: This patch updates TargetTransformInfoImplCRTPBase::getGEPCost to consider addressing modes. It now returns TCC_Free when the GEP can be completely folded to an addresing mode. I started this patch as I refactored SLSR. Function isGEPFoldable looks common and is indeed used by some WIP of mine. So I extracted that logic to getGEPCost. Furthermore, I noticed getGEPCost wasn't directly tested anywhere. The best testing bed seems CostModel, but its getInstructionCost method invokes getAddressComputationCost for GEPs which provides very coarse estimation. So this patch also makes getInstructionCost call the updated getGEPCost for GEPs. This change inevitably breaks some tests because the cost model changes, but nothing looks seriously wrong -- if we believe the new cost model is the right way to go, these tests should be updated. This patch is not perfect yet -- the comments in some tests need to be updated. I want to know whether this is a right approach before fixing those details. Reviewers: chandlerc, hfinkel Subscribers: aschwaighofer, llvm-commits, aemerson Differential Revision: http://reviews.llvm.org/D9819 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243250 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 9 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 58 +++++++++++- include/llvm/IR/GetElementPtrTypeIterator.h | 2 +- lib/Analysis/CostModel.cpp | 6 +- .../Scalar/StraightLineStrengthReduce.cpp | 1 + test/Analysis/CostModel/ARM/gep.ll | 88 ++++++++++++++----- test/Analysis/CostModel/no_info.ll | 22 ++++- test/Transforms/LoopUnroll/X86/partial.ll | 9 +- .../LoopVectorize/X86/metadata-enable.ll | 6 +- 9 files changed, 158 insertions(+), 43 deletions(-) diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 01f00896410..eb6fa9da369 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -136,7 +136,8 @@ public: /// The contract for this function is the same as \c getOperationCost except /// that it supports an interface that provides extra information specific to /// the GEP operation. - unsigned getGEPCost(const Value *Ptr, ArrayRef Operands) const; + unsigned getGEPCost(Type *PointeeType, const Value *Ptr, + ArrayRef Operands) const; /// \brief Estimate the cost of a function call when lowered. /// @@ -543,7 +544,7 @@ public: virtual ~Concept() = 0; virtual const DataLayout &getDataLayout() const = 0; virtual unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0; - virtual unsigned getGEPCost(const Value *Ptr, + virtual unsigned getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) = 0; virtual unsigned getCallCost(FunctionType *FTy, int NumArgs) = 0; virtual unsigned getCallCost(const Function *F, int NumArgs) = 0; @@ -643,9 +644,9 @@ public: unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) override { return Impl.getOperationCost(Opcode, Ty, OpTy); } - unsigned getGEPCost(const Value *Ptr, + unsigned getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) override { - return Impl.getGEPCost(Ptr, Operands); + return Impl.getGEPCost(PointeeType, Ptr, Operands); } unsigned getCallCost(FunctionType *FTy, int NumArgs) override { return Impl.getCallCost(FTy, NumArgs); diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 035cb04870a..505126acc0a 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" @@ -92,7 +93,8 @@ public: } } - unsigned getGEPCost(const Value *Ptr, ArrayRef Operands) { + unsigned getGEPCost(Type *PointeeType, const Value *Ptr, + ArrayRef Operands) { // In the basic model, we just assume that all-constant GEPs will be folded // into their uses via addressing modes. for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx) @@ -378,6 +380,54 @@ public: return static_cast(this)->getCallCost(F, Arguments.size()); } + using BaseT::getGEPCost; + + unsigned getGEPCost(Type *PointeeType, const Value *Ptr, + ArrayRef Operands) { + const GlobalValue *BaseGV = nullptr; + if (Ptr != nullptr) { + // TODO: will remove this when pointers have an opaque type. + assert(Ptr->getType()->getScalarType()->getPointerElementType() == + PointeeType && + "explicit pointee type doesn't match operand's pointee type"); + BaseGV = dyn_cast(Ptr->stripPointerCasts()); + } + bool HasBaseReg = (BaseGV == nullptr); + int64_t BaseOffset = 0; + int64_t Scale = 0; + + // Assumes the address space is 0 when Ptr is nullptr. + unsigned AS = + (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace()); + auto GTI = gep_type_begin(PointerType::get(PointeeType, AS), Operands); + for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) { + if (isa(*GTI)) { + int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); + if (const ConstantInt *ConstIdx = dyn_cast(*I)) { + BaseOffset += ConstIdx->getSExtValue() * ElementSize; + } else { + // Needs scale register. + if (Scale != 0) { + // No addressing mode takes two scale registers. + return TTI::TCC_Basic; + } + Scale = ElementSize; + } + } else { + StructType *STy = cast(*GTI); + uint64_t Field = cast(*I)->getZExtValue(); + BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); + } + } + + if (static_cast(this)->isLegalAddressingMode( + PointerType::get(*GTI, AS), const_cast(BaseGV), + BaseOffset, HasBaseReg, Scale, AS)) { + return TTI::TCC_Free; + } + return TTI::TCC_Basic; + } + using BaseT::getIntrinsicCost; unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, @@ -397,9 +447,9 @@ public: return TTI::TCC_Free; // Model all PHI nodes as free. if (const GEPOperator *GEP = dyn_cast(U)) { - SmallVector Indices(GEP->idx_begin(), GEP->idx_end()); - return static_cast(this) - ->getGEPCost(GEP->getPointerOperand(), Indices); + SmallVector Indices(GEP->idx_begin(), GEP->idx_end()); + return static_cast(this)->getGEPCost( + GEP->getSourceElementType(), GEP->getPointerOperand(), Indices); } if (auto CS = ImmutableCallSite(U)) { diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h index 6bba0ae29e9..7cb13fa33aa 100644 --- a/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/include/llvm/IR/GetElementPtrTypeIterator.h @@ -78,7 +78,7 @@ namespace llvm { // current type directly. Type *operator->() const { return operator*(); } - Value *getOperand() const { return *OpIt; } + Value *getOperand() const { return const_cast(&**OpIt); } generic_gep_type_iterator& operator++() { // Preincrement if (CurTy.getInt()) { diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index b529c1a70aa..da790d74524 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -383,10 +383,8 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { return -1; switch (I->getOpcode()) { - case Instruction::GetElementPtr:{ - Type *ValTy = I->getOperand(0)->getType()->getPointerElementType(); - return TTI->getAddressComputationCost(ValTy); - } + case Instruction::GetElementPtr: + return TTI->getUserCost(I); case Instruction::Ret: case Instruction::PHI: diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417ef94..b2b886c4f11 100644 --- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } +// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, const TargetTransformInfo *TTI, const DataLayout *DL) { diff --git a/test/Analysis/CostModel/ARM/gep.ll b/test/Analysis/CostModel/ARM/gep.ll index 624ca113a30..a70d6d42b61 100644 --- a/test/Analysis/CostModel/ARM/gep.ll +++ b/test/Analysis/CostModel/ARM/gep.ll @@ -3,41 +3,85 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios6.0.0" -define void @test_geps() { - ; Cost of scalar integer geps should be one. We can't always expect it to be - ; folded into the instruction addressing mode. -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* +define void @test_geps(i32 %i) { + ; GEPs with index 0 are essentially NOOPs. +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a0 = getelementptr inbounds i8, i8* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a1 = getelementptr inbounds i16, i16* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a2 = getelementptr inbounds i32, i32* undef, i32 0 - -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* %a3 = getelementptr inbounds i64, i64* undef, i32 0 - - ; Cost of scalar floating point geps should be one. We cannot fold the address - ; computation. -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float, float* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float, float* %a4 = getelementptr inbounds float, float* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double, double* %a5 = getelementptr inbounds double, double* undef, i32 0 - - - ; Cost of vector geps should be one. We cannot fold the address computation. -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* %a7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* %a8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* %a9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* %a10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* %a11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 0 -;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* %a12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 0 + ; Cost of GEPs is one if we cannot fold the address computation. +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %b0 = getelementptr inbounds i8, i8* undef, i32 1024 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %b1 = getelementptr inbounds i16, i16* undef, i32 1024 + ; Thumb-2 cannot fold offset >= 2^12 into address computation. +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %b2 = getelementptr inbounds i32, i32* undef, i32 1024 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %b3 = getelementptr inbounds i64, i64* undef, i32 1024 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float, float* + %b4 = getelementptr inbounds float, float* undef, i32 1024 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double* + %b5 = getelementptr inbounds double, double* undef, i32 1024 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* + %b7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 1 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* + %b8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 1 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* + %b9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 1 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* + %b10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 1 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* + %b11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 1 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* + %b12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 1 + +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %c0 = getelementptr inbounds i8, i8* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %c1 = getelementptr inbounds i16, i16* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* + %c2 = getelementptr inbounds i32, i32* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* + %c3 = getelementptr inbounds i64, i64* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float, float* + %c4 = getelementptr inbounds float, float* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double, double* + %c5 = getelementptr inbounds double, double* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* + %c7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 %i +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* + %c8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 %i + ; Thumb-2 cannot fold scales larger than 8 to address computation. +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* + %c9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 %i +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* + %c10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 %i +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* + %c11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 %i +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* + %c12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 %i ret void } diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll index 5f3b56ad9cf..931669b6017 100644 --- a/test/Analysis/CostModel/no_info.ll +++ b/test/Analysis/CostModel/no_info.ll @@ -5,9 +5,27 @@ ; -- No triple in this module -- -;CHECK: cost of 1 {{.*}} add -;CHECK: cost of 1 {{.*}} ret +; CHECK-LABEL: function 'no_info' +; CHECK: cost of 1 {{.*}} add +; CHECK: cost of 1 {{.*}} ret define i32 @no_info(i32 %arg) { %e = add i32 %arg, %arg ret i32 %e } + +define i8 @addressing_mode_reg_reg(i8* %a, i32 %b) { +; CHECK-LABEL: function 'addressing_mode_reg_reg' + %p = getelementptr i8, i8* %a, i32 %b ; NoTTI accepts reg+reg addressing. +; CHECK: cost of 0 {{.*}} getelementptr + %v = load i8, i8* %p + ret i8 %v +} + +; CHECK-LABEL: function 'addressing_mode_scaled_reg' +define i32 @addressing_mode_scaled_reg(i32* %a, i32 %b) { + ; NoTTI rejects reg+scale*reg addressing. + %p = getelementptr i32, i32* %a, i32 %b +; CHECK: cost of 1 {{.*}} getelementptr + %v = load i32, i32* %p + ret i32 %v +} diff --git a/test/Transforms/LoopUnroll/X86/partial.ll b/test/Transforms/LoopUnroll/X86/partial.ll index 4566f792deb..104a38779e5 100644 --- a/test/Transforms/LoopUnroll/X86/partial.ll +++ b/test/Transforms/LoopUnroll/X86/partial.ll @@ -86,17 +86,20 @@ for.body: ; preds = %entry, %for.body %reduction.026 = phi i16 [ %add14, %for.body ], [ 0, %entry ] %arrayidx = getelementptr inbounds i16, i16* %arr, i64 %indvars.iv %0 = load i16, i16* %arrayidx, align 2 - %add = add i16 %0, %reduction.026 + %mul = shl i16 %0, 1 + %add = add i16 %mul, %reduction.026 %sext = mul i64 %indvars.iv, 12884901888 %idxprom3 = ashr exact i64 %sext, 32 %arrayidx4 = getelementptr inbounds i16, i16* %arr, i64 %idxprom3 %1 = load i16, i16* %arrayidx4, align 2 - %add7 = add i16 %add, %1 + %mul2 = shl i16 %1, 1 + %add7 = add i16 %add, %mul2 %sext28 = mul i64 %indvars.iv, 21474836480 %idxprom10 = ashr exact i64 %sext28, 32 %arrayidx11 = getelementptr inbounds i16, i16* %arr, i64 %idxprom10 %2 = load i16, i16* %arrayidx11, align 2 - %add14 = add i16 %add7, %2 + %mul3 = shl i16 %2, 1 + %add14 = add i16 %add7, %mul3 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %n diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll index ba8e11e5874..74c0c16086f 100644 --- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -60,7 +60,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 for.end: ; preds = %for.body @@ -111,7 +111,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body @@ -162,7 +162,7 @@ for.body: ; preds = %for.body, %entry %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv store i32 %add, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 32 + %exitcond = icmp eq i64 %indvars.iv.next, 64 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 for.end: ; preds = %for.body -- 2.34.1