From cac51be31f152b736beda01b557546e02a92ee6c Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Wed, 6 May 2015 17:12:25 +0000 Subject: [PATCH] [X86] Disable loop unrolling in loop vectorization pass when VF is 1. This patch disables unrolling in the loop vectorization pass when VF==1 on the x86 architecture by setting MaxInterleaveFactor to 1. Unrolling in the loop vectorization pass may introduce the cost of overflow checks, memory boundary checks and extra prologue/epilogue code when the regular unroller unrolls the loop another time. Disabling it when VF==1 removes this unnecessary cost on x86. The same can be done for other platforms after verifying that interleaving/memory bound checking is not performance-critical on those platforms. Differential Revision: http://reviews.llvm.org/D9515 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236613 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 8 ++-- .../llvm/Analysis/TargetTransformInfoImpl.h | 2 +- include/llvm/CodeGen/BasicTTIImpl.h | 2 +- lib/Analysis/TargetTransformInfo.cpp | 4 +- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- .../AArch64/AArch64TargetTransformInfo.h | 2 +- lib/Target/ARM/ARMTargetTransformInfo.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 2 +- lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 2 +- lib/Target/R600/AMDGPUTargetTransformInfo.h | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 8 +++- lib/Target/X86/X86TargetTransformInfo.h | 2 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- .../LoopVectorize/X86/unroll-small-loops.ll | 4 +- test/Transforms/LoopVectorize/unroll.ll | 37 +++++++++++++++++++ 16 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 test/Transforms/LoopVectorize/unroll.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index f4195fbb072..86bf1549dc7 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ 
b/include/llvm/Analysis/TargetTransformInfo.h @@ -403,7 +403,7 @@ public: /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. - unsigned getMaxInterleaveFactor() const; + unsigned getMaxInterleaveFactor(unsigned VF) const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. unsigned @@ -562,7 +562,7 @@ public: const APInt &Imm, Type *Ty) = 0; virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) = 0; - virtual unsigned getMaxInterleaveFactor() = 0; + virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, @@ -703,8 +703,8 @@ public: unsigned getRegisterBitWidth(bool Vector) override { return Impl.getRegisterBitWidth(Vector); } - unsigned getMaxInterleaveFactor() override { - return Impl.getMaxInterleaveFactor(); + unsigned getMaxInterleaveFactor(unsigned VF) override { + return Impl.getMaxInterleaveFactor(VF); } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index b00de7723ef..c6f4f0b3458 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -263,7 +263,7 @@ public: unsigned getRegisterBitWidth(bool Vector) { return 32; } - unsigned getMaxInterleaveFactor() { return 1; } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index c5efef32d07..d0726556043 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ 
b/include/llvm/CodeGen/BasicTTIImpl.h @@ -285,7 +285,7 @@ public: unsigned getRegisterBitWidth(bool Vector) { return 32; } - unsigned getMaxInterleaveFactor() { return 1; } + unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index a1519de25ee..e1744d1f296 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -186,8 +186,8 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { return TTIImpl->getRegisterBitWidth(Vector); } -unsigned TargetTransformInfo::getMaxInterleaveFactor() const { - return TTIImpl->getMaxInterleaveFactor(); +unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { + return TTIImpl->getMaxInterleaveFactor(VF); } unsigned TargetTransformInfo::getArithmeticInstrCost( diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0533355b01d..ed27cf84bbb 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -419,7 +419,7 @@ unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { return Cost; } -unsigned AArch64TTIImpl::getMaxInterleaveFactor() { +unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { if (ST->isCortexA57()) return 4; return 2; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index dd3fd1f5ab7..25c22bcd58e 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -110,7 +110,7 @@ public: return 64; } - unsigned getMaxInterleaveFactor(); + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h 
b/lib/Target/ARM/ARMTargetTransformInfo.h index 97590f60893..9479d7693eb 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -96,7 +96,7 @@ public: return 32; } - unsigned getMaxInterleaveFactor() { + unsigned getMaxInterleaveFactor(unsigned VF) { // These are out of order CPUs: if (ST->isCortexA15() || ST->isSwift()) return 2; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index b46acd47f31..08328d9acac 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,7 +215,7 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { } -unsigned PPCTTIImpl::getMaxInterleaveFactor() { +unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { unsigned Directive = ST->getDarwinDirective(); // The 440 has no SIMD support, but floating-point instructions // have a 5-cycle latency, so unroll by 5x for latency hiding. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 21acea1a36d..35e7a1497c8 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -81,7 +81,7 @@ public: bool enableAggressiveInterleaving(bool LoopHasReductions); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(); + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 96edc417822..6dacc742b12 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -76,7 +76,7 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } -unsigned 
AMDGPUTTIImpl::getMaxInterleaveFactor() { +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h index 4abbdf20e76..791c84e6f28 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.h +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.h @@ -70,7 +70,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(); + unsigned getMaxInterleaveFactor(unsigned VF); }; } // end namespace llvm diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 5136619235b..17c86a7b9f0 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -66,7 +66,13 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { } -unsigned X86TTIImpl::getMaxInterleaveFactor() { +unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll to unroll the loop, which saves the overflow + // check and memory check cost. 
+ if (VF == 1) + return 1; + if (ST->isAtom()) return 1; diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 9f0adcfef62..e570bb55710 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -72,7 +72,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(); + unsigned getMaxInterleaveFactor(unsigned VF); unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index cdd3c680e8d..011fd0f6fa8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4160,7 +4160,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); + unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the unroll max. if (VF == 1) { diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll index 4411da3f0a9..69d2a319a8c 100644 --- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll +++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -47,9 +47,11 @@ define i32 @foo(i32* nocapture %A) nounwind uwtable ssp { ; CHECK-VECTOR: store <4 x i32> ; CHECK-VECTOR: ret ; +; For x86, loop unroll in loop vectorizer is disabled when VF==1. 
+; ; CHECK-SCALAR-LABEL: @bar( ; CHECK-SCALAR: store i32 -; CHECK-SCALAR: store i32 +; CHECK-SCALAR-NOT: store i32 ; CHECK-SCALAR: ret define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/unroll.ll b/test/Transforms/LoopVectorize/unroll.ll new file mode 100644 index 00000000000..74076f69510 --- /dev/null +++ b/test/Transforms/LoopVectorize/unroll.ll @@ -0,0 +1,37 @@ +; This test makes sure that loop will not be unrolled in vectorization if VF computed +; equals to 1. +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +; Make sure there are no geps being merged. +; CHECK-LABEL: @foo( +; CHECK: getelementptr +; CHECK-NOT: getelementptr + +@N = common global i32 0, align 4 +@a = common global [1000 x i32] zeroinitializer, align 16 + +define void @foo() #0 { +entry: + %0 = load i32, i32* @N, align 4 + %cmp5 = icmp sgt i32 %0, 0 + br i1 %cmp5, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %conv = sext i32 %0 to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.06 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %mul = mul nuw nsw i64 %i.06, 7 + %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %mul + store i32 3, i32* %arrayidx, align 4 + %inc = add nuw nsw i64 %i.06, 1 + %cmp = icmp slt i64 %inc, %conv + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} -- 2.34.1