From f35ce2376ca361e059ad9390ce1f8467f8756bcd Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Thu, 8 May 2014 09:14:44 +0000
Subject: [PATCH] Move late partial-unrolling thresholds into the processor
 definitions

The old method used by X86TTI to determine partial-unrolling thresholds was
messy (because it worked by testing target features), and also would not
correctly identify the target CPU if certain target features were disabled.
After some discussions on IRC with Chandler et al., it was decided that the
processor scheduling models were the right containers for this information
(because it is often tied to special uop dispatch-buffer sizes).

This does represent a small functionality change:
 - For generic x86-64 (which uses the SB model and, thus, will get some
   unrolling).
 - For AMD cores (because they still currently use the SB scheduling model).
 - For Haswell (based on benchmarking by Louis Gerbarg, it was decided to
   bump the default threshold to 50; we're working on a test case for this).
Otherwise, nothing has changed for any other targets.

The logic, however, has been moved into BasicTTI, so other targets may now
also opt in to this functionality simply by setting LoopMicroOpBufferSize in
their processor model definitions.
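To make the opt-in concrete: a positive LoopMicroOpBufferSize in a
SchedMachineModel flows through SubtargetEmitter into the TableGen-generated
MCSchedModel constant, matching the new constructor below. A sketch of the
shape of that generated C++ follows; the model name and every numeric value
are invented for illustration and are not real output of this patch:

  #include "llvm/MC/MCSchedule.h"

  // Hypothetical TableGen output for a model that opts in; the third
  // argument is the new LoopMicroOpBufferSize field (0 means "no opt-in").
  static const llvm::MCSchedModel ExampleModel(
      4,        // IssueWidth
      64,       // MicroOpBufferSize
      28,       // LoopMicroOpBufferSize (new in this patch)
      4,        // LoadLatency
      10,       // HighLatency
      16,       // MispredictPenalty
      false,    // CompleteModel
      1,        // ProcID
      nullptr, nullptr, 0, 0, // proc-resource/sched-class tables (elided)
      nullptr); // InstrItineraries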

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208289 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCSchedule.h              | 14 +++-
 include/llvm/Target/TargetSchedule.td     |  2 +
 lib/CodeGen/BasicTargetTransformInfo.cpp  | 63 ++++++++++++++-
 lib/Target/X86/X86SchedHaswell.td         |  3 +
 lib/Target/X86/X86SchedSandyBridge.td     |  3 +
 lib/Target/X86/X86ScheduleAtom.td         |  4 +
 lib/Target/X86/X86ScheduleSLM.td          |  3 +
 lib/Target/X86/X86TargetTransformInfo.cpp | 76 ------------------
 test/Transforms/LoopUnroll/X86/partial.ll |  4 +-
 .../LoopVectorize/X86/metadata-enable.ll  | 20 ++---
 utils/TableGen/SubtargetEmitter.cpp       |  1 +
 11 files changed, 102 insertions(+), 91 deletions(-)

diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index f2e67393ca7..862a0fd7add 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -159,6 +159,14 @@ public:
   unsigned MicroOpBufferSize;
   static const unsigned DefaultMicroOpBufferSize = 0;
 
+  // LoopMicroOpBufferSize is the number of micro-ops that the processor may
+  // buffer for optimized loop execution. More generally, this represents the
+  // optimal number of micro-ops in a loop body. A loop may be partially
+  // unrolled to bring the count of micro-ops in the loop body closer to this
+  // number.
+  unsigned LoopMicroOpBufferSize;
+  static const unsigned DefaultLoopMicroOpBufferSize = 0;
+
   // LoadLatency is the expected latency of load instructions.
   //
   // If MinLatency >= 0, this may be overriden for individual load opcodes by
@@ -198,6 +206,7 @@ public:
   // MCSchedModel instead of using a generated itinerary.
   MCSchedModel(): IssueWidth(DefaultIssueWidth),
                   MicroOpBufferSize(DefaultMicroOpBufferSize),
+                  LoopMicroOpBufferSize(DefaultLoopMicroOpBufferSize),
                   LoadLatency(DefaultLoadLatency),
                   HighLatency(DefaultHighLatency),
                   MispredictPenalty(DefaultMispredictPenalty),
@@ -209,11 +218,12 @@ public:
   }
 
   // Table-gen driven ctor.
-  MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl,
+  MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl,
                unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr,
                const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
                const InstrItinerary *ii):
-    IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl),
+    IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs),
+    LoadLatency(ll), HighLatency(hl),
     MispredictPenalty(mp), CompleteModel(cm), ProcID(pi),
     ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr),
     NumSchedClasses(nsc), InstrItineraries(ii) {}
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index b4d0c44448e..e6eeb885c0b 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -79,6 +79,8 @@ class SchedMachineModel {
   int MinLatency = -1; // Determines which instructions are allowed in a group.
                        // (-1) inorder (0) ooo, (1): inorder +var latencies.
   int MicroOpBufferSize = -1; // Max micro-ops that can be buffered.
+  int LoopMicroOpBufferSize = -1; // Max micro-ops that can be buffered for
+                                  // optimized loop dispatch/execution.
   int LoadLatency = -1; // Cycles for loads to access the cache.
   int HighLatency = -1; // Approximation of cycles for "high latency" ops.
   int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 4b895092d3b..763a4c0b3cf 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -16,11 +16,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <utility>
 
 using namespace llvm;
 
+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
 #define DEBUG_TYPE "basictti"
 
 namespace {
@@ -187,7 +194,61 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const {
   return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
 }
 
-void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+void BasicTTI::getUnrollingPreferences(Loop *L,
+                                       UnrollingPreferences &UP) const {
+  // This unrolling functionality is target independent, but to provide some
+  // motivation for its intended use, for x86:
+
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector (and
+  // associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h
+  // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
+  // and loop buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  // The number of taken branches in a loop is hard to estimate here, and
+  // benchmarking has revealed that it is better not to be conservative when
+  // estimating the branch count. As a result, we'll ignore the branch limits
+  // until someone finds a case where it matters in practice.
+
+  unsigned MaxOps;
+  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+    MaxOps = PartialUnrollingThreshold;
+  else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+    MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+  else
+    return;
+
+  // Scan the loop: don't unroll loops with calls.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+        ImmutableCallSite CS(J);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!TopTTI->isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}
 
 //===----------------------------------------------------------------------===//
 //
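How the threshold set above turns into an actual unroll factor is up to the
generic loop unroller, not this patch. As a rough sketch of that arithmetic
(my own simplification for illustration; the real logic in LoopUnrollPass is
more involved, and "approxUnrollCount" is a hypothetical helper name):

  // Approximate the partial unroll factor implied by a threshold of MaxOps
  // micro-ops, using the unroller's instruction-count estimate of loop size.
  unsigned approxUnrollCount(unsigned LoopSize, unsigned PartialThreshold,
                             unsigned TripCount) {
    if (LoopSize == 0 || LoopSize > PartialThreshold)
      return 1;                    // body already at or over the budget
    unsigned Count = PartialThreshold / LoopSize;
    while (Count > 1 && TripCount % Count != 0)
      --Count;                     // prefer factors that divide the trip count
    return Count;
  }

Under this approximation, SandyBridge's LoopMicroOpBufferSize of 28 would
unroll a 7-instruction loop body with 100 iterations by a factor of 4
(28 / 7 = 4, and 4 divides 100).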
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index f5b51eec05d..6966d616f8e 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index a58859aa15f..83f053425aa 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index ba72f29910f..3256ee7c6e4 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel {
   let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
   let HighLatency = 30;// Expected, may be overriden by OperandCycles.
 
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
   let Itineraries = AtomItineraries;
 }
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 509f892deb0..823d10140e3 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -20,6 +20,9 @@ def SLMModel : SchedMachineModel {
   let LoadLatency = 3;
   let MispredictPenalty = 10;
 
+  // For small loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
   // FIXME: SSE4 is unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
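To make the "hide the backedge cost" comments above concrete: treating each
uop as one cycle for simplicity, the fraction of time a loop spends on the
Atom's 2-cycle taken backedge can be estimated as follows (a
back-of-the-envelope model supplied for illustration, not code from the
patch):

  // Estimate the share of loop time lost to the taken backedge when the
  // body costs BodyCycles and the loop is unrolled by Factor.
  double backedgeOverhead(double BodyCycles, unsigned Factor) {
    const double TakenBranchCycles = 2.0; // Atom taken-branch throughput
    return TakenBranchCycles / (Factor * BodyCycles + TakenBranchCycles);
  }

For example, a 2-cycle body goes from 50% backedge overhead at Factor = 1 to
about 17% at Factor = 5, the largest factor the 10-uop budget allows for a
2-uop body.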
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index cad8dfd5221..101574c84c3 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -16,11 +16,8 @@
 #include "X86.h"
 #include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
 
@@ -35,13 +32,6 @@ namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
-  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -84,8 +74,6 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-  void getUnrollingPreferences(Loop *L,
-                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -150,70 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
-  if (!UsePartialUnrolling)
-    return;
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h
-  // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
-  // and loop buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  // The number of taken branches in a loop is hard to estimate here, and
-  // benchmarking has revealed that it is better not to be conservative when
-  // estimating the branch count. As a result, we'll ignore the branch limits
-  // until someone finds a case where it matters in practice.
-
-  unsigned MaxOps;
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
-    MaxOps = PartialUnrollingThreshold;
-  } else if (ST->isAtom()) {
-    // On the Atom, the throughput for taken branches is 2 cycles. For small
-    // simple loops, expand by a small factor to hide the backedge cost.
-    MaxOps = 10;
-  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
-    MaxOps = 40;
-  } else if (ST->hasFMA4() /* Any other recent AMD */) {
-    return;
-  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
-    MaxOps = 28;
-  } else if (ST->hasSSSE3() /* Intel Core */) {
-    MaxOps = 18;
-  } else {
-    return;
-  }
-
-  // Scan the loop: don't unroll loops with calls.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-       I != E; ++I) {
-    BasicBlock *BB = *I;
-
-    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
-      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
-        ImmutableCallSite CS(J);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-}
-
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1()) return 0;
 
diff --git a/test/Transforms/LoopUnroll/X86/partial.ll b/test/Transforms/LoopUnroll/X86/partial.ll
index 75b9c3fb89a..a2b04c7d85f 100644
--- a/test/Transforms/LoopUnroll/X86/partial.ll
+++ b/test/Transforms/LoopUnroll/X86/partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
-; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -unroll-runtime=0 | FileCheck -check-prefix=CHECK-NOUNRL %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 224823b8ed5..9e4e98948c9 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
 
 ; This file tests the llvm.vectorizer.pragma forcing vectorization even when
 ; optimization levels are too low, or when vectorization is disabled.
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 16bbdb7cd7e..06f869436f1 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1195,6 +1195,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
   OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
   EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ',');
   EmitProcessorProp(OS, PI->ModelDef, "MicroOpBufferSize", ',');
+  EmitProcessorProp(OS, PI->ModelDef, "LoopMicroOpBufferSize", ',');
   EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
   EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
   EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
-- 
2.34.1
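A closing note on defaults: a model that never sets LoopMicroOpBufferSize
keeps DefaultLoopMicroOpBufferSize (zero), so BasicTTI leaves the unrolling
preferences untouched unless the new -partial-unrolling-threshold flag
overrides it (the getNumOccurrences() check above takes precedence). A
minimal sketch of that opt-in test, mirroring the check added to
BasicTargetTransformInfo.cpp; the helper name is hypothetical:

  #include "llvm/MC/MCSchedule.h"

  // A default-constructed MCSchedModel has LoopMicroOpBufferSize == 0, so
  // partial unrolling stays off unless a processor definition opts in.
  static bool enablesPartialUnrolling(const llvm::MCSchedModel &SM) {
    return SM.LoopMicroOpBufferSize > 0;
  }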