unsigned MicroOpBufferSize;
static const unsigned DefaultMicroOpBufferSize = 0;
+ // LoopMicroOpBufferSize is the number of micro-ops that the processor may
+ // buffer for optimized loop execution. More generally, this represents the
+ // optimal number of micro-ops in a loop body. A loop may be partially
+ // unrolled to bring the count of micro-ops in the loop body closer to this
+ // number.
+ unsigned LoopMicroOpBufferSize;
+ static const unsigned DefaultLoopMicroOpBufferSize = 0;
+
// LoadLatency is the expected latency of load instructions.
//
// If MinLatency >= 0, this may be overridden for individual load opcodes by
// MCSchedModel instead of using a generated itinerary.
MCSchedModel(): IssueWidth(DefaultIssueWidth),
MicroOpBufferSize(DefaultMicroOpBufferSize),
+ LoopMicroOpBufferSize(DefaultLoopMicroOpBufferSize),
LoadLatency(DefaultLoadLatency),
HighLatency(DefaultHighLatency),
MispredictPenalty(DefaultMispredictPenalty) {}
// Table-gen driven ctor.
- MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl,
+ MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl,
unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr,
const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
const InstrItinerary *ii):
- IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl),
+ IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs),
+ LoadLatency(ll), HighLatency(hl),
MispredictPenalty(mp), CompleteModel(cm), ProcID(pi),
ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr),
NumSchedClasses(nsc), InstrItineraries(ii) {}
int MinLatency = -1; // Determines which instructions are allowed in a group:
                     // (-1) in-order, (0) out-of-order, (1) in-order + variable latencies.
int MicroOpBufferSize = -1; // Max micro-ops that can be buffered.
+ int LoopMicroOpBufferSize = -1; // Max micro-ops that can be buffered for
+ // optimized loop dispatch/execution.
int LoadLatency = -1; // Cycles for loads to access the cache.
int HighLatency = -1; // Approximation of cycles for "high latency" ops.
int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
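To make the intended arithmetic concrete, here is a minimal sketch (not part of this patch; the helper name and its parameters are hypothetical) of how a partial unroll factor could be derived from LoopMicroOpBufferSize:

// Hypothetical illustration only: choose an unroll factor that brings the
// loop body's micro-op count close to the loop buffer size.
static unsigned sketchUnrollCount(unsigned LoopUOps, unsigned BufferSize) {
  if (LoopUOps == 0 || LoopUOps >= BufferSize)
    return 1; // The body already fills the buffer; do not partially unroll.
  return BufferSize / LoopUOps; // e.g. a 7-uop body, 28-uop buffer: 4x unroll.
}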
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <utility>
using namespace llvm;
+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
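Because the threshold now comes from the target's scheduling model, the x86-only flags removed further below are no longer needed; the hidden flag added above still permits a manual override, e.g. opt -S -loop-unroll -partial-unrolling-threshold=16 (an example invocation using only the flag defined above plus standard opt options).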
#define DEBUG_TYPE "basictti"
namespace {
return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}
-void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+void BasicTTI::getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const {
+ // This unrolling functionality is target independent, but to provide some
+ // motivation for its intended use, consider x86:
+
+ // According to the Intel 64 and IA-32 Architectures Optimization Reference
+ // Manual, Intel Core models and later have a loop stream detector
+ // (and associated uop queue) that can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have no more than 4 (8 for Nehalem and later) branches
+ // taken, and none of them may be calls.
+ // - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+ // According to the Software Optimization Guide for AMD Family 15h Processors,
+ // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+ // buffer which can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have fewer than 16 branches.
+ // - The loop must have fewer than 40 uops in all executed loop branches.
+
+ // The number of taken branches in a loop is hard to estimate here, and
+ // benchmarking has revealed that it is better not to be conservative when
+ // estimating the branch count. As a result, we'll ignore the branch limits
+ // until someone finds a case where it matters in practice.
+
+ unsigned MaxOps;
+ const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+ if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+ MaxOps = PartialUnrollingThreshold;
+ else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+ MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+ else
+ return;
+
+ // Scan the loop: don't unroll loops with calls.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+ if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+ ImmutableCallSite CS(J);
+ if (const Function *F = CS.getCalledFunction()) {
+ if (!TopTTI->isLoweredToCall(F))
+ continue;
+ }
+
+ return;
+ }
+ }
+
+ // Enable runtime and partial unrolling up to the specified size.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}
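For context, the generic loop unroller is the consumer of these preferences. The following is only a rough sketch of that interaction, under the assumption that the unroller clamps its unroll count so the unrolled body stays within PartialThreshold (the pass's actual logic is more involved; Count and LoopSize are illustrative stand-ins):

// Rough sketch: shrink the unroll count until the unrolled body fits the
// micro-op budget published via UnrollingPreferences.
static unsigned sketchClampCount(unsigned Count, unsigned LoopSize,
    const TargetTransformInfo::UnrollingPreferences &UP) {
  if (!UP.Partial)
    return 1;
  while (Count > 1 && Count * LoopSize > UP.PartialThreshold)
    --Count; // Each decrement removes one copy of the loop body.
  return Count;
}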
//===----------------------------------------------------------------------===//
//
let LoadLatency = 4;
let MispredictPenalty = 16;
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
// FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
// the scheduler to assign a default model to unrecognized opcodes.
let CompleteModel = 0;
let LoadLatency = 4;
let MispredictPenalty = 16;
+ // Based on the LSD (loop-stream detector) queue size.
+ let LoopMicroOpBufferSize = 28;
+
// FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
// the scheduler to assign a default model to unrecognized opcodes.
let CompleteModel = 0;
let LoadLatency = 3;  // Expected cycles, may be overridden by OperandCycles.
let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
let Itineraries = AtomItineraries;
}
let LoadLatency = 3;
let MispredictPenalty = 10;
+ // For small loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10;
+
// FIXME: SSE4 is unimplemented. This flag is set to allow
// the scheduler to assign a default model to unrecognized opcodes.
let CompleteModel = 0;
#include "X86.h"
#include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
void initializeX86TTIPass(PassRegistry &);
}
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
- cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
- cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-
namespace {
class X86TTI final : public ImmutablePass, public TargetTransformInfo {
/// \name Scalar TTI Implementations
/// @{
PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
- void getUnrollingPreferences(Loop *L,
- UnrollingPreferences &UP) const override;
/// @}
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
- if (!UsePartialUnrolling)
- return;
- // According to the Intel 64 and IA-32 Architectures Optimization Reference
- // Manual, Intel Core models and later have a loop stream detector
- // (and associated uop queue) that can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have no more than 4 (8 for Nehalem and later) branches
- // taken, and none of them may be calls.
- // - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
- // According to the Software Optimization Guide for AMD Family 15h Processors,
- // models 30h-4fh (Steamroller and later) have a loop predictor and loop
- // buffer which can benefit from partial unrolling.
- // The relevant requirements are:
- // - The loop must have fewer than 16 branches
- // - The loop must have less than 40 uops in all executed loop branches
-
- // The number of taken branches in a loop is hard to estimate here, and
- // benchmarking has revealed that it is better not to be conservative when
- // estimating the branch count. As a result, we'll ignore the branch limits
- // until someone finds a case where it matters in practice.
-
- unsigned MaxOps;
- if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
- MaxOps = PartialUnrollingThreshold;
- } else if (ST->isAtom()) {
- // On the Atom, the throughput for taken branches is 2 cycles. For small
- // simple loops, expand by a small factor to hide the backedge cost.
- MaxOps = 10;
- } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
- MaxOps = 40;
- } else if (ST->hasFMA4() /* Any other recent AMD */) {
- return;
- } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
- MaxOps = 28;
- } else if (ST->hasSSSE3() /* Intel Core */) {
- MaxOps = 18;
- } else {
- return;
- }
-
- // Scan the loop: don't unroll loops with calls.
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I) {
- BasicBlock *BB = *I;
-
- for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
- if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
- ImmutableCallSite CS(J);
- if (const Function *F = CS.getCalledFunction()) {
- if (!isLoweredToCall(F))
- continue;
- }
-
- return;
- }
- }
-
- // Enable runtime and partial unrolling up to the specified size.
- UP.Partial = UP.Runtime = true;
- UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-}
-
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
-; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
-; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -unroll-runtime=0 | FileCheck -check-prefix=CHECK-NOUNRL %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
; This file tests the llvm.vectorizer.pragma forcing vectorization even when
; optimization levels are too low, or when vectorization is disabled.
OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ',');
EmitProcessorProp(OS, PI->ModelDef, "MicroOpBufferSize", ',');
+ EmitProcessorProp(OS, PI->ModelDef, "LoopMicroOpBufferSize", ',');
EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');