Move late partial-unrolling thresholds into the processor definitions

author Hal Finkel <hfinkel@anl.gov>

Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)

committer Hal Finkel <hfinkel@anl.gov>

Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)
author Hal Finkel <hfinkel@anl.gov>
Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h

index f2e67393ca70d6b9b9403d1f4cbc43162fa6672e..862a0fd7addcc42f90d4372a6c2798d2ba9c273a 100644 (file)
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -159,6 +159,14 @@ public:
    unsigned MicroOpBufferSize;
    static const unsigned DefaultMicroOpBufferSize = 0;
  
+  // LoopMicroOpBufferSize is the number of micro-ops that the processor may
+  // buffer for optimized loop execution. More generally, this represents the
+  // optimal number of micro-ops in a loop body. A loop may be partially
+  // unrolled to bring the count of micro-ops in the loop body closer to this
+  // number.
+  unsigned LoopMicroOpBufferSize;
+  static const unsigned DefaultLoopMicroOpBufferSize = 0;
+
    // LoadLatency is the expected latency of load instructions.
    //
    // If MinLatency >= 0, this may be overriden for individual load opcodes by
@@ -198,6 +206,7 @@ public:
    // MCSchedModel instead of using a generated itinerary.
    MCSchedModel(): IssueWidth(DefaultIssueWidth),
                    MicroOpBufferSize(DefaultMicroOpBufferSize),
+                  LoopMicroOpBufferSize(DefaultLoopMicroOpBufferSize),
                    LoadLatency(DefaultLoadLatency),
                    HighLatency(DefaultHighLatency),
                    MispredictPenalty(DefaultMispredictPenalty),
@@ -209,11 +218,12 @@ public:
    }
  
    // Table-gen driven ctor.
-  MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl,
+  MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl,
                 unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr,
                 const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
                 const InstrItinerary *ii):
-    IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl),
+    IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs),
+    LoadLatency(ll), HighLatency(hl),
      MispredictPenalty(mp), CompleteModel(cm), ProcID(pi),
      ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr),
      NumSchedClasses(nsc), InstrItineraries(ii) {}
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td

index b4d0c44448ecf587cf1bd9f8178757188d9461b3..e6eeb885c0b1440611dbb1ef0c1d810baab78d48 100644 (file)
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -79,6 +79,8 @@ class SchedMachineModel {
    int MinLatency = -1; // Determines which instructions are allowed in a group.
                         // (-1) inorder (0) ooo, (1): inorder +var latencies.
    int MicroOpBufferSize = -1; // Max micro-ops that can be buffered.
+  int LoopMicroOpBufferSize = -1; // Max micro-ops that can be buffered for
+                                  // optimized loop dispatch/execution.
    int LoadLatency = -1; // Cycles for loads to access the cache.
    int HighLatency = -1; // Approximation of cycles for "high latency" ops.
    int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp

index 4b895092d3b63e80d9a5b5c98685614537a19d78..763a4c0b3cfc487f0a0beda0030a033139e62b3b 100644 (file)
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -16,11 +16,18 @@
  //===----------------------------------------------------------------------===//
  
  #include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
  #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
  #include <utility>
  using namespace llvm;
  
+static cl::opt<unsigned>
+PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for partial unrolling"), cl::Hidden);
+
  #define DEBUG_TYPE "basictti"
  
  namespace {
@@ -187,7 +194,61 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const {
    return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }
  
-void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+void BasicTTI::getUnrollingPreferences(Loop *L,
+                                       UnrollingPreferences &UP) const {
+  // This unrolling functionality is target independent, but to provide some
+  // motivation for its intended use, for x86:
+
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  // The number of taken branches in a loop is hard to estimate here, and
+  // benchmarking has revealed that it is better not to be conservative when
+  // estimating the branch count. As a result, we'll ignore the branch limits
+  // until someone finds a case where it matters in practice.
+
+  unsigned MaxOps;
+  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
+    MaxOps = PartialUnrollingThreshold;
+  else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
+    MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+  else
+    return;
+
+  // Scan the loop: don't unroll loops with calls.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
+      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
+        ImmutableCallSite CS(J);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!TopTTI->isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+}
  
  //===----------------------------------------------------------------------===//
  //
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td

index f5b51eec05de7e04f46d94b646ec14549820cddd..6966d616f8e322dbd3e7d636ba5066cda07f9b53 100644 (file)
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel {
    let LoadLatency = 4;
    let MispredictPenalty = 16;
  
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
    // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
    // the scheduler to assign a default model to unrecognized opcodes.
    let CompleteModel = 0;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td

index a58859aa15f7d54a407acf2b58da567d16ee3b87..83f053425aa139f6fc24f3290b5cb1d01b065644 100644 (file)
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel {
    let LoadLatency = 4;
    let MispredictPenalty = 16;
  
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
    // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
    // the scheduler to assign a default model to unrecognized opcodes.
    let CompleteModel = 0;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td

index ba72f29910fefe211940cf562f42392517c6969e..3256ee7c6e492b31a64baffa7f8db49a27230b32 100644 (file)
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel {
    let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
    let HighLatency = 30;// Expected, may be overriden by OperandCycles.
  
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
    let Itineraries = AtomItineraries;
  }
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td

index 509f892deb078ff2fc7546a657111b33987fd667..823d10140e3c7ed6934cf93a9eceed8552cc09e6 100644 (file)
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -20,6 +20,9 @@ def SLMModel : SchedMachineModel {
    let LoadLatency = 3;
    let MispredictPenalty = 10;
  
+  // For small loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
    // FIXME: SSE4 is unimplemented. This flag is set to allow
    // the scheduler to assign a default model to unrecognized opcodes.
    let CompleteModel = 0;
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index cad8dfd5221151534152a6d6d31c7d53df40b51f..101574c84c3923ad08897e7444f3b318401b149b 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -16,11 +16,8 @@
  
  #include "X86.h"
  #include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Target/CostTable.h"
  #include "llvm/Target/TargetLowering.h"
@@ -35,13 +32,6 @@ namespace llvm {
  void initializeX86TTIPass(PassRegistry &);
  }
  
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
-  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-
  namespace {
  
  class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -84,8 +74,6 @@ public:
    /// \name Scalar TTI Implementations
    /// @{
    PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-  void getUnrollingPreferences(Loop *L,
-                               UnrollingPreferences &UP) const override;
  
    /// @}
  
@@ -150,70 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
    return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
  }
  
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
-  if (!UsePartialUnrolling)
-    return;
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h Processors,
-  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
-  // buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  // The number of taken branches in a loop is hard to estimate here, and
-  // benchmarking has revealed that it is better not to be conservative when
-  // estimating the branch count. As a result, we'll ignore the branch limits
-  // until someone finds a case where it matters in practice.
-
-  unsigned MaxOps;
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
-    MaxOps = PartialUnrollingThreshold;
-  } else if (ST->isAtom()) {
-    // On the Atom, the throughput for taken branches is 2 cycles. For small
-    // simple loops, expand by a small factor to hide the backedge cost.
-    MaxOps = 10;
-  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
-    MaxOps = 40;
-  } else if (ST->hasFMA4() /* Any other recent AMD */) {
-    return;
-  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
-    MaxOps = 28;
-  } else if (ST->hasSSSE3() /* Intel Core */) {
-    MaxOps = 18;
-  } else {
-    return;
-  }
-
-  // Scan the loop: don't unroll loops with calls.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-       I != E; ++I) {
-    BasicBlock *BB = *I;
-
-    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
-      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
-        ImmutableCallSite CS(J);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-}
-
  unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
    if (Vector && !ST->hasSSE1())
      return 0;
diff --git a/test/Transforms/LoopUnroll/X86/partial.ll b/test/Transforms/LoopUnroll/X86/partial.ll

index 75b9c3fb89a69a1b7c3c4441765c8e43891d9485..a2b04c7d85f81cab1188832f170fc5c078abb2ff 100644 (file)
--- a/test/Transforms/LoopUnroll/X86/partial.ll
+++ b/test/Transforms/LoopUnroll/X86/partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
-; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -unroll-runtime=0 | FileCheck -check-prefix=CHECK-NOUNRL %s
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll

index 224823b8ed5dbc1cff250c7b4a50dd0c0a8ce329..9e4e98948c985fb67e868e66afd51369b134d718 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
  
  ; This file tests the llvm.vectorizer.pragma forcing vectorization even when
  ; optimization levels are too low, or when vectorization is disabled.
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp

index 16bbdb7cd7e65e9b0e039d4d9dc37ff1b1f63ef6..06f869436f12669e535063aa7b5c99cb7ce32695 100644 (file)
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1195,6 +1195,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
      OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
      EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ',');
      EmitProcessorProp(OS, PI->ModelDef, "MicroOpBufferSize", ',');
+    EmitProcessorProp(OS, PI->ModelDef, "LoopMicroOpBufferSize", ',');
      EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
      EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
      EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
author	Hal Finkel <hfinkel@anl.gov>
	Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Thu, 8 May 2014 09:14:44 +0000 (09:14 +0000)
include/llvm/MC/MCSchedule.h		patch \| blob \| history
include/llvm/Target/TargetSchedule.td		patch \| blob \| history
lib/CodeGen/BasicTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/X86/X86SchedHaswell.td		patch \| blob \| history
lib/Target/X86/X86SchedSandyBridge.td		patch \| blob \| history
lib/Target/X86/X86ScheduleAtom.td		patch \| blob \| history
lib/Target/X86/X86ScheduleSLM.td		patch \| blob \| history
lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
test/Transforms/LoopUnroll/X86/partial.ll		patch \| blob \| history
test/Transforms/LoopVectorize/X86/metadata-enable.ll		patch \| blob \| history
utils/TableGen/SubtargetEmitter.cpp		patch \| blob \| history