Cost Model: Move the 'max unroll factor' variable to the TTI and add initial Cost Model support on ARM.

author Nadav Rotem <nrotem@apple.com>

Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)

committer Nadav Rotem <nrotem@apple.com>

Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)
author Nadav Rotem <nrotem@apple.com>
Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)
committer Nadav Rotem <nrotem@apple.com>
Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index ddf615fe3bac47479bcb59a31bf52a32b866a510..1679c4f2b326c76b0b4c632078aeda1e6d0d1847 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -148,6 +148,11 @@ public:
    /// set to false, it returns the number of scalar registers.
    virtual unsigned getNumberOfRegisters(bool Vector) const;
  
+  /// \return The maximum unroll factor that the vectorizer should try to
+  /// perform for this target. This number depends on the level of parallelism
+  /// and the number of execution units in the CPU.
+  virtual unsigned getMaximumUnrollFactor() const;
+
    /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
  
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 63f495a430526107fa4a849d1753332f218a0953..02af2d34c518050134b8537f9565058d541b4716 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -92,6 +92,10 @@ unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
    return PrevTTI->getNumberOfRegisters(Vector);
  }
  
+unsigned TargetTransformInfo::getMaximumUnrollFactor() const {
+  return PrevTTI->getMaximumUnrollFactor();
+}
+
  unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
                                                       Type *Ty) const {
    return PrevTTI->getArithmeticInstrCost(Opcode, Ty);
@@ -216,6 +220,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
      return 8;
    }
  
+  unsigned getMaximumUnrollFactor() const {
+    return 1;
+  }
+
    unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
      return 1;
    }
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp

index c27e081a5e643d0490d5d5fa3eb76f572f06c5fb..2f3ac9a9017c812b31c424f9c6e2d6ab3dc753ec 100644 (file)
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -83,6 +83,7 @@ public:
    /// @{
  
    virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
    virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                                    int Index, Type *SubTp) const;
@@ -182,6 +183,10 @@ unsigned BasicTTI::getNumberOfRegisters(bool Vector) const {
    return 1;
  }
  
+unsigned BasicTTI::getMaximumUnrollFactor() const {
+  return 1;
+}
+
  unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
    // Check if any of the operands are vector operands.
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp

index 03a23be0a617b96132ca5fcb9551c857e9defdb0..634004acb45ed807e8c33e66d520387bd98e586a 100644 (file)
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -77,6 +77,31 @@ public:
    virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
  
    /// @}
+
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 16;
+      return 0;
+    }
+
+    if (ST->isThumb1Only())
+      return 8;
+    return 16;
+  }
+
+  unsigned getMaximumUnrollFactor() const {
+    // These are out of order CPUs:
+    if (ST->isCortexA15() || ST->isSwift())
+      return 2;
+    return 1;
+  }
+
+  /// @}
  };
  
  } // end anonymous namespace
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp

index 9cc1b180e978b6d1eb2adf5b32e196a1ee6ad752..6ab08cbd12997aafac7f323a1aa2d8b7da29e4b3 100644 (file)
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -75,7 +75,6 @@ public:
  
    /// \name Scalar TTI Implementations
    /// @{
-
    virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
  
    /// @}
@@ -84,6 +83,7 @@ public:
    /// @{
  
    virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
    virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                                    int Index, Type *SubTp) const;
@@ -156,7 +156,6 @@ FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
    return -1;
  }
  
-
  X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    // TODO: Currently the __builtin_popcount() implementation using SSE3
@@ -171,6 +170,18 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
    return 8;
  }
  
+unsigned X86TTI::getMaximumUnrollFactor() const {
+  if (ST->isAtom())
+    return 1;
+
+  // Sandybridge and Haswell have multiple execution ports and pipelined
+  // vector units.
+  if (ST->hasAVX())
+    return 4;
+
+  return 2;
+}
+
  unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 9c82cb8dca146932a34303c92c1841836c6a5c18..c29f416be7eb01ddf13cc6f62ec699defadf11ec 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -116,9 +116,6 @@ static const unsigned RuntimeMemoryCheckThreshold = 4;
  /// This is the highest vector width that we try to generate.
  static const unsigned MaxVectorSize = 8;
  
-/// This is the highest Unroll Factor.
-static const unsigned MaxUnrollSize = 4;
-
  namespace {
  
  // Forward declarations.
@@ -2715,6 +2712,8 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
    UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions));
  
    // Clamp the unroll factor ranges to reasonable factors.
+  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+  
    if (UF > MaxUnrollSize)
      UF = MaxUnrollSize;
    else if (UF < 1)
diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg

new file mode 100644 (file)

index 0000000..cb77b09
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopVectorize/ARM/sanity.ll b/test/Transforms/LoopVectorize/ARM/sanity.ll

new file mode 100644 (file)

index 0000000..11c28a8
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/sanity.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+; Make sure that we are not crashing on ARM.
+
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i32 %i.02
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll

index 0f21ba678c302b4c90f16c6265a46c9d0b9a835b..57d63015814f69b412f8bf9ad4100fa822c5ec9c 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -53,8 +53,6 @@ define void @example1() nounwind uwtable ssp {
  ;UNROLL: @example10b
  ;UNROLL: load <4 x i16>
  ;UNROLL: load <4 x i16>
-;UNROLL: load <4 x i16>
-;UNROLL: store <4 x i32>
  ;UNROLL: store <4 x i32>
  ;UNROLL: store <4 x i32>
  ;UNROLL: ret void
author	Nadav Rotem <nrotem@apple.com>
	Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)
committer	Nadav Rotem <nrotem@apple.com>
	Wed, 9 Jan 2013 01:15:42 +0000 (01:15 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch | blob | history
lib/Analysis/TargetTransformInfo.cpp		patch | blob | history
lib/CodeGen/BasicTargetTransformInfo.cpp		patch | blob | history
lib/Target/ARM/ARMTargetTransformInfo.cpp		patch | blob | history
lib/Target/X86/X86TargetTransformInfo.cpp		patch | blob | history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch | blob | history
test/Transforms/LoopVectorize/ARM/lit.local.cfg	[new file with mode: 0644]	patch | blob
test/Transforms/LoopVectorize/ARM/sanity.ll	[new file with mode: 0644]	patch | blob
test/Transforms/LoopVectorize/X86/gcc-examples.ll		patch | blob | history