Do not restrict interleaved unrolling to small loops, depending on the target.

author Olivier Sallenave <ohsallen@us.ibm.com>

Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)

committer Olivier Sallenave <ohsallen@us.ibm.com>

Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)
author Olivier Sallenave <ohsallen@us.ibm.com>
Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)
committer Olivier Sallenave <ohsallen@us.ibm.com>
Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index 11a5b6af0973aa0e0e0fe19445b8706a65bc9996..aeab0e18982e646408d2ab7abcce460ad0fe9b31 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -331,6 +331,9 @@ public:
    /// target.
    bool shouldBuildLookupTables() const;
  
+  /// \brief Return true if interleaved unrolling should extend to large loops.
+  bool enableAggressiveInterleaving(bool LoopHasReductions) const;
+
    /// \brief Return hardware support for population count.
    PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
  
@@ -531,6 +534,7 @@ public:
    virtual unsigned getJumpBufAlignment() = 0;
    virtual unsigned getJumpBufSize() = 0;
    virtual bool shouldBuildLookupTables() = 0;
+  virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
    virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
    virtual bool haveFastSqrt(Type *Ty) = 0;
    virtual unsigned getFPOpCost(Type *Ty) = 0;
@@ -648,6 +652,9 @@ public:
    bool shouldBuildLookupTables() override {
      return Impl.shouldBuildLookupTables();
    }
+  bool enableAggressiveInterleaving(bool LoopHasReductions) override {
+    return Impl.enableAggressiveInterleaving(LoopHasReductions);
+  }
    PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
      return Impl.getPopcntSupport(IntTyWidthInBit);
    }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h

index 3e02c0ce3ca92a26e24ca2050626ad97ba8f6f84..9d2a7b5366d9025842db74eef96eaec385768f26 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -235,6 +235,8 @@ public:
  
    bool shouldBuildLookupTables() { return true; }
  
+  bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
+
    TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
      return TTI::PSK_Software;
    }
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 4d336363c5f503090e18a633f3ec99080aee9144..1b52d4a5502ed49c1a1edfef661cababcbfb1597 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -143,6 +143,10 @@ bool TargetTransformInfo::shouldBuildLookupTables() const {
    return TTIImpl->shouldBuildLookupTables();
  }
  
+bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
+  return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
+}
+
  TargetTransformInfo::PopcntSupportKind
  TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
    return TTIImpl->getPopcntSupport(IntTyWidthInBit);
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

index 073bbb0c5567801a98c2ef537feeeac056470932..b46acd47f313b1dd86a9c60a0987c20b5fbf17ac 100644 (file)
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -192,6 +192,10 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L,
    BaseT::getUnrollingPreferences(L, UP);
  }
  
+bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+  return LoopHasReductions;
+}
+
  unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
    if (Vector && !ST->hasAltivec() && !ST->hasQPX())
      return 0;
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h

index cef7079423472f1bbdb9ae3be9ca67c820f18be4..21acea1a36d8de3b77f0a470e96c4e3b42fa0f0c 100644 (file)
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -78,6 +78,7 @@ public:
    /// \name Vector TTI Implementations
    /// @{
  
+  bool enableAggressiveInterleaving(bool LoopHasReductions);
    unsigned getNumberOfRegisters(bool Vector);
    unsigned getRegisterBitWidth(bool Vector);
    unsigned getMaxInterleaveFactor();
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 686b8995a31218e70e7acbf3fea9997bf58e8d4a..ffa3fe13df57e0c8c5d6c62eafb9d466a1a85390 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4564,6 +4564,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
      return SmallUF;
    }
  
+  // Unroll if this is a large loop (small loops are already dealt with by this
+  // point) that could benefit from interleaved unrolling.
+  bool HasReductions = (Legal->getReductionVars()->size() > 0);
+  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+    DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
+    return UF;
+  }
+
    DEBUG(dbgs() << "LV: Not Unrolling.\n");
    return 1;
  }
diff --git a/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll

new file mode 100644 (file)

index 0000000..de6595f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT-NOT: fadd
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+  %cmp24 = icmp sgt i32 %n, 0
+  br i1 %cmp24, label %for.cond1.preheader.preheader, label %for.end13
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %sum.026 = phi double [ %add10.2, %for.cond1.preheader ], [ 0.000000e+00, %for.cond1.preheader.preheader ]
+  %arrayidx5.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+  %arrayidx5.real = load float, float* %arrayidx5.realp, align 8
+  %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+  %arrayidx5.imag = load float, float* %arrayidx5.imagp, align 8
+  %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+  %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+  %add = fadd fast float %mul9, %mul
+  %conv = fpext float %add to double
+  %add10 = fadd fast double %conv, %sum.026
+  %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+  %arrayidx5.real.1 = load float, float* %arrayidx5.realp.1, align 8
+  %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+  %arrayidx5.imag.1 = load float, float* %arrayidx5.imagp.1, align 8
+  %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+  %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+  %add.1 = fadd fast float %mul9.1, %mul.1
+  %conv.1 = fpext float %add.1 to double
+  %add10.1 = fadd fast double %conv.1, %add10
+  %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+  %arrayidx5.real.2 = load float, float* %arrayidx5.realp.2, align 8
+  %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+  %arrayidx5.imag.2 = load float, float* %arrayidx5.imagp.2, align 8
+  %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+  %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+  %add.2 = fadd fast float %mul9.2, %mul.2
+  %conv.2 = fpext float %add.2 to double
+  %add10.2 = fadd fast double %conv.2, %add10.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.end13_crit_edge, label %for.cond1.preheader
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.cond1.preheader
+  %add10.2.lcssa = phi double [ %add10.2, %for.cond1.preheader ]
+  %phitmp = fptrunc double %add10.2.lcssa to float
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry
+  %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+  store float %sum.0.lcssa, float* %r, align 4
+  ret void
+}
+
author	Olivier Sallenave <ohsallen@us.ibm.com>
	Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)
committer	Olivier Sallenave <ohsallen@us.ibm.com>
	Fri, 6 Mar 2015 23:12:04 +0000 (23:12 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCTargetTransformInfo.h		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll	[new file with mode: 0644]	patch \| blob