Improve vectorization diagnostic messages and extend vectorize(enable) pragma.

author Tyler Nowicki <tyler.nowicki@gmail.com>

Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)

committer Tyler Nowicki <tyler.nowicki@gmail.com>

Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)
author Tyler Nowicki <tyler.nowicki@gmail.com>
Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)
committer Tyler Nowicki <tyler.nowicki@gmail.com>
Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 6fc1aaab6e1a71134d48da8ba709e2a52e71c7e0..eaa3ecd48e47a2060c13af8ad50b86b4d871c602 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -214,6 +214,11 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
      cl::desc("The maximum interleave count to use when interleaving a scalar "
               "reduction in a nested loop."));
  
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum allowed number of runtime memory checks with a "
+             "vectorize(enable) pragma."));
+
  namespace {
  
  // Forward declarations.
@@ -929,6 +934,15 @@ public:
      return DiagnosticInfo::AlwaysPrint;
    }
  
+  bool allowReordering() const {
+    // When enabling loop hints are provided we allow the vectorizer to change
+    // the order of operations that is given by the scalar loop. This is not
+    // enabled by default because can be unsafe or inefficient. For example,
+    // reordering floating-point operations will change the way round-off
+    // error accumulates in the loop.
+    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+  }
+
  private:
    /// Find hints specified in the loop metadata and update local values.
    void getHintsFromMetadata() {
@@ -1427,29 +1441,25 @@ public:
    bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
      const char *Name = Hints.vectorizeAnalysisPassName();
      bool Failed = false;
-    if (UnsafeAlgebraInst &&
-        Hints.getForce() == LoopVectorizeHints::FK_Undefined &&
-        Hints.getWidth() == 0) {
+    if (UnsafeAlgebraInst && !Hints.allowReordering()) {
        emitOptimizationRemarkAnalysisFPCommute(
            F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
-          VectorizationReport() << "vectorization requires changes in the "
-                                   "order of operations, however IEEE 754 "
-                                   "floating-point operations are not "
-                                   "commutative");
+          VectorizationReport() << "cannot prove it is safe to reorder "
+                                   "floating-point operations");
        Failed = true;
      }
  
-    if (NumRuntimePointerChecks >
-        VectorizerParams::RuntimeMemoryCheckThreshold) {
+    // Test if runtime memcheck thresholds are exceeded.
+    bool PragmaThresholdReached =
+        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+    bool ThresholdReached =
+        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+    if ((ThresholdReached && !Hints.allowReordering()) ||
+        PragmaThresholdReached) {
        emitOptimizationRemarkAnalysisAliasing(
            F->getContext(), Name, *F, L->getStartLoc(),
            VectorizationReport()
-              << "cannot prove pointers refer to independent arrays in memory. "
-                 "The loop requires "
-              << NumRuntimePointerChecks
-              << " runtime independence checks to vectorize the loop, but that "
-                 "would exceed the limit of "
-              << VectorizerParams::RuntimeMemoryCheckThreshold << " checks");
+              << "cannot prove it is safe to reorder memory operations");
        DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
        Failed = true;
      }
diff --git a/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/test/Transforms/LoopVectorize/X86/no_fpmath.ll

index bc1173ae9f703ca97b9f4fd94936bdabd9392c69..b1e20e52478bc2d96e0fa7fc990cdd245fbf3226 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/no_fpmath.ll
+++ b/test/Transforms/LoopVectorize/X86/no_fpmath.ll
@@ -1,6 +1,6 @@
  ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
  
-; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: vectorization requires changes in the order of operations, however IEEE 754 floating-point operations are not commutative
+; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
  ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized:
  ; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
  
diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll

index ff8d15576c086024bc5c05e7ef40506d4de657e6..e583e6b713767d9ec54dc11c9d14d4d2f74112e1 100644 (file)
--- a/test/Transforms/LoopVectorize/runtime-limit.ll
+++ b/test/Transforms/LoopVectorize/runtime-limit.ll
@@ -1,17 +1,27 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
  
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  target triple = "x86_64-apple-macosx10.8.0"
  
  ; First loop produced diagnostic pass remark.
-;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
  ; Second loop produces diagnostic analysis remark.
-;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove pointers refer to independent arrays in memory. The loop requires 11 runtime independence checks to vectorize the loop, but that would exceed the limit of 8 checks
+;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
+
+; First loop produced diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
+; Second loop produces diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
  
  ; We are vectorizing with 6 runtime checks.
  ;CHECK-LABEL: func1x6(
-;CHECK: <4 x i32>
+;CHECK: <{{[0-9]}} x i32>
  ;CHECK: ret
+;OVERRIDE-LABEL: func1x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
  define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
  entry:
    br label %for.body
@@ -44,8 +54,12 @@ for.end:                                          ; preds = %for.body
  
  ; We are not vectorizing with 12 runtime checks.
  ;CHECK-LABEL: func2x6(
-;CHECK-NOT: <4 x i32>
+;CHECK-NOT: <{{[0-9]}} x i32>
  ;CHECK: ret
+; We vectorize with 12 checks if a vectorization hint is provided.
+;OVERRIDE-LABEL: func2x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
  define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
  entry:
    br label %for.body
author	Tyler Nowicki <tyler.nowicki@gmail.com>
	Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)
committer	Tyler Nowicki <tyler.nowicki@gmail.com>
	Thu, 27 Aug 2015 18:56:49 +0000 (18:56 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/no_fpmath.ll		patch \| blob \| history
test/Transforms/LoopVectorize/runtime-limit.ll		patch \| blob \| history