cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
namespace {
// Forward declarations.
return DiagnosticInfo::AlwaysPrint;
}
+ bool allowReordering() const {
+ // When enabling loop hints are provided we allow the vectorizer to change
+ // the order of operations that is given by the scalar loop. This is not
+ // enabled by default because can be unsafe or inefficient. For example,
+ // reordering floating-point operations will change the way round-off
+ // error accumulates in the loop.
+ return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+ }
+
private:
/// Find hints specified in the loop metadata and update local values.
void getHintsFromMetadata() {
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
const char *Name = Hints.vectorizeAnalysisPassName();
bool Failed = false;
- if (UnsafeAlgebraInst &&
- Hints.getForce() == LoopVectorizeHints::FK_Undefined &&
- Hints.getWidth() == 0) {
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
emitOptimizationRemarkAnalysisFPCommute(
F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
- VectorizationReport() << "vectorization requires changes in the "
- "order of operations, however IEEE 754 "
- "floating-point operations are not "
- "commutative");
+ VectorizationReport() << "cannot prove it is safe to reorder "
+ "floating-point operations");
Failed = true;
}
- if (NumRuntimePointerChecks >
- VectorizerParams::RuntimeMemoryCheckThreshold) {
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
emitOptimizationRemarkAnalysisAliasing(
F->getContext(), Name, *F, L->getStartLoc(),
VectorizationReport()
- << "cannot prove pointers refer to independent arrays in memory. "
- "The loop requires "
- << NumRuntimePointerChecks
- << " runtime independence checks to vectorize the loop, but that "
- "would exceed the limit of "
- << VectorizerParams::RuntimeMemoryCheckThreshold << " checks");
+ << "cannot prove it is safe to reorder memory operations");
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
; First loop produced diagnostic pass remark.
-;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
; Second loop produces diagnostic analysis remark.
-;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove pointers refer to independent arrays in memory. The loop requires 11 runtime independence checks to vectorize the loop, but that would exceed the limit of 8 checks
+;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
+
+; First loop produced diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
+; Second loop produces diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: {{[0-9]}}, interleaved count: 1)
; We are vectorizing with 6 runtime checks.
;CHECK-LABEL: func1x6(
-;CHECK: <4 x i32>
+;CHECK: <{{[0-9]}} x i32>
;CHECK: ret
+;OVERRIDE-LABEL: func1x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
entry:
br label %for.body
; We are not vectorizing with 12 runtime checks.
;CHECK-LABEL: func2x6(
-;CHECK-NOT: <4 x i32>
+;CHECK-NOT: <{{[0-9]}} x i32>
;CHECK: ret
+; We vectorize with 12 checks if a vectorization hint is provided.
+;OVERRIDE-LABEL: func2x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
entry:
br label %for.body