From e922c3ebfc8faab7150219629a051a3b3dc034a4 Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Tue, 25 Aug 2015 16:43:47 +0000 Subject: [PATCH] The patch replace the overflow check in loop vectorization with the minimum loop iterations check. The loop minimum iterations check below ensures the loop has enough trip count so the generated vector loop will likely be executed, and it covers the overflow check. Differential Revision: http://reviews.llvm.org/D12107. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245952 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 47 ++++++++++++---------- test/Transforms/LoopVectorize/induction.ll | 4 +- test/Transforms/LoopVectorize/miniters.ll | 45 +++++++++++++++++++++ 3 files changed, 72 insertions(+), 24 deletions(-) create mode 100644 test/Transforms/LoopVectorize/miniters.ll diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 76a49574956..33923624ac2 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2619,7 +2619,7 @@ void InnerLoopVectorizer::createEmptyLoop() { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- Back-edge taken count overflow check. + [ ] <-- loop iteration number check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks). @@ -2683,21 +2683,25 @@ void InnerLoopVectorizer::createEmptyLoop() { // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, DL, "induction"); - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - VectorPH->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - VectorPH->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", VectorPH->getTerminator()); + // The loop minimum iterations check below is to ensure the loop has enough + // trip count so the generated vector loop will likely be executed and the + // preparation and rounding-off costs will likely be worthy. + // + // The minimum iteration check also covers case where the backedge-taken + // count is uint##_max. Adding one to it will cause overflow and an + // incorrect loop trip count being generated in the vector body. In this + // case we also want to directly jump to the scalar remainder loop. + Value *ExitCountValue = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + VectorPH->getTerminator()); + if (ExitCountValue->getType()->isPointerTy()) + ExitCountValue = CastInst::CreatePointerCast(ExitCountValue, IdxTy, + "exitcount.ptrcnt.to.int", + VectorPH->getTerminator()); + + Instruction *CheckMinIters = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, ExitCountValue, + ConstantInt::get(ExitCountValue->getType(), VF * UF), + "min.iters.check", VectorPH->getTerminator()); // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable @@ -2749,15 +2753,14 @@ void InnerLoopVectorizer::createEmptyLoop() { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. + // Generate code to check that the loop's trip count is not less than the + // minimum loop iteration number threshold. BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "min.iters.checked"); if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); + ReplaceInstWithInst(VectorPH->getTerminator(), + BranchInst::Create(ScalarPH, NewVectorPH, CheckMinIters)); VectorPH = NewVectorPH; // This is the IR builder that we use to add all of the logic for bypassing diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 2fbb2de797a..48566ef92f7 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -113,8 +113,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. ; CHECK-LABEL: max_i32_backedgetaken -; CHECK: %backedge.overflow = icmp eq i32 -1, -1 -; CHECK: br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked +; CHECK: %min.iters.check = icmp ult i32 0, 2 +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ] diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll new file mode 100644 index 00000000000..81cb2d4ca5a --- /dev/null +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -0,0 +1,45 @@ +; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s +; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s -check-prefix=UNROLL + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = common global [1000 x i32] zeroinitializer, align 16 +@c = common global [1000 x i32] zeroinitializer, align 16 +@a = common global [1000 x i32] zeroinitializer, align 16 + +; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. +; CHECK-LABEL: foo( +; CHECK: %min.iters.check = icmp ult i64 %N, 4 +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL-LABEL: foo( +; UNROLL: %min.iters.check = icmp ult i64 %N, 8 +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked + +define void @foo(i64 %N) { +entry: + %cmp.8 = icmp sgt i64 %N, 0 + br i1 %cmp.8, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %i.09 + %tmp = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds [1000 x i32], [1000 x i32]* @c, i64 0, i64 %i.09 + %tmp1 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp1, %tmp + %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %i.09 + store i32 %add, i32* %arrayidx2, align 4 + %inc = add nuw nsw i64 %i.09, 1 + %exitcond = icmp eq i64 %inc, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} -- 2.34.1