From fc561accb7b9695672a97b95838dc56041f4d05e Mon Sep 17 00:00:00 2001
From: Michael Zolotukhin <mzolotukhin@apple.com>
Date: Fri, 24 Jul 2015 01:53:04 +0000
Subject: [PATCH] Handle resolvable branches in complete loop unroll heuristic.

Summary:
Resolving a branch allows us to ignore blocks that won't be executed, and thus make our estimate more accurate.
This patch is intended to be applied after D10205 (though it could be applied independently).

Reviewers: chandlerc

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D10206

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243084 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopUnrollPass.cpp      |  62 +++++-
 .../LoopUnroll/full-unroll-heuristics-cmp.ll  | 207 ++++++++++++++++++
 2 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index ff1acc1f579..89b3d47c88f 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -368,8 +368,6 @@ private:
     return simplifyInstWithSCEV(&I);
   }
 
-  /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt.
-
   /// Try to simplify binary operator I.
   ///
   /// TODO: Probaly it's worth to hoist the code for estimating the
@@ -451,6 +449,44 @@ private:
 
     return Base::visitCastInst(I);
   }
+
+  bool visitCmpInst(CmpInst &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+    // First try to handle simplified comparisons.
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+
+    if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+      auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
+      if (SimplifiedLHS != SimplifiedAddresses.end()) {
+        auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
+        if (SimplifiedRHS != SimplifiedAddresses.end()) {
+          SimplifiedAddress &LHSAddr = SimplifiedLHS->second;
+          SimplifiedAddress &RHSAddr = SimplifiedRHS->second;
+          if (LHSAddr.Base == RHSAddr.Base) {
+            LHS = LHSAddr.Offset;
+            RHS = RHSAddr.Offset;
+          }
+        }
+      }
+    }
+
+    if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+      if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+        if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+          SimplifiedValues[&I] = C;
+          return true;
+        }
+      }
+    }
+
+    return Base::visitCmpInst(I);
+  }
 };
 } // namespace
 
@@ -542,6 +578,28 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
           return None;
       }
 
+      TerminatorInst *TI = BB->getTerminator();
+
+      // Add in the live successors by first checking whether we have terminator
+      // that may be simplified based on the values simplified by this call.
+      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+        if (BI->isConditional()) {
+          if (Constant *SimpleCond =
+                  SimplifiedValues.lookup(BI->getCondition())) {
+            BBWorklist.insert(BI->getSuccessor(
+                cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0));
+            continue;
+          }
+        }
+      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+        if (Constant *SimpleCond =
+                SimplifiedValues.lookup(SI->getCondition())) {
+          BBWorklist.insert(
+              SI->getSuccessor(cast<ConstantInt>(SimpleCond)->getSExtValue()));
+          continue;
+        }
+      }
+
       // Add BB's successors to the worklist.
       for (BasicBlock *Succ : successors(BB))
         if (L->contains(Succ))
diff --git a/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll b/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll
new file mode 100644
index 00000000000..f7758fa2200
--- /dev/null
+++ b/test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll
@@ -0,0 +1,207 @@
+; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
+
+; We should be able to propagate constant data through comparisons.
+; For example, in this test we have a load, which becomes constant after
+; unrolling, making comparison with 0 also known to be 0 (false) - and that
+; will trigger further simplifications.
+;
+; We expect this loop to be unrolled, because in this case load would become
+; constant, which is always 1, and which, in its turn, helps to simplify
+; following comparison, zero-extension, and addition. In total, unrolling should help to
+; optimize more than 50% of all instructions in this case.
+;
+; CHECK-LABEL: @const_compare
+; CHECK-NOT: br i1 %
+; CHECK: ret i32
+define i32 @const_compare(i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ]
+  %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ]
+  %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0
+  %x1 = load i32, i32* %arrayidx1, align 4
+  %cmp = icmp eq i32 %x1, 0
+  %cast = zext i1 %cmp to i32
+  %iv.1 = add nuw nsw i64 %iv.0, 1
+  %r.1 = add i32 %r.0, %cast
+  %exitcond = icmp eq i64 %iv.1, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret i32 %r.1
+}
+
+; If we can figure out result of comparison on each iteration, we can resolve
+; the depending branch. That means, that the unrolled version of the loop would
+; have less code, because we don't need not-taken basic blocks there.
+; This test checks that this is taken into consideration.
+; We expect this loop to be unrolled, because the most complicated part of its
+; body (if.then block) is never actually executed.
+; CHECK-LABEL: @branch_folded
+; CHECK-NOT: br i1 %
+; CHECK: ret i32
+define i32 @branch_folded(i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.inc ]
+  %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.inc ]
+  %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0
+  %x1 = load i32, i32* %arrayidx1, align 4
+  %cmp = icmp eq i32 %x1, 0
+  %iv.1 = add nuw nsw i64 %iv.0, 1
+  br i1 %cmp, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0
+  %x2 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %x2, %r.0
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %r.1 = phi i32 [ %add, %if.then ], [ %x1, %for.body ]
+  %exitcond = icmp eq i64 %iv.1, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret i32 %r.1
+}
+
+; This test is similar to the previous one, but in this we use IV in comparison
+; (not a loaded value as we did there).
+; CHECK-LABEL: @branch_iv
+; CHECK-NOT: br i1 %
+; CHECK: ret i64
+define i64 @branch_iv(i64* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %tmp3, %for.inc ]
+  %r.030 = phi i64 [ 0, %entry ], [ %r.1, %for.inc ]
+  %cmp3 = icmp eq i64 %indvars.iv, 5
+  %tmp3 = add nuw nsw i64 %indvars.iv, 1
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 %tmp3
+  %tmp1 = load i64, i64* %arrayidx2, align 4
+  %add = add nsw i64 %tmp1, %r.030
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %r.1 = phi i64 [ %add, %if.then ], [ %r.030, %for.body ]
+  %exitcond = icmp eq i64 %tmp3, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret i64 %r.1
+}
+
+; Induction variables are often casted to another type, and that shouldn't
+; prevent us from folding branches. Tthis test specifically checks if we can
+; handle this. Other than thatm it's similar to the previous test.
+; CHECK-LABEL: @branch_iv_trunc
+; CHECK-NOT:   br i1 %
+; CHECK:   ret i32
+define i32 @branch_iv_trunc(i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %tmp3, %for.inc ]
+  %r.030 = phi i32 [ 0, %entry ], [ %r.1, %for.inc ]
+  %tmp2 = trunc i64 %indvars.iv to i32
+  %cmp3 = icmp eq i32 %tmp2, 5
+  %tmp3 = add nuw nsw i64 %indvars.iv, 1
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %tmp3
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %r.030
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %r.1 = phi i32 [ %add, %if.then ], [ %r.030, %for.body ]
+  %exitcond = icmp eq i64 %tmp3, 10
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc
+  ret i32 %r.1
+}
+
+; Check that we don't crash when we analyze icmp with pointer-typed IV and a
+; pointer.
+; CHECK-LABEL: @ptr_cmp_crash
+; CHECK:   ret void
+define void @ptr_cmp_crash() {
+entry:
+  br label %while.body
+
+while.body:
+  %iv.0 = phi i32* [ getelementptr inbounds ([10 x i32], [10 x i32]* @known_constant, i64 0, i64 0), %entry ], [ %iv.1, %while.body ]
+  %iv.1 = getelementptr inbounds i32, i32* %iv.0, i64 1
+  %exitcond = icmp eq i32* %iv.1, getelementptr inbounds ([10 x i32], [10 x i32]* @known_constant, i64 0, i64 9)
+  br i1 %exitcond, label %loop.exit, label %while.body
+
+loop.exit:
+  ret void
+}
+
+; Check that we don't crash when we analyze ptrtoint cast.
+; CHECK-LABEL: @ptrtoint_cast_crash
+; CHECK:   ret void
+define void @ptrtoint_cast_crash(i8 * %a) {
+entry:
+  %limit = getelementptr i8, i8* %a, i64 512
+  br label %loop.body
+
+loop.body:
+  %iv.0 = phi i8* [ %a, %entry ], [ %iv.1, %loop.body ]
+  %cast = ptrtoint i8* %iv.0 to i64
+  %iv.1 = getelementptr inbounds i8, i8* %iv.0, i64 1
+  %exitcond = icmp ne i8* %iv.1, %limit
+  br i1 %exitcond, label %loop.body, label %loop.exit
+
+loop.exit:
+  ret void
+}
+
+; Loop unroller should be able to predict that a comparison would become
+; constant if the operands are pointers with the same base and constant
+; offsets.
+; We expect this loop to be unrolled, since most of its instructions would
+; become constant after it.
+; CHECK-LABEL: @ptr_cmp
+; CHECK-NOT:   br i1 %
+; CHECK:   ret i64
+define i64 @ptr_cmp(i8 * %a) {
+entry:
+  %limit = getelementptr i8, i8* %a, i64 40
+  %start.iv2 = getelementptr i8, i8* %a, i64 7
+  br label %loop.body
+
+loop.body:
+  %iv.0 = phi i8* [ %a, %entry ], [ %iv.1, %loop.body ]
+  %iv2.0 = phi i8* [ %start.iv2, %entry ], [ %iv2.1, %loop.body ]
+  %r.0 = phi i64 [ 0, %entry ], [ %r.1, %loop.body ]
+  %cast = ptrtoint i8* %iv.0 to i64
+  %cmp = icmp eq i8* %iv2.0, %iv.0
+  %sub = sext i1 %cmp to i64
+  %mul = mul i64 %sub, %cast
+  %r.1 = add i64 %r.0, %mul
+  %iv.1 = getelementptr inbounds i8, i8* %iv.0, i64 1
+  %iv2.1 = getelementptr inbounds i8, i8* %iv2.0, i64 1
+  %exitcond = icmp ne i8* %iv.1, %limit
+  br i1 %exitcond, label %loop.body, label %loop.exit
+
+loop.exit:
+  ret i64 %r.1
+}
-- 
2.34.1