struct LoopData {
typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
typedef SmallVector<BlockNode, 4> NodeList;
- LoopData *Parent; ///< The parent loop.
- bool IsPackaged; ///< Whether this has been packaged.
- uint32_t NumHeaders; ///< Number of headers.
- ExitMap Exits; ///< Successor edges (and weights).
- NodeList Nodes; ///< Header and the members of the loop.
- BlockMass BackedgeMass; ///< Mass returned to loop header.
+ typedef SmallVector<BlockMass, 1> HeaderMassList;
+ LoopData *Parent; ///< The parent loop.
+ bool IsPackaged; ///< Whether this has been packaged.
+ uint32_t NumHeaders; ///< Number of headers.
+ ExitMap Exits; ///< Successor edges (and weights).
+ NodeList Nodes; ///< Header and the members of the loop.
+ HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
BlockMass Mass;
Scaled64 Scale;
LoopData(LoopData *Parent, const BlockNode &Header)
- : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
+ : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header),
+ BackedgeMass(1) {}
template <class It1, class It2>
LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
It2 LastOther)
: Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
NumHeaders = Nodes.size();
Nodes.insert(Nodes.end(), FirstOther, LastOther);
+ BackedgeMass.resize(NumHeaders);
}
bool isHeader(const BlockNode &Node) const {
if (isIrreducible())
BlockNode getHeader() const { return Nodes[0]; }
bool isIrreducible() const { return NumHeaders > 1; }
+ HeaderMassList::difference_type headerIndexFor(const BlockNode &B) {
+ assert(isHeader(B) && "this is only valid on loop header blocks");
+ if (isIrreducible())
+ return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
+ Nodes.begin();
+ return 0;
+ }
+
NodeList::const_iterator members_begin() const {
return Nodes.begin() + NumHeaders;
}
/// \brief Compute the loop scale for a loop.
void computeLoopScale(LoopData &Loop);
+ /// Adjust the mass of all headers in an irreducible loop.
+ ///
+ /// Initially, irreducible loops are assumed to distribute their mass
+ /// equally among its headers. This can lead to wrong frequency estimates
+ /// since some headers may be executed more frequently than others.
+ ///
+ /// This adjusts header mass distribution so it matches the weights of
+ /// the backedges going into each of the loop headers.
+ void adjustLoopHeaderMass(LoopData &Loop);
+
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
/// as sub-loops, rather than arbitrarily shoving the problematic
/// blocks into the headers of the main irreducible SCC.
///
-/// - Backedge frequencies are assumed to be evenly split between the
-/// headers of a given irreducible SCC. Instead, we could track the
-/// backedge mass separately for each header, and adjust their relative
-/// frequencies.
-///
/// - Entry frequencies are assumed to be evenly split between the
/// headers of a given irreducible SCC, which is the only option if we
/// need to compute mass in the SCC before its parent loop. Instead,
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
+
+ adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
if (isLoopHeader(Resolved)) {
DEBUG(debugSuccessor("backedge"));
- Dist.addBackedge(OuterLoop->getHeader(), Weight);
+ Dist.addBackedge(Resolved, Weight);
return true;
}
// LoopScale == 1 / ExitMass
// ExitMass == HeadMass - BackedgeMass
- BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
+ BlockMass TotalBackedgeMass;
+ for (auto &Mass : Loop.BackedgeMass)
+ TotalBackedgeMass += Mass;
+ BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;
// Block scale stores the inverse of the scale. If this is an infinite loop,
// its exit mass will be zero. In this case, use an arbitrary scale for the
ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
- << " - " << Loop.BackedgeMass << ")\n"
+ << " - " << TotalBackedgeMass << ")\n"
<< " - scale = " << Loop.Scale << "\n");
}
Loop.IsPackaged = true;
}
+#ifndef NDEBUG
+static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
+ const DitheringDistributer &D, const BlockNode &T,
+ const BlockMass &M, const char *Desc) {
+ dbgs() << " => assign " << M << " (" << D.RemMass << ")";
+ if (Desc)
+ dbgs() << " [" << Desc << "]";
+ if (T.isValid())
+ dbgs() << " to " << BFI.getBlockName(T);
+ dbgs() << "\n";
+}
+#endif
+
void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
LoopData *OuterLoop,
Distribution &Dist) {
// Distribute mass to successors as laid out in Dist.
DitheringDistributer D(Dist, Mass);
-#ifndef NDEBUG
- auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
- const char *Desc) {
- dbgs() << " => assign " << M << " (" << D.RemMass << ")";
- if (Desc)
- dbgs() << " [" << Desc << "]";
- if (T.isValid())
- dbgs() << " to " << getBlockName(T);
- dbgs() << "\n";
- };
- (void)debugAssign;
-#endif
-
for (const Weight &W : Dist.Weights) {
// Check for a local edge (non-backedge and non-exit).
BlockMass Taken = D.takeMass(W.Amount);
if (W.Type == Weight::Local) {
Working[W.TargetNode.Index].getMass() += Taken;
- DEBUG(debugAssign(W.TargetNode, Taken, nullptr));
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
continue;
}
// Check for a backedge.
if (W.Type == Weight::Backedge) {
- OuterLoop->BackedgeMass += Taken;
- DEBUG(debugAssign(BlockNode(), Taken, "back"));
+ auto ix = OuterLoop->headerIndexFor(W.TargetNode);
+ OuterLoop->BackedgeMass[ix] += Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
continue;
}
// This must be an exit.
assert(W.Type == Weight::Exit);
OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
- DEBUG(debugAssign(W.TargetNode, Taken, "exit"));
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
}
}
void
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
OuterLoop.Exits.clear();
- OuterLoop.BackedgeMass = BlockMass::getEmpty();
+ for (auto &Mass : OuterLoop.BackedgeMass)
+ Mass = BlockMass::getEmpty();
auto O = OuterLoop.Nodes.begin() + 1;
for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I)
if (!Working[I->Index].isPackaged())
*O++ = *I;
OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
}
+
+void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
+ assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");
+
+ // Since the loop has more than one header block, the mass flowing back into
+ // each header will be different. Adjust the mass in each header loop to
+ // reflect the masses flowing through back edges.
+ //
+ // To do this, we distribute the initial mass using the backedge masses
+ // as weights for the distribution.
+ BlockMass LoopMass = BlockMass::getFull();
+ Distribution Dist;
+
+ DEBUG(dbgs() << "adjust-loop-header-mass:\n");
+ for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
+ auto &HeaderNode = Loop.Nodes[H];
+ auto &BackedgeMass = Loop.BackedgeMass[Loop.headerIndexFor(HeaderNode)];
+ DEBUG(dbgs() << " - Add back edge mass for node "
+ << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
+ Dist.addLocal(HeaderNode, BackedgeMass.getMass());
+ }
+
+ DitheringDistributer D(Dist, LoopMass);
+
+ DEBUG(dbgs() << " Distribute loop mass " << LoopMass
+ << " to headers using above weights\n");
+ for (const Weight &W : Dist.Weights) {
+ BlockMass Taken = D.takeMass(W.Amount);
+ assert(W.Type == Weight::Local && "all weights should be local");
+ Working[W.TargetNode.Index].getMass() = Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ }
+}
--- /dev/null
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+@g = global i32 0, align 4
+
+; Function Attrs: inlinehint noinline nounwind uwtable
+define i32 @_Z8hot_loopi(i32 %n) !prof !1 {
+entry:
+ %div = sdiv i32 %n, 2
+ %rem12 = and i32 %n, 1
+ %cmp = icmp eq i32 %rem12, 0
+ br i1 %cmp, label %Next, label %for.cond, !prof !2
+
+; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206
+for.cond: ; preds = %entry, %for.inc
+ %i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ]
+ %cmp1 = icmp slt i32 %i.0, %n
+ br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4
+
+; CHECK: - for.body: float = 24.52, int = 196
+for.body: ; preds = %for.cond
+ %rem213 = and i32 %i.0, 1
+ %cmp3 = icmp eq i32 %rem213, 0
+ br i1 %cmp3, label %if.then.4, label %Next, !prof !6
+
+; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98
+if.then.4: ; preds = %for.body
+ %0 = load i32, i32* @g, align 4, !tbaa !7
+ %mul = shl nsw i32 %0, 1
+ br label %for.inc
+
+; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99
+Next: ; preds = %for.body, %entry
+ %i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ]
+ %1 = load i32, i32* @g, align 4, !tbaa !7
+ %add = add nsw i32 %1, %n
+ br label %for.inc
+
+; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306
+for.inc: ; preds = %if.then.4, %Next
+ %storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ]
+ %i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ]
+ store i32 %storemerge, i32* @g, align 4, !tbaa !7
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond
+
+; CHECK: - for.end: float = 1.0, int = 8
+for.end: ; preds = %for.cond
+ %2 = load i32, i32* @g, align 4, !tbaa !7
+ ret i32 %2
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() !prof !11 {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 0
+
+for.body: ; preds = %for.body, %entry
+ %i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+ %call = tail call i32 @_Z8hot_loopi(i32 %i.04)
+ %inc = add nuw nsw i32 %i.04, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12
+}
+
+
+!1 = !{!"function_entry_count", i64 99}
+!2 = !{!"branch_weights", i32 50, i32 51}
+!3 = !{!"branch_weights", i32 2452, i32 100}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.unroll.disable"}
+!6 = !{!"branch_weights", i32 1227, i32 1226}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
+!11 = !{!"function_entry_count", i64 1}
+!12 = !{!"branch_weights", i32 2, i32 100}
; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
; infinite series sums to 1.
-;
-; Since the currently algorithm *always* assumes entry blocks are equal,
-; -block-freq gets the right answers here.
define void @crossloops(i2 %x) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
; CHECK-NEXT: block-frequency-info: crossloops
;
; This testcases uses non-trivial branch weights. The CHECK statements here
; will start to fail if we change -block-freq to be more accurate. Currently,
-; we expect left, right and top to be treated as equal headers.
+; loop headers are affected by the weight of their corresponding back edges.
define void @nonentry_header(i1 %x, i2 %y) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header':
; CHECK-NEXT: block-frequency-info: nonentry_header
br i1 %x, label %left, label %right, !prof !21
left:
-; CHECK-NEXT: left: float = 3.0,
+; CHECK-NEXT: left: float = 0.14{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
right:
-; CHECK-NEXT: right: float = 3.0,
+; CHECK-NEXT: right: float = 0.42{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
top:
-; CHECK-NEXT: top: float = 3.0,
+; CHECK-NEXT: top: float = 8.43{{[0-9]*}},
switch i2 %y, label %exit [ i2 0, label %left
i2 1, label %right
i2 2, label %bottom ], !prof !23