From 3f53fc8f5f56f6627785ad22bc4dd1471ee6b07d Mon Sep 17 00:00:00 2001 From: Diego Novillo Date: Tue, 16 Jun 2015 19:10:58 +0000 Subject: [PATCH] Fix PR 23525 - Separate header mass propagation in irregular loops. Summary: When propagating mass through irregular loops, the mass flowing through each loop header may not be equal. This was causing wrong frequencies to be computed for irregular loop headers. Fixed by keeping track of masses flowing through each of the headers in an irregular loop. To do this, we now keep track of per-header backedge weights. After the loop mass is distributed through the loop, the backedge weights are used to re-distribute the loop mass to the loop headers. Since each backedge will have a mass proportional to the different branch weights, the loop headers will end up with a more approximate weight distribution (as opposed to the current distribution that assumes that every loop header is the same). Reviewers: dexonsmith Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D10348 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239843 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Analysis/BlockFrequencyInfoImpl.h | 42 +++++++--- lib/Analysis/BlockFrequencyInfoImpl.cpp | 80 ++++++++++++++----- test/Analysis/BlockFrequencyInfo/PR23525.ll | 80 +++++++++++++++++++ .../BlockFrequencyInfo/irreducible.ll | 11 +-- 4 files changed, 173 insertions(+), 40 deletions(-) create mode 100644 test/Analysis/BlockFrequencyInfo/PR23525.ll diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 85a299b6ddd..02084779830 100644 --- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -196,23 +196,26 @@ public: struct LoopData { typedef SmallVector, 4> ExitMap; typedef SmallVector NodeList; - LoopData *Parent; ///< The parent loop. - bool IsPackaged; ///< Whether this has been packaged. - uint32_t NumHeaders; ///< Number of headers. - ExitMap Exits; ///< Successor edges (and weights). - NodeList Nodes; ///< Header and the members of the loop. - BlockMass BackedgeMass; ///< Mass returned to loop header. + typedef SmallVector HeaderMassList; + LoopData *Parent; ///< The parent loop. + bool IsPackaged; ///< Whether this has been packaged. + uint32_t NumHeaders; ///< Number of headers. + ExitMap Exits; ///< Successor edges (and weights). + NodeList Nodes; ///< Header and the members of the loop. + HeaderMassList BackedgeMass; ///< Mass returned to each loop header. BlockMass Mass; Scaled64 Scale; LoopData(LoopData *Parent, const BlockNode &Header) - : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} + : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header), + BackedgeMass(1) {} template LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, It2 LastOther) : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) { NumHeaders = Nodes.size(); Nodes.insert(Nodes.end(), FirstOther, LastOther); + BackedgeMass.resize(NumHeaders); } bool isHeader(const BlockNode &Node) const { if (isIrreducible()) @@ -223,6 +226,14 @@ public: BlockNode getHeader() const { return Nodes[0]; } bool isIrreducible() const { return NumHeaders > 1; } + HeaderMassList::difference_type headerIndexFor(const BlockNode &B) { + assert(isHeader(B) && "this is only valid on loop header blocks"); + if (isIrreducible()) + return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) - + Nodes.begin(); + return 0; + } + NodeList::const_iterator members_begin() const { return Nodes.begin() + NumHeaders; } @@ -431,6 +442,16 @@ public: /// \brief Compute the loop scale for a loop. void computeLoopScale(LoopData &Loop); + /// Adjust the mass of all headers in an irreducible loop. + /// + /// Initially, irreducible loops are assumed to distribute their mass + /// equally among its headers. This can lead to wrong frequency estimates + /// since some headers may be executed more frequently than others. + /// + /// This adjusts header mass distribution so it matches the weights of + /// the backedges going into each of the loop headers. + void adjustLoopHeaderMass(LoopData &Loop); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -735,11 +756,6 @@ void IrreducibleGraph::addEdges(const BlockNode &Node, /// as sub-loops, rather than arbitrarily shoving the problematic /// blocks into the headers of the main irreducible SCC. /// -/// - Backedge frequencies are assumed to be evenly split between the -/// headers of a given irreducible SCC. Instead, we could track the -/// backedge mass separately for each header, and adjust their relative -/// frequencies. -/// /// - Entry frequencies are assumed to be evenly split between the /// headers of a given irreducible SCC, which is the only option if we /// need to compute mass in the SCC before its parent loop. Instead, @@ -1042,6 +1058,8 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); + + adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 456cee179f0..88fbf7cc4cb 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -286,7 +286,7 @@ bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, if (isLoopHeader(Resolved)) { DEBUG(debugSuccessor("backedge")); - Dist.addBackedge(OuterLoop->getHeader(), Weight); + Dist.addBackedge(Resolved, Weight); return true; } @@ -349,7 +349,10 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass - BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; + BlockMass TotalBackedgeMass; + for (auto &Mass : Loop.BackedgeMass) + TotalBackedgeMass += Mass; + BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; // Block scale stores the inverse of the scale. If this is an infinite loop, // its exit mass will be zero. In this case, use an arbitrary scale for the @@ -358,7 +361,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse(); DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() - << " - " << Loop.BackedgeMass << ")\n" + << " - " << TotalBackedgeMass << ")\n" << " - scale = " << Loop.Scale << "\n"); } @@ -375,6 +378,19 @@ void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { Loop.IsPackaged = true; } +#ifndef NDEBUG +static void debugAssign(const BlockFrequencyInfoImplBase &BFI, + const DitheringDistributer &D, const BlockNode &T, + const BlockMass &M, const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << BFI.getBlockName(T); + dbgs() << "\n"; +} +#endif + void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, LoopData *OuterLoop, Distribution &Dist) { @@ -384,25 +400,12 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, // Distribute mass to successors as laid out in Dist. DitheringDistributer D(Dist, Mass); -#ifndef NDEBUG - auto debugAssign = [&](const BlockNode &T, const BlockMass &M, - const char *Desc) { - dbgs() << " => assign " << M << " (" << D.RemMass << ")"; - if (Desc) - dbgs() << " [" << Desc << "]"; - if (T.isValid()) - dbgs() << " to " << getBlockName(T); - dbgs() << "\n"; - }; - (void)debugAssign; -#endif - for (const Weight &W : Dist.Weights) { // Check for a local edge (non-backedge and non-exit). BlockMass Taken = D.takeMass(W.Amount); if (W.Type == Weight::Local) { Working[W.TargetNode.Index].getMass() += Taken; - DEBUG(debugAssign(W.TargetNode, Taken, nullptr)); + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); continue; } @@ -411,15 +414,16 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, // Check for a backedge. if (W.Type == Weight::Backedge) { - OuterLoop->BackedgeMass += Taken; - DEBUG(debugAssign(BlockNode(), Taken, "back")); + auto ix = OuterLoop->headerIndexFor(W.TargetNode); + OuterLoop->BackedgeMass[ix] += Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); continue; } // This must be an exit. assert(W.Type == Weight::Exit); OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); - DEBUG(debugAssign(W.TargetNode, Taken, "exit")); + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); } } @@ -713,10 +717,44 @@ BlockFrequencyInfoImplBase::analyzeIrreducible( void BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { OuterLoop.Exits.clear(); - OuterLoop.BackedgeMass = BlockMass::getEmpty(); + for (auto &Mass : OuterLoop.BackedgeMass) + Mass = BlockMass::getEmpty(); auto O = OuterLoop.Nodes.begin() + 1; for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) if (!Working[I->Index].isPackaged()) *O++ = *I; OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); } + +void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { + assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); + + // Since the loop has more than one header block, the mass flowing back into + // each header will be different. Adjust the mass in each header loop to + // reflect the masses flowing through back edges. + // + // To do this, we distribute the initial mass using the backedge masses + // as weights for the distribution. + BlockMass LoopMass = BlockMass::getFull(); + Distribution Dist; + + DEBUG(dbgs() << "adjust-loop-header-mass:\n"); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + auto &BackedgeMass = Loop.BackedgeMass[Loop.headerIndexFor(HeaderNode)]; + DEBUG(dbgs() << " - Add back edge mass for node " + << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + } + + DitheringDistributer D(Dist, LoopMass); + + DEBUG(dbgs() << " Distribute loop mass " << LoopMass + << " to headers using above weights\n"); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} diff --git a/test/Analysis/BlockFrequencyInfo/PR23525.ll b/test/Analysis/BlockFrequencyInfo/PR23525.ll new file mode 100644 index 00000000000..43bcd576412 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/PR23525.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +@g = global i32 0, align 4 + +; Function Attrs: inlinehint noinline nounwind uwtable +define i32 @_Z8hot_loopi(i32 %n) !prof !1 { +entry: + %div = sdiv i32 %n, 2 + %rem12 = and i32 %n, 1 + %cmp = icmp eq i32 %rem12, 0 + br i1 %cmp, label %Next, label %for.cond, !prof !2 + +; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206 +for.cond: ; preds = %entry, %for.inc + %i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ] + %cmp1 = icmp slt i32 %i.0, %n + br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4 + +; CHECK: - for.body: float = 24.52, int = 196 +for.body: ; preds = %for.cond + %rem213 = and i32 %i.0, 1 + %cmp3 = icmp eq i32 %rem213, 0 + br i1 %cmp3, label %if.then.4, label %Next, !prof !6 + +; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98 +if.then.4: ; preds = %for.body + %0 = load i32, i32* @g, align 4, !tbaa !7 + %mul = shl nsw i32 %0, 1 + br label %for.inc + +; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99 +Next: ; preds = %for.body, %entry + %i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ] + %1 = load i32, i32* @g, align 4, !tbaa !7 + %add = add nsw i32 %1, %n + br label %for.inc + +; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306 +for.inc: ; preds = %if.then.4, %Next + %storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ] + %i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ] + store i32 %storemerge, i32* @g, align 4, !tbaa !7 + %inc = add nsw i32 %i.2, 1 + br label %for.cond + +; CHECK: - for.end: float = 1.0, int = 8 +for.end: ; preds = %for.cond + %2 = load i32, i32* @g, align 4, !tbaa !7 + ret i32 %2 +} + +; Function Attrs: nounwind uwtable +define i32 @main() !prof !11 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 0 + +for.body: ; preds = %for.body, %entry + %i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ] + %call = tail call i32 @_Z8hot_loopi(i32 %i.04) + %inc = add nuw nsw i32 %i.04, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12 +} + + +!1 = !{!"function_entry_count", i64 99} +!2 = !{!"branch_weights", i32 50, i32 51} +!3 = !{!"branch_weights", i32 2452, i32 100} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = !{!"branch_weights", i32 1227, i32 1226} +!7 = !{!8, !8, i64 0} +!8 = !{!"int", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 2, i32 100} diff --git a/test/Analysis/BlockFrequencyInfo/irreducible.ll b/test/Analysis/BlockFrequencyInfo/irreducible.ll index b275aae6279..e58b5eba4e6 100644 --- a/test/Analysis/BlockFrequencyInfo/irreducible.ll +++ b/test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -130,9 +130,6 @@ exit: ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This ; infinite series sums to 1. -; -; Since the currently algorithm *always* assumes entry blocks are equal, -; -block-freq gets the right answers here. define void @crossloops(i2 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': ; CHECK-NEXT: block-frequency-info: crossloops @@ -386,7 +383,7 @@ exit: ; ; This testcases uses non-trivial branch weights. The CHECK statements here ; will start to fail if we change -block-freq to be more accurate. Currently, -; we expect left, right and top to be treated as equal headers. +; loop headers are affected by the weight of their corresponding back edges. define void @nonentry_header(i1 %x, i2 %y) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': ; CHECK-NEXT: block-frequency-info: nonentry_header @@ -395,15 +392,15 @@ entry: br i1 %x, label %left, label %right, !prof !21 left: -; CHECK-NEXT: left: float = 3.0, +; CHECK-NEXT: left: float = 0.14{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 right: -; CHECK-NEXT: right: float = 3.0, +; CHECK-NEXT: right: float = 0.42{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 top: -; CHECK-NEXT: top: float = 3.0, +; CHECK-NEXT: top: float = 8.43{{[0-9]*}}, switch i2 %y, label %exit [ i2 0, label %left i2 1, label %right i2 2, label %bottom ], !prof !23 -- 2.34.1