From 76d9fb3a148bd33f4378aece787dec62882689e8 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 10 Nov 2015 16:48:53 +0000 Subject: [PATCH] add 'MustReduceDepth' as an objective/cost-metric for the MachineCombiner This is one of the problems noted in PR25016: https://llvm.org/bugs/show_bug.cgi?id=25016 and: http://lists.llvm.org/pipermail/llvm-dev/2015-October/090998.html The spilling problem is independent and not addressed by this patch. The MachineCombiner was doing reassociations that don't improve or even worsen the critical path. This is caused by inclusion of the "slack" factor when calculating the critical path of the original code sequence. If we don't add that, then we have a more conservative cost comparison of the old code sequence vs. a new sequence. The more liberal calculation must be preserved, however, for the AArch64 MULADD patterns because benchmark regressions were observed without that. The two failing test cases now have identical asm that does what we want: a + b + c + d ---> (a + b) + (c + d) Differential Revision: http://reviews.llvm.org/D13417 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252616 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineCombiner.cpp | 82 ++++++++++++++++++---------- test/CodeGen/X86/machine-combiner.ll | 11 ++-- 2 files changed, 59 insertions(+), 34 deletions(-) diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 11b6a79a8b1..e44568d4645 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -69,10 +69,10 @@ private: MachineTraceMetrics::Trace BlockTrace); bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, - MachineTraceMetrics::Trace BlockTrace, - SmallVectorImpl &InsInstrs, - DenseMap &InstrIdxForVirtReg, - bool NewCodeHasLessInsts); + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg, + MachineCombinerPattern Pattern); bool preservesResourceLen(MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, @@ -210,43 +210,70 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, return NewRootLatency; } -/// True when the new instruction sequence does not lengthen the critical path -/// and the new sequence has less instructions or the new sequence improves the -/// critical path. +/// The combiner's goal may differ based on which pattern it is attempting +/// to optimize. +enum class CombinerObjective { + MustReduceDepth, // The data dependency chain must be improved. + Default // The critical path must not be lengthened. +}; + +static CombinerObjective getCombinerObjective(MachineCombinerPattern P) { + // TODO: If C++ ever gets a real enum class, make this part of the + // MachineCombinerPattern class. + switch (P) { + case MachineCombinerPattern::REASSOC_AX_BY: + case MachineCombinerPattern::REASSOC_AX_YB: + case MachineCombinerPattern::REASSOC_XA_BY: + case MachineCombinerPattern::REASSOC_XA_YB: + return CombinerObjective::MustReduceDepth; + default: + return CombinerObjective::Default; + } +} + /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. /// The new code sequence ends in MI NewRoot. A necessary condition for the new /// sequence to replace the old sequence is that it cannot lengthen the critical -/// path. This is decided by the formula: -/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). -/// If the new sequence has an equal length critical path but does not reduce -/// the number of instructions (NewCodeHasLessInsts is false), then it is not -/// considered an improvement. The slack is the number of cycles Root can be -/// delayed before the critical patch becomes longer. +/// path. The definition of "improve" may be restricted by specifying that the +/// new path improves the data dependency chain (MustReduceDepth). bool MachineCombiner::improvesCriticalPathLen( MachineBasicBlock *MBB, MachineInstr *Root, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, DenseMap &InstrIdxForVirtReg, - bool NewCodeHasLessInsts) { + MachineCombinerPattern Pattern) { assert(TSchedModel.hasInstrSchedModelOrItineraries() && "Missing machine model\n"); // NewRoot is the last instruction in the \p InsInstrs vector. - // Get depth and latency of NewRoot. unsigned NewRootIdx = InsInstrs.size() - 1; MachineInstr *NewRoot = InsInstrs[NewRootIdx]; - unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); - unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); - // Get depth, latency and slack of Root. + // Get depth and latency of NewRoot and Root. + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; + dbgs() << " RootDepth: " << RootDepth << "\n"); + + // For a transform such as reassociation, the cost equation is + // conservatively calculated so that we must improve the depth (data + // dependency cycles) in the critical path to proceed with the transform. + // Being conservative also protects against inaccuracies in the underlying + // machine trace metrics and CPU models. + if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) + return NewRootDepth < RootDepth; + + // A more flexible cost calculation for the critical path includes the slack + // of the original code sequence. This may allow the transform to proceed + // even if the instruction depths (data dependency cycles) become worse. + unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); unsigned RootLatency = TSchedModel.computeInstrLatency(Root); unsigned RootSlack = BlockTrace.getInstrSlack(Root); - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; - dbgs() << " NewRootDepth: " << NewRootDepth - << " NewRootLatency: " << NewRootLatency << "\n"; - dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency - << " RootSlack: " << RootSlack << "\n"; + DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootLatency: " << RootLatency << "\n"; + dbgs() << " RootSlack: " << RootSlack << "\n"; dbgs() << " NewRootDepth + NewRootLatency = " << NewRootDepth + NewRootLatency << "\n"; dbgs() << " RootDepth + RootLatency + RootSlack = " @@ -255,10 +282,7 @@ bool MachineCombiner::improvesCriticalPathLen( unsigned NewCycleCount = NewRootDepth + NewRootLatency; unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; - if (NewCodeHasLessInsts) - return NewCycleCount <= OldCycleCount; - else - return NewCycleCount < OldCycleCount; + return NewCycleCount <= OldCycleCount; } /// helper routine to convert instructions into SC @@ -380,14 +404,14 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // in a single instruction. if (!NewInstCount) continue; + // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. if (doSubstitute(NewInstCount, OldInstCount) || (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - InstrIdxForVirtReg, - NewInstCount < OldInstCount) && + InstrIdxForVirtReg, P) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { for (auto *InstrPtr : InsInstrs) MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll index efcf51bba92..3fbb233696c 100644 --- a/test/CodeGen/X86/machine-combiner.ll +++ b/test/CodeGen/X86/machine-combiner.ll @@ -632,10 +632,10 @@ define double @reassociate_adds_from_calls() { ; AVX-NEXT: callq bar ; AVX-NEXT: vmovsd %xmm0, (%rsp) ; AVX-NEXT: callq bar -; AVX-NEXT: vmovsd (%rsp), %xmm1 -; AVX: vaddsd 8(%rsp), %xmm1, %xmm1 +; AVX-NEXT: vmovsd 8(%rsp), %xmm1 +; AVX: vaddsd 16(%rsp), %xmm1, %xmm1 +; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddsd 16(%rsp), %xmm0, %xmm0 %x0 = call double @bar() %x1 = call double @bar() @@ -656,9 +656,10 @@ define double @already_reassociated() { ; AVX-NEXT: callq bar ; AVX-NEXT: vmovsd %xmm0, (%rsp) ; AVX-NEXT: callq bar +; AVX-NEXT: vmovsd 8(%rsp), %xmm1 +; AVX: vaddsd 16(%rsp), %xmm1, %xmm1 ; AVX-NEXT: vaddsd (%rsp), %xmm0, %xmm0 -; AVX-NEXT: vaddsd 8(%rsp), %xmm0, %xmm0 -; AVX-NEXT: vaddsd 16(%rsp), %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 %x0 = call double @bar() %x1 = call double @bar() -- 2.34.1