From 12f0dc6bb556976f22d89ebcf42bce273c9e7d38 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Thu, 14 Apr 2011 05:15:06 +0000
Subject: [PATCH] In the pre-RA scheduler, maintain cmp+br proximity.

This is done by pushing physical register definitions close to their
use, which happens to handle flag definitions if they're not glued to
the branch. This seems to be generally a good thing though, so I
didn't need to add a target hook yet.

The primary motivation is to generate code closer to what people
expect and rule out missed opportunities from enabling macro-op
fusion. As a side benefit, we get several 2-5% gains on x86
benchmarks. There is one regression:
SingleSource/Benchmarks/Shootout/lists slows down by 10%. But this is
an independent scheduler bug that will be tracked separately.
See rdar://problem/9283108.

Incidentally, pre-RA scheduling is only half the solution. Fixing the
later passes is tracked by:
[pre-RA-sched] on x86, attempt to schedule CMP/TEST adjacent with
condition jump

Fixes: Scheduler unnecessary break of cmp/jump fusion

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129508 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ScheduleDAG.h         |  7 +-
 .../SelectionDAG/ScheduleDAGRRList.cpp     | 66 +++++++++++++++----
 .../SelectionDAG/ScheduleDAGSDNodes.cpp    |  8 +++
 test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll | 65 ++++++++++++++++++
 test/CodeGen/X86/lsr-loop-exit-cond.ll     |  1 -
 test/CodeGen/X86/pr2659.ll                 |  3 +-
 test/CodeGen/X86/tail-opts.ll              |  6 +-
 test/CodeGen/X86/test-nofold.ll            |  8 +--
 8 files changed, 139 insertions(+), 25 deletions(-)
 create mode 100644 test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll

diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 9a2345b368d..281832c6077 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -260,6 +260,7 @@ namespace llvm {
     bool isAvailable : 1;               // True once available.
     bool isScheduled : 1;               // True once scheduled.
     bool isScheduleHigh : 1;            // True if preferable to schedule high.
+    bool isScheduleLow : 1;             // True if preferable to schedule low.
     bool isCloned : 1;                  // True if this node has been cloned.
     Sched::Preference SchedulingPref;   // Scheduling preference.
@@ -282,7 +283,7 @@ namespace llvm {
       isVRegCycle(false), isCall(false), isTwoAddress(false),
       isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
       isPending(false), isAvailable(false), isScheduled(false),
-      isScheduleHigh(false), isCloned(false),
+      isScheduleHigh(false), isScheduleLow(false), isCloned(false),
       SchedulingPref(Sched::None), isDepthCurrent(false),
       isHeightCurrent(false), Depth(0), Height(0), CopyDstRC(NULL),
       CopySrcRC(NULL) {}
@@ -296,7 +297,7 @@ namespace llvm {
       isVRegCycle(false), isCall(false), isTwoAddress(false),
       isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
       isPending(false), isAvailable(false), isScheduled(false),
-      isScheduleHigh(false), isCloned(false),
+      isScheduleHigh(false), isScheduleLow(false), isCloned(false),
       SchedulingPref(Sched::None), isDepthCurrent(false),
       isHeightCurrent(false), Depth(0), Height(0), CopyDstRC(NULL),
       CopySrcRC(NULL) {}
@@ -309,7 +310,7 @@ namespace llvm {
       isVRegCycle(false), isCall(false), isTwoAddress(false),
       isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
       isPending(false), isAvailable(false), isScheduled(false),
-      isScheduleHigh(false), isCloned(false),
+      isScheduleHigh(false), isScheduleLow(false), isCloned(false),
       SchedulingPref(Sched::None), isDepthCurrent(false),
       isHeightCurrent(false), Depth(0), Height(0), CopyDstRC(NULL),
       CopySrcRC(NULL) {}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index ac2f3d5c851..faad66f2900 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -71,6 +71,7 @@ static cl::opt<bool> DisableSchedCycles(
   cl::desc("Disable cycle-level precision during preRA scheduling"));
 
 // Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
 static cl::opt<bool> DisableSchedRegPressure(
   "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
   cl::desc("Disable regpressure priority in sched=list-ilp"));
@@ -80,6 +81,9 @@ static cl::opt<bool> DisableSchedLiveUses(
   "disable-sched-live-uses", cl::Hidden, cl::init(true),
   cl::desc("Disable live use priority in sched=list-ilp"));
 static cl::opt<bool> DisableSchedVRegCycle(
   "disable-sched-vrcycle", cl::Hidden, cl::init(false),
   cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+  "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+  cl::desc("Disable physreg def-use affinity"));
 static cl::opt<bool> DisableSchedStalls(
   "disable-sched-stalls", cl::Hidden, cl::init(true),
   cl::desc("Disable no-stall priority in sched=list-ilp"));
@@ -1638,6 +1642,20 @@ ILPBURRPriorityQueue;
 // Static Node Priority for Register Pressure Reduction
 //===----------------------------------------------------------------------===//
 
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+  bool LSchedLow = left->isScheduleLow;
+  bool RSchedLow = right->isScheduleLow;
+  if (LSchedLow != RSchedLow)
+    return LSchedLow < RSchedLow ? 1 : -1;
+  return 0;
+}
+
 /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
 /// Smaller number is the higher priority.
 static unsigned
@@ -2198,25 +2216,32 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
 }
 
 static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+  // Schedule physical register definitions close to their use. This is
+  // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+  // long as shortening physreg live ranges is generally good, we can defer
+  // creating a subtarget hook.
+  if (!DisableSchedPhysRegJoin) {
+    bool LHasPhysReg = left->hasPhysRegDefs;
+    bool RHasPhysReg = right->hasPhysRegDefs;
+    if (LHasPhysReg != RHasPhysReg) {
+      DEBUG(++FactorCount[FactRegUses]);
+#ifndef NDEBUG
+      const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+#endif
+      DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
+            << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+            << PhysRegMsg[RHasPhysReg] << "\n");
+      return LHasPhysReg < RHasPhysReg;
+    }
+  }
+
+  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
   if (LPriority != RPriority) {
     DEBUG(++FactorCount[FactStatic]);
     return LPriority > RPriority;
   }
-  else if(LPriority == 0) {
-    // Schedule zero-latency TokenFactor below any other special
-    // nodes. The alternative may be to avoid artificially boosting the
-    // TokenFactor's height when it is scheduled, but we currently rely on an
-    // instruction's final height to equal the cycle in which it is scheduled,
-    // so heights are monotonically increasing.
-    unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0;
-    unsigned ROpc = right->getNode() ? right->getNode()->getOpcode() : 0;
-    if (LOpc == ISD::TokenFactor)
-      return false;
-    if (ROpc == ISD::TokenFactor)
-      return true;
-  }
 
   // Try schedule def + use closer when Sethi-Ullman numbers are the same.
   // e.g.
@@ -2275,11 +2300,17 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
 
 // Bottom up
 bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   return BURRSort(left, right, SPQ);
 }
 
 // Source order, otherwise bottom up.
 bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   unsigned LOrder = SPQ->getNodeOrdering(left);
   unsigned ROrder = SPQ->getNodeOrdering(right);
@@ -2311,6 +2342,9 @@ bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
 
 // Return true if right should be scheduled with higher priority than left.
 bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   if (left->isCall || right->isCall)
     // No way to compute latency of calls.
     return BURRSort(left, right, SPQ);
@@ -2376,6 +2410,9 @@ static bool canEnableCoalescing(SUnit *SU) {
 
 // list-ilp is currently an experimental scheduler that allows various
 // heuristics to be enabled prior to the normal register reduction logic.
 bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   if (left->isCall || right->isCall)
     // No way to compute latency of calls.
     return BURRSort(left, right, SPQ);
@@ -2734,6 +2771,9 @@ static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU,
 
 // Top down
 bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res < 0;
+
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
   bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 078533be843..b258e6eefe2 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -87,6 +87,8 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
   SU->isCommutable = Old->isCommutable;
   SU->hasPhysRegDefs = Old->hasPhysRegDefs;
   SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+  SU->isScheduleHigh = Old->isScheduleHigh;
+  SU->isScheduleLow = Old->isScheduleLow;
   SU->SchedulingPref = Old->SchedulingPref;
   Old->isCloned = true;
   return SU;
@@ -335,6 +337,12 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
       if (!HasGlueUse) break;
     }
 
+    // Schedule zero-latency TokenFactor below any nodes that may increase the
+    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+    // have false stalls.
+    if (NI->getOpcode() == ISD::TokenFactor)
+      NodeSUnit->isScheduleLow = true;
+
     // If there are glue operands involved, N is now the bottom-most node
     // of the sequence of nodes that are glued together.
     // Update the SUnit.
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
new file mode 100644
index 00000000000..07b1971218c
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=yonah | FileCheck %s
+; Reduced from JavaScriptCore
+
+%"class.JSC::CodeLocationCall" = type { [8 x i8] }
+%"class.JSC::JSGlobalData" = type { [4 x i8] }
+%"class.JSC::FunctionPtr" = type { i8* }
+%"class.JSC::Structure" = type { [4 x i8] }
+%"class.JSC::UString" = type { i8* }
+%"class.JSC::JSString" = type { [16 x i8], i32, %"class.JSC::UString", i32 }
+
+declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* nocapture, i8*, %"class.JSC::FunctionPtr"* nocapture) nounwind noinline ssp
+
+; Avoid hoisting the test above loads or copies
+; CHECK: %entry
+; CHECK: cmpq
+; CHECK-NOT: mov
+; CHECK: jb
+define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
+entry:
+  %0 = load i8** null, align 8
+  %tmp13 = bitcast i8* %0 to %"class.JSC::CodeLocationCall"*
+  %tobool.i.i.i = icmp ugt i8* undef, inttoptr (i64 281474976710655 to i8*)
+  %or.cond.i = and i1 %tobool.i.i.i, undef
+  br i1 %or.cond.i, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %entry
+  br i1 undef, label %if.then.i.i.i, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.then.i.i.i:                                    ; preds = %if.then.i
+  %conv.i.i.i.i = trunc i64 undef to i32
+  br label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.end.i:                                         ; preds = %entry
+  br i1 undef, label %land.rhs.i121.i, label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+land.rhs.i121.i:                                  ; preds = %if.end.i
+  %tmp.i.i117.i = load %"class.JSC::Structure"** undef, align 8
+  br label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+_ZNK3JSC7JSValue8isStringEv.exit122.i:            ; preds = %land.rhs.i121.i, %if.end.i
+  %brmerge.i = or i1 undef, false
+  %or.cond = or i1 false, %brmerge.i
+  br i1 %or.cond, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit, label %if.then.i92.i
+
+if.then.i92.i:                                    ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i
+  tail call void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"* undef, %"class.JSC::CodeLocationCall"* %tmp13) nounwind
+  unreachable
+
+_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit: ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i, %if.then.i.i.i, %if.then.i
+  %1 = load i8** undef, align 8
+  br i1 undef, label %do.end39, label %do.body27
+
+do.body27:                                        ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  %tmp30 = bitcast i8* %1 to %"class.JSC::JSGlobalData"*
+  %2 = getelementptr inbounds i8** %args, i64 -1
+  %3 = bitcast i8** %2 to %"class.JSC::FunctionPtr"*
+  tail call fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* %tmp30, i8* undef, %"class.JSC::FunctionPtr"* %3)
+  unreachable
+
+do.end39:                                         ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  ret i32 undef
+}
+
+declare void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"*, %"class.JSC::CodeLocationCall"*)
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index d33cc3a0966..938023ffe03 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: llc -march=x86-64 < %s | FileCheck %s
 
 ; CHECK: decq
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index 54d043d54f8..ef0f9ea8b03 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -18,7 +18,8 @@ forcond.preheader:                                ; preds = %entry
 ; CHECK: movl $1
 ; CHECK-NOT: xorl
 ; CHECK-NOT: movl
-; CHECK-NEXT: je
+; CHECK-NOT: LBB
+; CHECK: je
 
 ifthen:                                           ; preds = %entry
   ret i32 0
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 424bd2151ca..77710ad56ba 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -109,15 +109,15 @@ altret:
 
 ; CHECK: dont_merge_oddly:
 ; CHECK-NOT:   ret
-; CHECK:        ucomiss %xmm1, %xmm2
+; CHECK:        ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_3
-; CHECK-NEXT:   ucomiss %xmm0, %xmm1
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   ja .LBB2_4
 ; CHECK-NEXT: .LBB2_2:
 ; CHECK-NEXT:   movb $1, %al
 ; CHECK-NEXT:   ret
 ; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT:   ucomiss %xmm0, %xmm2
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorb %al, %al
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index f1063dcabf4..97db1b340e8 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -2,10 +2,10 @@
 ; rdar://5752025
 
 ; We want:
-; CHECK:      movl 4(%esp), %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: cmovel %ecx, %eax
+; CHECK:      movl $42, %ecx
+; CHECK-NEXT: movl 4(%esp), %eax
+; CHECK-NEXT: andl $15, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
 ; CHECK-NEXT: ret
 ;
 ; We don't want:
-- 
2.34.1
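
For readers who want the new heuristics in isolation, below is a minimal,
self-contained C++ sketch of the two comparator biases this patch introduces.
It is not LLVM code: FakeSUnit, CheckSpecialNodes, and ScheduleRightFirst are
hypothetical stand-ins that only mirror SUnit, checkSpecialNodes(), and the
physreg test at the top of BURRSort(), assuming a bottom-up picker where
"scheduled first" means "placed lowest in the block".

// Standalone sketch of the scheduling biases added by this patch.
// All names are hypothetical; only the comparison logic mirrors
// checkSpecialNodes() and BURRSort() in the diff above.
#include <cstdio>

struct FakeSUnit {
  unsigned NodeNum;
  unsigned SethiUllman;  // Static priority, assumed precomputed.
  bool HasPhysRegDefs;   // E.g. a CMP that defines EFLAGS.
  bool IsScheduleLow;    // E.g. a zero-latency TokenFactor.
};

// Mirrors checkSpecialNodes(): -1 schedules Right above Left, 1 schedules
// Left above Right, 0 means no bias.
static int CheckSpecialNodes(const FakeSUnit &L, const FakeSUnit &R) {
  if (L.IsScheduleLow != R.IsScheduleLow)
    return L.IsScheduleLow < R.IsScheduleLow ? 1 : -1;
  return 0;
}

// Bottom-up predicate like bu_ls_rr_sort: true means Right is picked first
// and therefore placed lower in the final schedule.
static bool ScheduleRightFirst(const FakeSUnit &L, const FakeSUnit &R) {
  if (int Res = CheckSpecialNodes(L, R))
    return Res > 0;
  // Physreg join: pick the physreg-defining node first (bottom-up) so the
  // definition lands next to its use, e.g. a CMP directly above the branch
  // that reads EFLAGS.
  if (L.HasPhysRegDefs != R.HasPhysRegDefs)
    return L.HasPhysRegDefs < R.HasPhysRegDefs;
  // Otherwise fall back to the static priority, as BURRSort() does.
  return L.SethiUllman > R.SethiUllman;
}

int main() {
  FakeSUnit Cmp = {0, 1, true, false};   // defines a physreg (flags)
  FakeSUnit Add = {1, 1, false, false};  // independent arithmetic
  // Prints "yes": with equal static priorities, the flag-defining CMP is
  // picked first bottom-up and lands adjacent to the consuming branch,
  // which is exactly the cmp+jmp proximity needed for macro-op fusion.
  std::printf("schedule CMP below ADD: %s\n",
              ScheduleRightFirst(Add, Cmp) ? "yes" : "no");
  return 0;
}

The same ordering falls out for the TokenFactor case: a node with
IsScheduleLow set always compares as "pick first" under CheckSpecialNodes,
so it sinks to the bottom before any of the other heuristics are consulted.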