bool isAvailable : 1; // True once available.
bool isScheduled : 1; // True once scheduled.
bool isScheduleHigh : 1; // True if preferable to schedule high.
+ bool isScheduleLow : 1; // True if preferable to schedule low.
bool isCloned : 1; // True if this node has been cloned.
Sched::Preference SchedulingPref; // Scheduling preference.
isVRegCycle(false), isCall(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
- isScheduleHigh(false), isCloned(false),
+ isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
CopyDstRC(NULL), CopySrcRC(NULL) {}
isVRegCycle(false), isCall(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
- isScheduleHigh(false), isCloned(false),
+ isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
CopyDstRC(NULL), CopySrcRC(NULL) {}
isVRegCycle(false), isCall(false), isTwoAddress(false),
isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
isPending(false), isAvailable(false), isScheduled(false),
- isScheduleHigh(false), isCloned(false),
+ isScheduleHigh(false), isScheduleLow(false), isCloned(false),
SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
CopyDstRC(NULL), CopySrcRC(NULL) {}
cl::desc("Disable cycle-level precision during preRA scheduling"));
// Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
static cl::opt<bool> DisableSchedRegPressure(
"disable-sched-reg-pressure", cl::Hidden, cl::init(false),
cl::desc("Disable regpressure priority in sched=list-ilp"));
static cl::opt<bool> DisableSchedVRegCycle(
"disable-sched-vrcycle", cl::Hidden, cl::init(false),
cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+ "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+ cl::desc("Disable physreg def-use affinity"));
static cl::opt<bool> DisableSchedStalls(
"disable-sched-stalls", cl::Hidden, cl::init(true),
cl::desc("Disable no-stall priority in sched=list-ilp"));
// Static Node Priority for Register Pressure Reduction
//===----------------------------------------------------------------------===//
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
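+//
+// As a worked example of the convention (read directly off the code below):
+//   left->isScheduleLow=false, right->isScheduleLow=true  => return  1
+//   left->isScheduleLow=true,  right->isScheduleLow=false => return -1
+//   both flags equal                                      => return  0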
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+ bool LSchedLow = left->isScheduleLow;
+ bool RSchedLow = right->isScheduleLow;
+ if (LSchedLow != RSchedLow)
+ return LSchedLow < RSchedLow ? 1 : -1;
+ return 0;
+}
+
/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number.
/// A smaller number means a higher priority.
static unsigned
}
static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+ // Schedule physical register definitions close to their use. This is
+ // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+ // long as shortening physreg live ranges is generally good, we can defer
+ // creating a subtarget hook.
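+ // (As a concrete instance of the motivation above: on x86 a compare
+ // defines the EFLAGS physical register and a conditional jump reads it,
+ // so keeping the pair adjacent is what permits macro-fusion.)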
+ if (!DisableSchedPhysRegJoin) {
+ bool LHasPhysReg = left->hasPhysRegDefs;
+ bool RHasPhysReg = right->hasPhysRegDefs;
+ if (LHasPhysReg != RHasPhysReg) {
+ DEBUG(++FactorCount[FactRegUses]);
+ #ifndef NDEBUG
+ const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+ #endif
+ DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
+ << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+ << PhysRegMsg[RHasPhysReg] << "\n");
+ return LHasPhysReg < RHasPhysReg;
+ }
+ }
+
+ // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
unsigned LPriority = SPQ->getNodePriority(left);
unsigned RPriority = SPQ->getNodePriority(right);
if (LPriority != RPriority) {
DEBUG(++FactorCount[FactStatic]);
return LPriority > RPriority;
}
- else if(LPriority == 0) {
- // Schedule zero-latency TokenFactor below any other special
- // nodes. The alternative may be to avoid artificially boosting the
- // TokenFactor's height when it is scheduled, but we currently rely on an
- // instruction's final height to equal the cycle in which it is scheduled,
- // so heights are monotonically increasing.
- unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0;
- unsigned ROpc = right->getNode() ? right->getNode()->getOpcode() : 0;
- if (LOpc == ISD::TokenFactor)
- return false;
- if (ROpc == ISD::TokenFactor)
- return true;
- }
// Try to schedule def + use closer when Sethi-Ullman numbers are the same.
// e.g.
// Bottom up
bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
return BURRSort(left, right, SPQ);
}
// Source order, otherwise bottom up.
bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
unsigned LOrder = SPQ->getNodeOrdering(left);
unsigned ROrder = SPQ->getNodeOrdering(right);
// Return true if right should be scheduled with higher priority than left.
bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
if (left->isCall || right->isCall)
// No way to compute latency of calls.
return BURRSort(left, right, SPQ);
// list-ilp is currently an experimental scheduler that allows various
// heuristics to be enabled prior to the normal register reduction logic.
bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
if (left->isCall || right->isCall)
// No way to compute latency of calls.
return BURRSort(left, right, SPQ);
// Top down
bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res < 0;
+
unsigned LPriority = SPQ->getNodePriority(left);
unsigned RPriority = SPQ->getNodePriority(right);
bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();
SU->isCommutable = Old->isCommutable;
SU->hasPhysRegDefs = Old->hasPhysRegDefs;
SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+ SU->isScheduleHigh = Old->isScheduleHigh;
+ SU->isScheduleLow = Old->isScheduleLow;
SU->SchedulingPref = Old->SchedulingPref;
Old->isCloned = true;
return SU;
if (!HasGlueUse) break;
}
+ // Schedule zero-latency TokenFactor below any nodes that may increase the
+ // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+ // have false stalls.
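+ // (TokenFactor merely merges chain values, e.g. the chains of several
+ // independent stores, and emits no machine instruction, so sinking it in
+ // the schedule costs nothing.)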
+ if (NI->getOpcode() == ISD::TokenFactor)
+ NodeSUnit->isScheduleLow = true;
+
// If there are glue operands involved, N is now the bottom-most node
// of the sequence of nodes that are glued together.
// Update the SUnit.
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=yonah | FileCheck %s
+; Reduced from JavaScriptCore
+
+%"class.JSC::CodeLocationCall" = type { [8 x i8] }
+%"class.JSC::JSGlobalData" = type { [4 x i8] }
+%"class.JSC::FunctionPtr" = type { i8* }
+%"class.JSC::Structure" = type { [4 x i8] }
+%"class.JSC::UString" = type { i8* }
+%"class.JSC::JSString" = type { [16 x i8], i32, %"class.JSC::UString", i32 }
+
+declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* nocapture, i8*, %"class.JSC::FunctionPtr"* nocapture) nounwind noinline ssp
+
+; Avoid hoisting the test above loads or copies
+; CHECK: %entry
+; CHECK: cmpq
+; CHECK-NOT: mov
+; CHECK: jb
+define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
+entry:
+ %0 = load i8** null, align 8
+ %tmp13 = bitcast i8* %0 to %"class.JSC::CodeLocationCall"*
+ %tobool.i.i.i = icmp ugt i8* undef, inttoptr (i64 281474976710655 to i8*)
+ %or.cond.i = and i1 %tobool.i.i.i, undef
+ br i1 %or.cond.i, label %if.then.i, label %if.end.i
+
+if.then.i: ; preds = %entry
+ br i1 undef, label %if.then.i.i.i, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.then.i.i.i: ; preds = %if.then.i
+ %conv.i.i.i.i = trunc i64 undef to i32
+ br label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.end.i: ; preds = %entry
+ br i1 undef, label %land.rhs.i121.i, label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+land.rhs.i121.i: ; preds = %if.end.i
+ %tmp.i.i117.i = load %"class.JSC::Structure"** undef, align 8
+ br label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+_ZNK3JSC7JSValue8isStringEv.exit122.i: ; preds = %land.rhs.i121.i, %if.end.i
+ %brmerge.i = or i1 undef, false
+ %or.cond = or i1 false, %brmerge.i
+ br i1 %or.cond, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit, label %if.then.i92.i
+
+if.then.i92.i: ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i
+ tail call void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"* undef, %"class.JSC::CodeLocationCall"* %tmp13) nounwind
+ unreachable
+
+_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit: ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i, %if.then.i.i.i, %if.then.i
+
+ %1 = load i8** undef, align 8
+ br i1 undef, label %do.end39, label %do.body27
+
+do.body27: ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+ %tmp30 = bitcast i8* %1 to %"class.JSC::JSGlobalData"*
+ %2 = getelementptr inbounds i8** %args, i64 -1
+ %3 = bitcast i8** %2 to %"class.JSC::FunctionPtr"*
+ tail call fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* %tmp30, i8* undef, %"class.JSC::FunctionPtr"* %3)
+ unreachable
+
+do.end39: ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+ ret i32 undef
+}
+
+declare void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"*, %"class.JSC::CodeLocationCall"*)
-; XFAIL: *
; RUN: llc -march=x86-64 < %s | FileCheck %s
; CHECK: decq
; CHECK: movl $1
; CHECK-NOT: xorl
; CHECK-NOT: movl
-; CHECK-NEXT: je
+; CHECK-NOT: LBB
+; CHECK: je
ifthen: ; preds = %entry
ret i32 0
; CHECK: dont_merge_oddly:
; CHECK-NOT: ret
-; CHECK: ucomiss %xmm1, %xmm2
+; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
; CHECK-NEXT: jbe .LBB2_3
-; CHECK-NEXT: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
; CHECK-NEXT: ja .LBB2_4
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT: ucomiss %xmm0, %xmm2
+; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
; CHECK-NEXT: jbe .LBB2_2
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: xorb %al, %al
; rdar://5752025
; We want:
-; CHECK: movl 4(%esp), %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: cmovel %ecx, %eax
+; CHECK: movl $42, %ecx
+; CHECK-NEXT: movl 4(%esp), %eax
+; CHECK-NEXT: andl $15, %eax
+; CHECK-NEXT: cmovnel %ecx, %eax
; CHECK-NEXT: ret
;
; We don't want: