Model operand cycles of vldm / vstm; also fixes scheduling itineraries of vldr /...
authorEvan Cheng <evan.cheng@apple.com>
Thu, 7 Oct 2010 01:50:48 +0000 (01:50 +0000)
committerEvan Cheng <evan.cheng@apple.com>
Thu, 7 Oct 2010 01:50:48 +0000 (01:50 +0000)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115898 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/ARM/ARMBaseInstrInfo.cpp
lib/Target/ARM/ARMInstrNEON.td
lib/Target/ARM/ARMInstrVFP.td
lib/Target/ARM/ARMSchedule.td
lib/Target/ARM/ARMScheduleA8.td
lib/Target/ARM/ARMScheduleA9.td
lib/Target/ARM/ARMScheduleV6.td

index fcb422fcde6a607f9387dcdc0714b1cbaaebf45d..823126be30c78c7038119e234eade3a8ef15fa11 100644 (file)
@@ -1667,6 +1667,41 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   default:
     DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
     break;
+  case ARM::VLDMD:
+  case ARM::VLDMS:
+  case ARM::VLDMD_UPD:
+  case ARM::VLDMS_UPD:  {
+    int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+    if (RegNo <= 0) {
+      // Def is the address writeback.
+      DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+      break;
+    }
+    if (Subtarget.isCortexA8()) {
+      // (regno / 2) + (regno % 2) + 1
+      DefCycle = RegNo / 2 + 1;
+      if (RegNo % 2)
+        ++DefCycle;
+    } else if (Subtarget.isCortexA9()) {
+      DefCycle = RegNo;
+      bool isSLoad = false;
+      switch (DefTID.getOpcode()) {
+      default: break;
+      case ARM::VLDMS:
+      case ARM::VLDMS_UPD:
+        isSLoad = true;
+        break;
+      }
+      // If there is an odd number of 'S' registers, or if the list is not
+      // 64-bit aligned, then it takes an extra cycle.
+      if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+        ++DefCycle;
+    } else {
+      // Assume the worst.
+      DefCycle = RegNo + 2;
+    }
+    break;
+  }
   case ARM::LDM_RET:
   case ARM::LDM:
   case ARM::LDM_UPD:
@@ -1677,7 +1712,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   case ARM::t2LDM:
   case ARM::t2LDM_UPD: {
     LdmBypass = 1;
-    unsigned RegNo = (DefIdx+1) - DefTID.getNumOperands() + 1;
+    int RegNo = (int)(DefIdx+1) - DefTID.getNumOperands() + 1;
+    if (RegNo <= 0) {
+      // Def is the address writeback.
+      DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+      break;
+    }
     if (Subtarget.isCortexA8()) {
       // 4 registers would be issued: 1, 2, 1.
       // 5 registers would be issued: 1, 2, 2.
@@ -1710,6 +1750,40 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   default:
     UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
     break;
+  case ARM::VSTMD:
+  case ARM::VSTMS:
+  case ARM::VSTMD_UPD:
+  case ARM::VSTMS_UPD: {
+    int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+    if (RegNo <= 0) {
+      UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+      break;
+    }
+    if (Subtarget.isCortexA8()) {
+      // (regno / 2) + (regno % 2) + 1
+      UseCycle = RegNo / 2 + 1;
+      if (RegNo % 2)
+        ++UseCycle;
+    } else if (Subtarget.isCortexA9()) {
+      UseCycle = RegNo;
+      bool isSStore = false;
+      switch (UseTID.getOpcode()) {
+      default: break;
+      case ARM::VSTMS:
+      case ARM::VSTMS_UPD:
+        isSStore = true;
+        break;
+      }
+      // If there is an odd number of 'S' registers, or if the list is not
+      // 64-bit aligned, then it takes an extra cycle.
+      if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+        ++UseCycle;
+    } else {
+      // Assume the worst.
+      UseCycle = RegNo + 2;
+    }
+    break;
+  }
   case ARM::STM:
   case ARM::STM_UPD:
   case ARM::tSTM_UPD:
@@ -1717,14 +1791,16 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   case ARM::tPOP:
   case ARM::t2STM:
   case ARM::t2STM_UPD: {
-    unsigned RegNo = UseIdx - UseTID.getNumOperands() + 1;
+    int RegNo = (int)(UseIdx+1) - UseTID.getNumOperands() + 1;
+    if (RegNo <= 0) {
+      UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+      break;
+    }
     if (Subtarget.isCortexA8()) {
-      // 4 registers would be issued: 1, 2, 1.
-      // 5 registers would be issued: 1, 2, 2.
       UseCycle = RegNo / 2;
       if (UseCycle < 2)
         UseCycle = 2;
-      // Result latency is issue cycle + 2: E2.
+      // Read in E3.
       UseCycle += 2;
     } else if (Subtarget.isCortexA9()) {
       UseCycle = (RegNo / 2);
@@ -1732,12 +1808,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
       // then it takes an extra AGU (Address Generation Unit) cycle.
       if ((RegNo % 2) || UseAlign < 8)
         ++UseCycle;
-      // Result latency is AGU cycles + 2.
-      UseCycle += 2;
     } else {
       // Assume the worst.
-      UseCycle = RegNo + 2;
+      UseCycle = 1;
     }
+    break;
   }
   }
 
index e3ff884ee3cd93aff24ce14145dc6f8e41f20878..6f3f6a0f0f8552795269d719e9fbe3e82d58e902 100644 (file)
@@ -132,13 +132,13 @@ def nModImm : Operand<i32> {
 // Use VLDM to load a Q register as a D register pair.
 // This is a pseudo instruction that is expanded to VLDMD after reg alloc.
 def VLDMQ
-  : PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, "",
+  : PseudoVFPLdStM<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoad_m, "",
                    [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]>;
 
 // Use VSTM to store a Q register as a D register pair.
 // This is a pseudo instruction that is expanded to VSTMD after reg alloc.
 def VSTMQ
-  : PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, "",
+  : PseudoVFPLdStM<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStore_m, "",
                    [(store (v2f64 QPR:$src), addrmode4:$addr)]>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
index 53d181b13b13e55723d0449704bd6909a9c48db2..24344008f3029a40a4fe129cef9618e2e37e7d82 100644 (file)
@@ -78,20 +78,20 @@ def VSTRS  : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 def VLDMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
+                           variable_ops), IndexModeNone, IIC_fpLoad_m,
                   "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
   let Inst{20} = 1;
 }
 
 def VLDMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts,
-                           variable_ops), IndexModeNone, IIC_fpLoadm,
+                           variable_ops), IndexModeNone, IIC_fpLoad_m,
                   "vldm${addr:submode}${p}\t$addr, $dsts", "", []> {
   let Inst{20} = 1;
 }
 
 def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm,
+                      IndexModeUpd, IIC_fpLoad_mu,
                       "vldm${addr:submode}${p}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 1;
@@ -99,7 +99,7 @@ def VLDMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$dsts, variable_ops),
-                      IndexModeUpd, IIC_fpLoadm
+                      IndexModeUpd, IIC_fpLoad_mu,
                       "vldm${addr:submode}${p}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 1;
@@ -108,20 +108,20 @@ def VLDMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 def VSTMD : AXDI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
+                           variable_ops), IndexModeNone, IIC_fpStore_m,
                   "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
   let Inst{20} = 0;
 }
 
 def VSTMS : AXSI4<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs,
-                           variable_ops), IndexModeNone, IIC_fpStorem,
+                           variable_ops), IndexModeNone, IIC_fpStore_m,
                   "vstm${addr:submode}${p}\t$addr, $srcs", "", []> {
   let Inst{20} = 0;
 }
 
 def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
+                      IndexModeUpd, IIC_fpStore_mu,
                       "vstm${addr:submode}${p}\t$addr!, $srcs",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 0;
@@ -129,7 +129,7 @@ def VSTMD_UPD : AXDI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 def VSTMS_UPD : AXSI4<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$srcs, variable_ops),
-                      IndexModeUpd, IIC_fpStorem,
+                      IndexModeUpd, IIC_fpStore_mu,
                       "vstm${addr:submode}${p}\t$addr!, $srcs",
                       "$addr.addr = $wb", []> {
   let Inst{20} = 0;
index b7ce8322ba223958329dd7e241e5138ee730c97b..d4abc3534de4ed49973b7e7cf974fa242ac84200 100644 (file)
@@ -120,10 +120,12 @@ def IIC_fpSQRT32   : InstrItinClass;
 def IIC_fpSQRT64   : InstrItinClass;
 def IIC_fpLoad32   : InstrItinClass;
 def IIC_fpLoad64   : InstrItinClass;
-def IIC_fpLoadm    : InstrItinClass<0>;  // micro-coded
+def IIC_fpLoad_m   : InstrItinClass<0>;  // micro-coded
+def IIC_fpLoad_mu  : InstrItinClass<0>;  // micro-coded
 def IIC_fpStore32  : InstrItinClass;
 def IIC_fpStore64  : InstrItinClass;
-def IIC_fpStorem   : InstrItinClass<0>;  // micro-coded
+def IIC_fpStore_m  : InstrItinClass<0>;  // micro-coded
+def IIC_fpStore_mu : InstrItinClass<0>;  // micro-coded
 def IIC_VLD1       : InstrItinClass;
 def IIC_VLD2       : InstrItinClass;
 def IIC_VLD3       : InstrItinClass;
index 714bf2e635316bac373200861723c33f9ae22e54..ac4da75e960760dd3c3892fcf0f7f34eae876b0f 100644 (file)
@@ -414,54 +414,58 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [2, 1]>,
   //
   // Double-precision FP Load
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpLoad64, [InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [2, 1]>,
   //
   // FP Load Multiple
   // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoadm,  [InstrStage<3, [A8_Issue], 0>,
-                               InstrStage<2, [A8_Pipe0], 0>,
-                               InstrStage<2, [A8_Pipe1]>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<3, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+                               InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 2]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<3, [A8_Issue], 0>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LdSt0], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 2]>,
   //
   // Single-precision FP Store
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [1, 1]>,
   //
   // Double-precision FP Store
   // use A8_Issue to enforce the 1 load/store per cycle limit
   InstrItinData<IIC_fpStore64,[InstrStage<2, [A8_Issue], 0>,
-                               InstrStage<1, [A8_Pipe0], 0>,
-                               InstrStage<1, [A8_Pipe1]>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>],
+                               InstrStage<2, [A8_NLSPipe]>],
                               [1, 1]>,
   //
   // FP Store Multiple
   // use A8_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStorem, [InstrStage<3, [A8_Issue], 0>,
-                               InstrStage<2, [A8_Pipe0], 0>,
-                               InstrStage<2, [A8_Pipe1]>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<3, [A8_Issue], 0>,
                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                InstrStage<1, [A8_LdSt0], 0>,
-                               InstrStage<1, [A8_NLSPipe]>]>,
+                               InstrStage<1, [A8_NLSPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [A8_Issue], 0>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0], 0>,
+                                InstrStage<1, [A8_NLSPipe]>], [2, 1, 1, 1, 1]>,
 
   // NEON
   // Issue through integer pipeline, and execute in NEON unit.
index 6dd271570309619d7ead94e3260d569c7c736c7e..27745e6530022fc392aa97244c18bbc6266d1e64 100644 (file)
@@ -629,11 +629,18 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [2, 1]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm,  [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
@@ -652,11 +659,18 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [1, 1]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem, [InstrStage<1, [A9_DRegsVFP], 0, Required>,
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_NPipe]>]>,
+                               InstrStage<1, [A9_NPipe]>], [1, 1, 1, 1]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_NPipe]>], [2, 1, 1, 1]>,
   // NEON
   // Issue through integer pipeline, and execute in NEON unit.
   // VLD1
index f1dbdbc5a71b110460ea0a26e3a2fb2333ccf86c..b845130e3701fbe4df52fe33fac90c47bf282208 100644 (file)
@@ -254,7 +254,10 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
   //
   // FP Load Multiple
-  InstrItinData<IIC_fpLoadm , [InstrStage<3, [V6_Pipe]>]>,
+  InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
@@ -264,5 +267,8 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
   //
   // FP Store Multiple
-  InstrItinData<IIC_fpStorem , [InstrStage<3, [V6_Pipe]>]>
+  InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
 ]>;