From a0792de66c8364d47b0a688c7f408efb7b10f31b Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Wed, 6 Oct 2010 06:27:31 +0000
Subject: [PATCH] - Add TargetInstrInfo::getOperandLatency() to compute operand
 latencies. This   allow target to correctly compute latency for cases where
 static scheduling   itineraries isn't sufficient. e.g. variable_ops
 instructions such as   ARM::ldm.   This also allows target without scheduling
 itineraries to compute operand   latencies. e.g. X86 can return
 (approximated) latencies for high latency   instructions such as division. -
 Compute operand latencies for those defined by load multiple instructions,  
 e.g. ldm and those used by store multiple instructions, e.g. stm.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115755 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrInfo.h         |  16 ++
 lib/CodeGen/ScheduleDAGInstrs.cpp             |   9 +-
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |  20 +--
 lib/Target/ARM/ARMBaseInstrInfo.cpp           | 161 ++++++++++++++++++
 lib/Target/ARM/ARMBaseInstrInfo.h             |  15 ++
 lib/Target/ARM/ARMInstrInfo.td                |  10 +-
 lib/Target/ARM/ARMInstrThumb.td               |  14 +-
 lib/Target/ARM/ARMInstrThumb2.td              |  11 +-
 lib/Target/ARM/ARMSchedule.td                 |  10 +-
 lib/Target/ARM/ARMScheduleA8.td               |  44 ++++-
 lib/Target/ARM/ARMScheduleA9.td               |  48 +++++-
 lib/Target/ARM/ARMScheduleV6.td               |  25 ++-
 lib/Target/TargetInstrInfo.cpp                |  33 +++-
 test/CodeGen/ARM/arguments.ll                 |   4 +-
 test/CodeGen/ARM/select.ll                    |   2 +-
 15 files changed, 354 insertions(+), 68 deletions(-)

diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 0063dfcafee..a2654c97de2 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -608,6 +608,22 @@ public:
   /// instruction will be decoded to on the target cpu.
   virtual unsigned getNumMicroOps(const MachineInstr *MI,
                                   const InstrItineraryData *ItinData) const;
+
+  /// getOperandLatency - Compute and return the use operand latency of a given
+  /// itinerary class and operand index if the value is produced by an
+  /// instruction of the specified itinerary class and def operand index.
+  /// In most cases, the static scheduling itinerary was enough to determine the
+  /// operand latency. But it may not be possible for instructions with variable
+  /// number of defs / uses.
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI, unsigned UseIdx) const;
+
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const;
 };
 
 /// TargetInstrInfoImpl - This is the default implementation of
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 3d2565dd18c..e01f0ab1ce1 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -527,10 +527,10 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
   MachineInstr *DefMI = Def->getInstr();
   int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
   if (DefIdx != -1) {
-    unsigned DefClass = DefMI->getDesc().getSchedClass();
-    MachineInstr *UseMI = Use->getInstr();
-    unsigned UseClass = UseMI->getDesc().getSchedClass();
+    const TargetInstrDesc &DefTID = DefMI->getDesc();
+    unsigned DefClass = DefTID.getSchedClass();
 
+    MachineInstr *UseMI = Use->getInstr();
     // For all uses of the register, calculate the maxmimum latency
     int Latency = -1;
     for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
@@ -541,8 +541,7 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
       if (MOReg != Reg)
         continue;
 
-      int UseCycle = InstrItins->getOperandLatency(DefClass, DefIdx,
-                                                   UseClass, i);
+      int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx, UseMI, i);
       Latency = std::max(Latency, UseCycle);
 
       // If we found a latency, then replace the existing dependence latency.
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 23ff9c5807c..0ffb4da0f36 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -450,29 +450,11 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
   if (ForceUnitLatencies())
     return;
 
-  if (!InstrItins || InstrItins->isEmpty())
-    return;
-  
   if (dep.getKind() != SDep::Data)
     return;
 
   unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
-  if (!Def->isMachineOpcode())
-    return;
-
-  const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
-  if (DefIdx >= II.getNumDefs())
-    return;
-
-  int Latency = 0;
-  if (!Use->isMachineOpcode()) {
-    Latency = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx);
-  } else {
-    unsigned DefClass = II.getSchedClass();
-    unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
-    Latency = InstrItins->getOperandLatency(DefClass, DefIdx, UseClass, OpIdx);
-  }
-
+  int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
   if (Latency >= 0)
     dep.setLatency(Latency);
 }
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 0ccf7b6a7c4..6def552e174 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1642,3 +1642,164 @@ ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
   }
   }
 }
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const TargetInstrDesc &DefTID,
+                                    unsigned DefIdx, unsigned DefAlign,
+                                    const TargetInstrDesc &UseTID,
+                                    unsigned UseIdx, unsigned UseAlign) const {
+  unsigned DefClass = DefTID.getSchedClass();
+  unsigned UseClass = UseTID.getSchedClass();
+
+  if (DefIdx < DefTID.getNumDefs() && UseIdx < UseTID.getNumOperands())
+    return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+  // This may be a def / use of a variable_ops instruction, the operand
+  // latency might be determinable dynamically. Let the target try to
+  // figure it out.
+  bool LdmBypass = false;
+  int DefCycle = -1;
+  switch (DefTID.getOpcode()) {
+  default:
+    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    break;
+  case ARM::LDM_RET:
+  case ARM::LDM:
+  case ARM::LDM_UPD:
+  case ARM::tLDM:
+  case ARM::tLDM_UPD:
+  case ARM::tPUSH:
+  case ARM::t2LDM_RET:
+  case ARM::t2LDM:
+  case ARM::t2LDM_UPD: {
+    LdmBypass = 1;
+    unsigned RegNo = (DefIdx+1) - DefTID.getNumOperands() + 1;
+    if (Subtarget.isCortexA8()) {
+      // 4 registers would be issued: 1, 2, 1.
+      // 5 registers would be issued: 1, 2, 2.
+      DefCycle = RegNo / 2;
+      if (DefCycle < 1)
+        DefCycle = 1;
+      // Result latency is issue cycle + 2: E2.
+      DefCycle += 2;
+    } else if (Subtarget.isCortexA9()) {
+      DefCycle = (RegNo / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((RegNo % 2) || DefAlign < 8)
+        ++DefCycle;
+      // Result latency is AGU cycles + 2.
+      DefCycle += 2;
+    } else {
+      // Assume the worst.
+      DefCycle = RegNo + 2;
+    }
+  }
+  }
+
+  if (DefCycle == -1)
+    // We can't seem to determine the result latency of the def, assume it's 2.
+    DefCycle = 2;
+
+  int UseCycle = -1;
+  switch (UseTID.getOpcode()) {
+  default:
+    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+    break;
+  case ARM::STM:
+  case ARM::STM_UPD:
+  case ARM::tSTM_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::t2STM:
+  case ARM::t2STM_UPD: {
+    unsigned RegNo = UseIdx - UseTID.getNumOperands() + 1;
+    if (Subtarget.isCortexA8()) {
+      // 4 registers would be issued: 1, 2, 1.
+      // 5 registers would be issued: 1, 2, 2.
+      UseCycle = RegNo / 2;
+      if (UseCycle < 2)
+        UseCycle = 2;
+      // Result latency is issue cycle + 2: E2.
+      UseCycle += 2;
+    } else if (Subtarget.isCortexA9()) {
+      UseCycle = (RegNo / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((RegNo % 2) || UseAlign < 8)
+        ++UseCycle;
+      // Result latency is AGU cycles + 2.
+      UseCycle += 2;
+    } else {
+      // Assume the worst.
+      UseCycle = RegNo + 2;
+    }
+  }
+  }
+
+  if (UseCycle == -1)
+    // Assume it's read in the first stage.
+    UseCycle = 1;
+
+  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > 0) {
+    if (LdmBypass) {
+      // It's a variable_ops instruction so we can't use DefIdx here. Just use
+      // first def operand.
+      if (ItinData->hasPipelineForwarding(DefClass, DefTID.getNumOperands()-1,
+                                          UseClass, UseIdx))
+        --UseCycle;
+    } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+                                               UseClass, UseIdx))
+      --UseCycle;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const {
+  if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
+      DefMI->isRegSequence() || DefMI->isImplicitDef())
+    return 1;
+
+  const TargetInstrDesc &DefTID = DefMI->getDesc();
+  if (!ItinData || ItinData->isEmpty())
+    return DefTID.mayLoad() ? 3 : 1;
+
+  const TargetInstrDesc &UseTID = UseMI->getDesc();
+  unsigned DefAlign = DefMI->hasOneMemOperand()
+    ? (*DefMI->memoperands_begin())->getAlignment() : 0;
+  unsigned UseAlign = UseMI->hasOneMemOperand()
+    ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+  return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
+                           UseTID, UseIdx, UseAlign);
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const TargetInstrDesc &DefTID = get(DefNode->getMachineOpcode());
+  if (!ItinData || ItinData->isEmpty())
+    return DefTID.mayLoad() ? 3 : 1;
+
+  if (!UseNode->isMachineOpcode())
+    return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+
+  const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode());
+  const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
+                           UseTID, UseIdx, UseAlign);
+}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index c0b13b340b5..04fee641f1e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -340,6 +340,21 @@ public:
 
   virtual unsigned getNumMicroOps(const MachineInstr *MI,
                                   const InstrItineraryData *ItinData) const;
+
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI, unsigned UseIdx) const;
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const;
+private:
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const TargetInstrDesc &DefTID,
+                        unsigned DefIdx, unsigned DefAlign,
+                        const TargetInstrDesc &UseTID,
+                        unsigned UseIdx, unsigned UseAlign) const;
 };
 
 static inline
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6ee7e2fd3aa..755e687cb41 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -954,7 +954,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
   def LDM_RET : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                         reglist:$dsts, variable_ops),
-                       IndexModeUpd, LdStMulFrm, IIC_iLoadmBr,
+                       IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr,
                        "ldm${addr:submode}${p}\t$addr!, $dsts",
                        "$addr.addr = $wb", []>;
 
@@ -1463,12 +1463,12 @@ def STRHT: AI3sthpo<(outs GPR:$base_wb),
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p,
                           reglist:$dsts, variable_ops),
-                 IndexModeNone, LdStMulFrm, IIC_iLoadm,
+                 IndexModeNone, LdStMulFrm, IIC_iLoad_m,
                  "ldm${addr:submode}${p}\t$addr, $dsts", "", []>;
 
 def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                       reglist:$dsts, variable_ops),
-                     IndexModeUpd, LdStMulFrm, IIC_iLoadm,
+                     IndexModeUpd, LdStMulFrm, IIC_iLoad_mu,
                      "ldm${addr:submode}${p}\t$addr!, $dsts",
                      "$addr.addr = $wb", []>;
 } // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq
@@ -1476,12 +1476,12 @@ def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 def STM : AXI4st<(outs), (ins addrmode4:$addr, pred:$p,
                           reglist:$srcs, variable_ops),
-                 IndexModeNone, LdStMulFrm, IIC_iStorem,
+                 IndexModeNone, LdStMulFrm, IIC_iStore_m,
                  "stm${addr:submode}${p}\t$addr, $srcs", "", []>;
 
 def STM_UPD : AXI4st<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                       reglist:$srcs, variable_ops),
-                     IndexModeUpd, LdStMulFrm, IIC_iStorem,
+                     IndexModeUpd, LdStMulFrm, IIC_iStore_mu,
                      "stm${addr:submode}${p}\t$addr!, $srcs",
                      "$addr.addr = $wb", []>;
 } // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 7593dadb582..60c5c6017d1 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -280,7 +280,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
 def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops),
-                   IIC_iLoadmBr,
+                   IIC_iPop_Br,
                    "pop${p}\t$dsts", []>,
                T1Misc<{1,1,0,?,?,?,?}>;
 
@@ -535,13 +535,13 @@ def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStore_i,
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 def tLDM : T1I<(outs),
                (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops),
-               IIC_iLoadm,
+               IIC_iLoad_m,
                "ldm${addr:submode}${p}\t$addr, $dsts", []>,
            T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53
 
 def tLDM_UPD : T1It<(outs tGPR:$wb),
                     (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops),
-                    IIC_iLoadm,
+                    IIC_iLoad_m,
                     "ldm${addr:submode}${p}\t$addr!, $dsts",
                     "$addr.addr = $wb", []>,
                T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53
@@ -550,18 +550,20 @@ def tLDM_UPD : T1It<(outs tGPR:$wb),
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
 def tSTM_UPD : T1It<(outs tGPR:$wb),
                     (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops),
-                    IIC_iStorem,
+                    IIC_iStore_mu,
                     "stm${addr:submode}${p}\t$addr!, $srcs",
                     "$addr.addr = $wb", []>,
            T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189
 
 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
-def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_iLoadmBr,
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops),
+               IIC_iPop,
                "pop${p}\t$dsts", []>,
            T1Misc<{1,1,0,?,?,?,?}>;
 
 let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
-def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), IIC_iStorem,
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops),
+                IIC_iStore_m,
                 "push${p}\t$srcs", []>,
             T1Misc<{0,1,0,?,?,?,?}>;
 
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index ee000fd9582..87e557bf6cc 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1243,7 +1243,7 @@ defm t2PLI  : T2Ipl<1, 0, "pli">;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$dsts, variable_ops), IIC_iLoadm,
+                          reglist:$dsts, variable_ops), IIC_iLoad_m,
                  "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b00;
@@ -1254,7 +1254,8 @@ def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
 }
 
 def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
-                                       reglist:$dsts, variable_ops), IIC_iLoadm,
+                                       reglist:$dsts, variable_ops),
+                      IIC_iLoad_mu,
                       "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts",
                       "$addr.addr = $wb", []> {
   let Inst{31-27} = 0b11101;
@@ -1268,7 +1269,7 @@ def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
 
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
-                          reglist:$srcs, variable_ops), IIC_iStorem,
+                          reglist:$srcs, variable_ops), IIC_iStore_m,
                  "stm${addr:submode}${p}${addr:wide}\t$addr, $srcs", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b00;
@@ -1280,7 +1281,7 @@ def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p,
 
 def t2STM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                        reglist:$srcs, variable_ops),
-                      IIC_iStorem,
+                      IIC_iStore_m,
                       "stm${addr:submode}${p}${addr:wide}\t$addr!, $srcs",
                       "$addr.addr = $wb", []> {
   let Inst{31-27} = 0b11101;
@@ -2473,7 +2474,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
   def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p,
                                          reglist:$dsts, variable_ops),
-                        IIC_iLoadmBr,
+                        IIC_iLoad_mBr,
                         "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts",
                         "$addr.addr = $wb", []> {
   let Inst{31-27} = 0b11101;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index ec7d29aac03..b7ce8322ba2 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -67,8 +67,11 @@ def IIC_iLoad_bh_siu : InstrItinClass;
 def IIC_iLoad_d_i  : InstrItinClass;
 def IIC_iLoad_d_r  : InstrItinClass;
 def IIC_iLoad_d_ru : InstrItinClass;
-def IIC_iLoadm     : InstrItinClass<0>;  // micro-coded
-def IIC_iLoadmBr   : InstrItinClass<0>;  // micro-coded
+def IIC_iLoad_m    : InstrItinClass<0>;  // micro-coded
+def IIC_iLoad_mu   : InstrItinClass<0>;  // micro-coded
+def IIC_iLoad_mBr  : InstrItinClass<0>;  // micro-coded
+def IIC_iPop       : InstrItinClass<0>;  // micro-coded
+def IIC_iPop_Br    : InstrItinClass<0>;  // micro-coded
 def IIC_iLoadiALU  : InstrItinClass;
 def IIC_iStore_i   : InstrItinClass;
 def IIC_iStore_r   : InstrItinClass;
@@ -85,7 +88,8 @@ def IIC_iStore_bh_siu : InstrItinClass;
 def IIC_iStore_d_i   : InstrItinClass;
 def IIC_iStore_d_r   : InstrItinClass;
 def IIC_iStore_d_ru  : InstrItinClass;
-def IIC_iStorem    : InstrItinClass<0>;  // micro-coded
+def IIC_iStore_m   : InstrItinClass<0>;  // micro-coded
+def IIC_iStore_mu  : InstrItinClass<0>;  // micro-coded
 def IIC_Br         : InstrItinClass;
 def IIC_fpSTAT     : InstrItinClass;
 def IIC_fpUNA32    : InstrItinClass;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 915283bd259..714bf2e6353 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -172,21 +172,44 @@ def CortexA8Itineraries : ProcessorItineraries<
                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                  InstrStage<1, [A8_LdSt0]>], [4, 3, 1, 1]>,
   //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<2, [A8_Issue], 0>,
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Issue], 0>,
                                 InstrStage<2, [A8_Pipe0], 0>,
                                 InstrStage<2, [A8_Pipe1]>,
                                 InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
-                                InstrStage<1, [A8_LdSt0]>]>,
-
+                                InstrStage<1, [A8_LdSt0]>], [1, 1, 1, 1, 3]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<2, [A8_Issue], 0>,
+                                InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_Pipe1]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0]>], [2, 1, 1, 1, 3]>,
   //
   // Load multiple plus branch
-  InstrItinData<IIC_iLoadmBr , [InstrStage<2, [A8_Issue], 0>,
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<2, [A8_Issue], 0>,
                                 InstrStage<2, [A8_Pipe0], 0>,
                                 InstrStage<2, [A8_Pipe1]>,
                                 InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                 InstrStage<1, [A8_LdSt0]>,
-                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 2, 1, 1, 3]>,
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<2, [A8_Issue], 0>,
+                                InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_Pipe1]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0]>], [1, 1, 3]>,
+  //
+  // Push, def is the 3th operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<2, [A8_Issue], 0>,
+                                InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_Pipe1]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3]>,
 
   //
   // iLoadi + iALUr for t2LDRpci_pic.
@@ -266,11 +289,18 @@ def CortexA8Itineraries : ProcessorItineraries<
                                  InstrStage<1, [A8_LdSt0]>], [3, 3, 1, 1]>,
   //
   // Store multiple
-  InstrItinData<IIC_iStorem  , [InstrStage<2, [A8_Issue], 0>,
+  InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Issue], 0>,
                                 InstrStage<2, [A8_Pipe0], 0>,
                                 InstrStage<2, [A8_Pipe1]>,
                                 InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
                                 InstrStage<1, [A8_LdSt0]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Issue], 0>,
+                                InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_Pipe1]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                InstrStage<1, [A8_LdSt0]>], [2]>,
 
   // Branch
   //
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index f96b50448a1..6dd27157030 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -242,18 +242,42 @@ def CortexA9Itineraries : ProcessorItineraries<
                                   InstrStage<2, [A9_AGU]>],
                                  [5, 4, 1, 1], [A9_LdBypass]>,
   //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
                                 InstrStage<2, [A9_AGU]>],
-                               [3], [A9_LdBypass]>,
-
+                               [1, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU]>],
+                               [2, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
   //
   // Load multiple plus branch
-  InstrItinData<IIC_iLoadmBr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
                                 InstrStage<1, [A9_AGU]>,
-                                InstrStage<1, [A9_Branch]>]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 2, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass]>,
 
   //
   // iLoadi + iALUr for t2LDRpci_pic.
@@ -329,13 +353,21 @@ def CortexA9Itineraries : ProcessorItineraries<
                                    [3, 1, 1, 1]>,
   //
   // Store multiple
-  InstrItinData<IIC_iStorem  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
                                 InstrStage<1, [A9_AGU]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU]>], [2]>,
+
   // Branch
   //
   // no delay slots, so the latency of a branch is unimportant
-  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Branch]>]>,
+  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
+                                InstrStage<1, [A9_Issue1], 0>,
+                                InstrStage<1, [A9_Branch]>]>,
 
   // VFP and NEON shares the same register file. This means that every VFP
   // instruction should wait for full completion of the consecutive NEON
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index b5ae9277fcf..f1dbdbc5a71 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -116,19 +116,29 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
 
   //
-  // Load multiple
-  InstrItinData<IIC_iLoadm   , [InstrStage<3, [V6_Pipe]>]>,
-
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
   //
   // Load multiple plus branch
-  InstrItinData<IIC_iLoadmBr , [InstrStage<3, [V6_Pipe]>,
-                                InstrStage<1, [V6_Pipe]>]>,
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
 
   //
   // iLoadi + iALUr for t2LDRpci_pic.
   InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
                                 InstrStage<1, [V6_Pipe]>], [3, 1]>,
 
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop     , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
+
   // Integer store pipeline
   //
   // Immediate offset
@@ -159,7 +169,10 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
   //
   // Store multiple
-  InstrItinData<IIC_iStorem   , [InstrStage<3, [V6_Pipe]>]>,
+  InstrItinData<IIC_iStore_m  , [InstrStage<3, [V6_Pipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
   
   // Branch
   //
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
index e3ebae94732..7f103226daf 100644
--- a/lib/Target/TargetInstrInfo.cpp
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -12,9 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Target/TargetInstrItineraries.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
@@ -64,6 +65,36 @@ TargetInstrInfo::getNumMicroOps(const MachineInstr *MI,
   return 1;
 }
 
+int
+TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  unsigned DefClass = DefMI->getDesc().getSchedClass();
+  unsigned UseClass = UseMI->getDesc().getSchedClass();
+  return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+int
+TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                   SDNode *DefNode, unsigned DefIdx,
+                                   SDNode *UseNode, unsigned UseIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  if (!DefNode->isMachineOpcode())
+    return -1;
+
+  unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass();
+  if (!UseNode->isMachineOpcode())
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+  unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass();
+  return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+
 /// insertNoop - Insert a noop into the instruction stream at the specified
 /// point.
 void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, 
diff --git a/test/CodeGen/ARM/arguments.ll b/test/CodeGen/ARM/arguments.ll
index bb7853e66ef..c7fcb9755d9 100644
--- a/test/CodeGen/ARM/arguments.ll
+++ b/test/CodeGen/ARM/arguments.ll
@@ -13,8 +13,8 @@ define i32 @f1(i32 %a, i64 %b) {
 ; test that allocating the double to r2/r3 makes r1 unavailable on gnueabi.
 define i32 @f2() nounwind optsize {
 ; ELF: f2:
-; ELF: mov  r0, #128
-; ELF: str  r0, [sp]
+; ELF: mov  [[REGISTER:(r[0-9]+)]], #128
+; ELF: str  [[REGISTER]], [sp]
 ; DARWIN: f2:
 ; DARWIN: mov	r3, #128
 entry:
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 7413bed5c5b..06b42f4f448 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -79,9 +79,9 @@ define double @f7(double %a, double %b) {
 ; CHECK-NEON:      movw   [[REGISTER_1:r[0-9]+]], #1123
 ; CHECK-NEON-NEXT: movs   [[REGISTER_2:r[0-9]+]], #0
 ; CHECK-NEON-NEXT: cmp    r0, [[REGISTER_1]]
-; CHECK-NEON-NEXT: adr    [[REGISTER_3:r[0-9]+]], #LCPI
 ; CHECK-NEON-NEXT: it     eq
 ; CHECK-NEON-NEXT: moveq  [[REGISTER_2]], #4
+; CHECK-NEON-NEXT: adr    [[REGISTER_3:r[0-9]+]], #LCPI
 ; CHECK-NEON-NEXT: ldr
 ; CHECK-NEON:      bx
 
-- 
2.34.1