Reduce code size by using a second switch statement to avoid extra calls to SelectAto...

[oota-llvm.git] / lib / Target / X86 / X86ISelDAGToDAG.cpp
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index b155f742d649f6a6aebf18c4c82658043198e468..817013011cd9cc6c46c048991b3bf4e189741770 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -21,7 +21,6 @@
  #include "X86TargetMachine.h"
  #include "llvm/Instructions.h"
  #include "llvm/Intrinsics.h"
-#include "llvm/Support/CFG.h"
  #include "llvm/Type.h"
  #include "llvm/CodeGen/FunctionLoweringInfo.h"
  #include "llvm/CodeGen/MachineConstantPool.h"
@@ -32,11 +31,11 @@
  #include "llvm/CodeGen/SelectionDAGISel.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CFG.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/Statistic.h"
  using namespace llvm;
  
@@ -188,6 +187,7 @@ namespace {
  
    private:
      SDNode *Select(SDNode *N);
+    SDNode *SelectGather(SDNode *N, unsigned Opc);
      SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
      SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
      SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
@@ -540,7 +540,7 @@ void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
    const TargetInstrInfo *TII = TM.getInstrInfo();
    if (Subtarget->isTargetCygMing()) {
      unsigned CallOp =
-      Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32;
+      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
      BuildMI(BB, DebugLoc(),
              TII->get(CallOp)).addExternalSymbol("__main");
    }
@@ -621,14 +621,14 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
  
    // Handle X86-64 rip-relative addresses.  We check this before checking direct
    // folding because RIP is preferable to non-RIP accesses.
-  if (Subtarget->is64Bit() &&
+  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
        // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
        // they cannot be folded into immediate fields.
        // FIXME: This can be improved for kernel and other models?
-      (M == CodeModel::Small || M == CodeModel::Kernel) &&
-      // Base and index reg must be 0 in order to use %rip as base and lowering
-      // must allow RIP.
-      !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    // Base and index reg must be 0 in order to use %rip as base.
+    if (AM.hasBaseOrIndexReg())
+      return true;
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
        X86ISelAddressMode Backup = AM;
        AM.GV = G->getGlobal();
@@ -663,11 +663,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
    }
  
    // Handle the case when globals fit in our immediate field: This is true for
-  // X86-32 always and X86-64 when in -static -mcmodel=small mode.  In 64-bit
-  // mode, this results in a non-RIP-relative computation.
+  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
+  // mode, this only applies to a non-RIP-relative computation.
    if (!Subtarget->is64Bit() ||
-      ((M == CodeModel::Small || M == CodeModel::Kernel) &&
-       TM.getRelocationModel() == Reloc::Static)) {
+      M == CodeModel::Small || M == CodeModel::Kernel) {
+    assert(N.getOpcode() != X86ISD::WrapperRIP &&
+           "RIP-relative addressing already handled");
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
        AM.GV = G->getGlobal();
        AM.Disp += G->getOffset();
@@ -764,12 +765,16 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
    SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
  
-  // Insert the new nodes into the topological ordering.
-  InsertDAGNode(DAG, X, Eight);
-  InsertDAGNode(DAG, X, NewMask);
-  InsertDAGNode(DAG, Shift, Srl);
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, Eight);
+  InsertDAGNode(DAG, N, Srl);
+  InsertDAGNode(DAG, N, NewMask);
    InsertDAGNode(DAG, N, And);
-  InsertDAGNode(DAG, X, ShlCount);
+  InsertDAGNode(DAG, N, ShlCount);
    InsertDAGNode(DAG, N, Shl);
    DAG.ReplaceAllUsesWith(N, Shl);
    AM.IndexReg = And;
@@ -805,9 +810,13 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
    SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
  
-  // Insert the new nodes into the topological ordering.
-  InsertDAGNode(DAG, X, NewMask);
-  InsertDAGNode(DAG, Shift, NewAnd);
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, NewAnd);
    InsertDAGNode(DAG, N, NewShift);
    DAG.ReplaceAllUsesWith(N, NewShift);
  
@@ -847,10 +856,6 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
-  // FIXME!! Hack to disable this and see if it is responsible for a miscompile
-  // on llvm-gcc's selfhost.
-  return true;
-
    if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
        !isa<ConstantSDNode>(Shift.getOperand(1)))
      return true;
@@ -893,7 +898,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
    APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
                                                 MaskLZ);
    APInt KnownZero, KnownOne;
-  DAG.ComputeMaskedBits(X, MaskedHighBits, KnownZero, KnownOne);
+  DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
    if (MaskedHighBits != KnownZero) return true;
  
    // We've identified a pattern that can be transformed into a single shift
@@ -911,6 +916,12 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
    SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
    SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
    SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
    InsertDAGNode(DAG, N, NewSRLAmt);
    InsertDAGNode(DAG, N, NewSRL);
    InsertDAGNode(DAG, N, NewSHLAmt);
@@ -1644,7 +1655,7 @@ enum AtomicSz {
    AtomicSzEnd
  };
  
-static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
    {
      X86::LOCK_OR8mi,
      X86::LOCK_OR8mr,
@@ -1838,6 +1849,134 @@ static bool HasNoSignedComparisonUses(SDNode *N) {
    return true;
  }
  
+/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
+/// is suitable for doing the {load; increment or decrement; store} to modify
+/// transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, 
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the load?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // are there other uses of the loaded value than the inc or dec?
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();    
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && 
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+  
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+                                   MVT::Other, &ChainOps[0], ChainOps.size());
+  }
+  if (!ChainCheck)
+    return false;
+
+  return true;
+}
+
+/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
+/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  if (Opc == X86ISD::DEC) {
+    if (LdVT == MVT::i64) return X86::DEC64m;
+    if (LdVT == MVT::i32) return X86::DEC32m;
+    if (LdVT == MVT::i16) return X86::DEC16m;
+    if (LdVT == MVT::i8)  return X86::DEC8m;
+  } else {
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+    if (LdVT == MVT::i64) return X86::INC64m;
+    if (LdVT == MVT::i32) return X86::INC32m;
+    if (LdVT == MVT::i16) return X86::INC16m;
+    if (LdVT == MVT::i8)  return X86::INC8m;
+  }
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!Scale)
+    return 0;
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+                          Disp, Segment, VMask, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           VSrc.getValueType(), MVT::Other,
+                                           Ops, array_lengthof(Ops));
+  return ResNode;
+}
+
  SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
    EVT NVT = Node->getValueType(0);
    unsigned Opc, MOpc;
@@ -1853,23 +1992,81 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
  
    switch (Opcode) {
    default: break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_avx2_gather_d_pd:
+    case Intrinsic::x86_avx2_gather_d_pd_256:
+    case Intrinsic::x86_avx2_gather_q_pd:
+    case Intrinsic::x86_avx2_gather_q_pd_256:
+    case Intrinsic::x86_avx2_gather_d_ps:
+    case Intrinsic::x86_avx2_gather_d_ps_256:
+    case Intrinsic::x86_avx2_gather_q_ps:
+    case Intrinsic::x86_avx2_gather_q_ps_256:
+    case Intrinsic::x86_avx2_gather_d_q:
+    case Intrinsic::x86_avx2_gather_d_q_256:
+    case Intrinsic::x86_avx2_gather_q_q:
+    case Intrinsic::x86_avx2_gather_q_q_256:
+    case Intrinsic::x86_avx2_gather_d_d:
+    case Intrinsic::x86_avx2_gather_d_d_256:
+    case Intrinsic::x86_avx2_gather_q_d:
+    case Intrinsic::x86_avx2_gather_q_d_256: {
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Impossible intrinsic");
+      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
+      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
+      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
+      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
+      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
+      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
+      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
+      }
+      SDNode *RetVal = SelectGather(Node, Opc);
+      if (RetVal)
+        return RetVal;
+      break;
+    }
+    }
+    break;
+  }
    case X86ISD::GlobalBaseReg:
      return getGlobalBaseReg();
  
+
    case X86ISD::ATOMOR64_DAG:
-    return SelectAtomic64(Node, X86::ATOMOR6432);
    case X86ISD::ATOMXOR64_DAG:
-    return SelectAtomic64(Node, X86::ATOMXOR6432);
    case X86ISD::ATOMADD64_DAG:
-    return SelectAtomic64(Node, X86::ATOMADD6432);
    case X86ISD::ATOMSUB64_DAG:
-    return SelectAtomic64(Node, X86::ATOMSUB6432);
    case X86ISD::ATOMNAND64_DAG:
-    return SelectAtomic64(Node, X86::ATOMNAND6432);
    case X86ISD::ATOMAND64_DAG:
-    return SelectAtomic64(Node, X86::ATOMAND6432);
-  case X86ISD::ATOMSWAP64_DAG:
-    return SelectAtomic64(Node, X86::ATOMSWAP6432);
+  case X86ISD::ATOMSWAP64_DAG: {
+    unsigned Opc;
+    switch (Opcode) {
+    default: llvm_unreachable("Impossible intrinsic");
+    case X86ISD::ATOMOR64_DAG:   Opc = X86::ATOMOR6432;   break;
+    case X86ISD::ATOMXOR64_DAG:  Opc = X86::ATOMXOR6432;  break;
+    case X86ISD::ATOMADD64_DAG:  Opc = X86::ATOMADD6432;  break;
+    case X86ISD::ATOMSUB64_DAG:  Opc = X86::ATOMSUB6432;  break;
+    case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+    case X86ISD::ATOMAND64_DAG:  Opc = X86::ATOMAND6432;  break;
+    case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+    }
+    SDNode *RetVal = SelectAtomic64(Node, Opc);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
  
    case ISD::ATOMIC_LOAD_ADD: {
      SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
@@ -1958,7 +2155,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
      return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
                                  getI8Imm(ShlVal));
-    break;
    }
    case X86ISD::UMUL: {
      SDValue N0 = Node->getOperand(0);
@@ -2029,7 +2225,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      }
  
      SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
-                                            N0, SDValue()).getValue(1);
+                                          N0, SDValue()).getValue(1);
  
      if (foldedLoad) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
@@ -2069,7 +2265,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      // Copy the low half of the result, if it is needed.
      if (!SDValue(Node, 0).use_empty()) {
        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                                LoReg, NVT, InFlag);
+                                              LoReg, NVT, InFlag);
        InFlag = Result.getValue(2);
        ReplaceUses(SDValue(Node, 0), Result);
        DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
@@ -2260,7 +2456,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
  
          // On x86-32, only the ABCD registers have 8-bit subregisters.
          if (!Subtarget->is64Bit()) {
-          TargetRegisterClass *TRC = 0;
+          const TargetRegisterClass *TRC;
            switch (N0.getValueType().getSimpleVT().SimpleTy) {
            case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
            case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
@@ -2289,7 +2485,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
          SDValue Reg = N0.getNode()->getOperand(0);
  
          // Put the value in an ABCD register.
-        TargetRegisterClass *TRC = 0;
+        const TargetRegisterClass *TRC;
          switch (N0.getValueType().getSimpleVT().SimpleTy) {
          case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
@@ -2346,9 +2542,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      break;
    }
    case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
      // The DEC64m tablegen pattern is currently not able to match the case where
-    // the EFLAGS on the original DEC are used.
-    // we'll need to improve tablegen to allow flags to be transferred from a
+    // the EFLAGS on the original DEC are used. (This also applies to 
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
      // node in the pattern to the result node.  probably with a new keyword
      // for example, we have this
      // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
@@ -2358,42 +2558,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
      //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
      //   (transferrable EFLAGS)]>;
+
      StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
-    SDValue Chain = StoreNode->getOperand(0);
      SDValue StoredVal = StoreNode->getOperand(1);
-    SDValue Address = StoreNode->getOperand(2);
-    SDValue Undef = StoreNode->getOperand(3);
-
-    if (StoreNode->getMemOperand()->getSize() != 8 ||
-        Undef->getOpcode() != ISD::UNDEF ||
-        Chain->getOpcode() != ISD::LOAD ||
-        StoredVal->getOpcode() != X86ISD::DEC ||
-        StoredVal.getResNo() != 0 ||
-        StoredVal->getOperand(0).getNode() != Chain.getNode())
-      break;
-
-    //OPC_CheckPredicate, 1, // Predicate_nontemporalstore
-    if (StoreNode->isNonTemporal())
-      break;
-
-    LoadSDNode *LoadNode = cast<LoadSDNode>(Chain.getNode());
-    if (LoadNode->getOperand(1) != Address ||
-        LoadNode->getOperand(2) != Undef)
-      break;
-
-    if (!ISD::isNormalLoad(LoadNode))
-      break;
+    unsigned Opc = StoredVal->getOpcode();
  
-    if (!ISD::isNormalStore(StoreNode))
+    LoadSDNode *LoadNode = 0;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
        break;
  
-    // check load chain has only one use (from the store)
-    if (!Chain.hasOneUse())
-      break;
-
-    // Merge the input chains if they are not intra-pattern references.
-    SDValue InputChain = LoadNode->getOperand(0);
-
      SDValue Base, Scale, Index, Disp, Segment;
      if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
                      Base, Scale, Index, Disp, Segment))
@@ -2403,7 +2578,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      MemOp[0] = StoreNode->getMemOperand();
      MemOp[1] = LoadNode->getMemOperand();
      const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
-    MachineSDNode *Result = CurDAG->getMachineNode(X86::DEC64m,
+    EVT LdVT = LoadNode->getMemoryVT();    
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
                                                     Node->getDebugLoc(),
                                                     MVT::i32, MVT::Other, Ops,
                                                     array_lengthof(Ops));
@@ -2454,6 +2631,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
  /// X86-specific DAG, ready for instruction scheduling.
  ///
  FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
-                                     llvm::CodeGenOpt::Level OptLevel) {
+                                     CodeGenOpt::Level OptLevel) {
    return new X86DAGToDAGISel(TM, OptLevel);
  }