ELF does not imply GNU/Linux. Do not assume GNU conventions just because we are targeting ELF.

[oota-llvm.git] / lib / Target / X86 / X86ISelDAGToDAG.cpp
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 64cea478435fff966147fe34f2e1cd9278c69e0b..7161854de0a6b9e203361dd2fc856f0900c444db 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -21,7 +21,6 @@
  #include "X86TargetMachine.h"
  #include "llvm/Instructions.h"
  #include "llvm/Intrinsics.h"
-#include "llvm/Support/CFG.h"
  #include "llvm/Type.h"
  #include "llvm/CodeGen/FunctionLoweringInfo.h"
  #include "llvm/CodeGen/MachineConstantPool.h"
@@ -32,11 +31,11 @@
  #include "llvm/CodeGen/SelectionDAGISel.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CFG.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/Statistic.h"
  using namespace llvm;
  
@@ -188,6 +187,7 @@ namespace {
  
    private:
      SDNode *Select(SDNode *N);
+    SDNode *SelectGather(SDNode *N, unsigned Opc);
      SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
      SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
      SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
@@ -540,7 +540,7 @@ void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
    const TargetInstrInfo *TII = TM.getInstrInfo();
    if (Subtarget->isTargetCygMing()) {
      unsigned CallOp =
-      Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32;
+      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
      BuildMI(BB, DebugLoc(),
              TII->get(CallOp)).addExternalSymbol("__main");
    }
@@ -593,7 +593,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
    // For more information see http://people.redhat.com/drepper/tls.pdf
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
      if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
-        Subtarget->isTargetELF())
+        Subtarget->isTargetLinux())
        switch (N->getPointerInfo().getAddrSpace()) {
        case 256:
          AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -621,14 +621,14 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
  
    // Handle X86-64 rip-relative addresses.  We check this before checking direct
    // folding because RIP is preferable to non-RIP accesses.
-  if (Subtarget->is64Bit() &&
+  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
        // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
        // they cannot be folded into immediate fields.
        // FIXME: This can be improved for kernel and other models?
-      (M == CodeModel::Small || M == CodeModel::Kernel) &&
-      // Base and index reg must be 0 in order to use %rip as base and lowering
-      // must allow RIP.
-      !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    // Base and index reg must be 0 in order to use %rip as base.
+    if (AM.hasBaseOrIndexReg())
+      return true;
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
        X86ISelAddressMode Backup = AM;
        AM.GV = G->getGlobal();
@@ -663,11 +663,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
    }
  
    // Handle the case when globals fit in our immediate field: This is true for
-  // X86-32 always and X86-64 when in -static -mcmodel=small mode.  In 64-bit
-  // mode, this results in a non-RIP-relative computation.
+  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
+  // mode, this only applies to a non-RIP-relative computation.
    if (!Subtarget->is64Bit() ||
-      ((M == CodeModel::Small || M == CodeModel::Kernel) &&
-       TM.getRelocationModel() == Reloc::Static)) {
+      M == CodeModel::Small || M == CodeModel::Kernel) {
+    assert(N.getOpcode() != X86ISD::WrapperRIP &&
+           "RIP-relative addressing already handled");
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
        AM.GV = G->getGlobal();
        AM.Disp += G->getOffset();
@@ -725,6 +726,19 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
    return false;
  }
  
+// Place N no later than Pos in the DAG's node ordering. If N has no ID yet
+// (-1) or an ID greater than Pos's, it is repositioned relative to Pos and
+// given Pos's node ID; otherwise it is left alone. Note that this does *not*
+// preserve the uniqueness of node IDs! The selection DAG must no longer depend
+// on their uniqueness when this is used.
+static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+    DAG.RepositionNode(Pos.getNode(), N.getNode());
+    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+  }
+}
+
  // Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
  // allows us to convert the shift and and into an h-register extract and
  // a scaled index. Returns false if the simplification is performed.
@@ -751,37 +765,17 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
    SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
  
-  // Insert the new nodes into the topological ordering.
-  if (Eight.getNode()->getNodeId() == -1 ||
-      Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
-    DAG.RepositionNode(X.getNode(), Eight.getNode());
-    Eight.getNode()->setNodeId(X.getNode()->getNodeId());
-  }
-  if (NewMask.getNode()->getNodeId() == -1 ||
-      NewMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
-    DAG.RepositionNode(X.getNode(), NewMask.getNode());
-    NewMask.getNode()->setNodeId(X.getNode()->getNodeId());
-  }
-  if (Srl.getNode()->getNodeId() == -1 ||
-      Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
-    DAG.RepositionNode(Shift.getNode(), Srl.getNode());
-    Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
-  }
-  if (And.getNode()->getNodeId() == -1 ||
-      And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), And.getNode());
-    And.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
-  if (ShlCount.getNode()->getNodeId() == -1 ||
-      ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
-    DAG.RepositionNode(X.getNode(), ShlCount.getNode());
-    ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
-  if (Shl.getNode()->getNodeId() == -1 ||
-      Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), Shl.getNode());
-    Shl.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, Eight);
+  InsertDAGNode(DAG, N, Srl);
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, And);
+  InsertDAGNode(DAG, N, ShlCount);
+  InsertDAGNode(DAG, N, Shl);
    DAG.ReplaceAllUsesWith(N, Shl);
    AM.IndexReg = And;
    AM.Scale = (1 << ScaleLog);
@@ -816,22 +810,14 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
    SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
  
-  // Insert the new nodes into the topological ordering.
-  if (NewMask.getNode()->getNodeId() == -1 ||
-      NewMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
-    DAG.RepositionNode(X.getNode(), NewMask.getNode());
-    NewMask.getNode()->setNodeId(X.getNode()->getNodeId());
-  }
-  if (NewAnd.getNode()->getNodeId() == -1 ||
-      NewAnd.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
-    DAG.RepositionNode(Shift.getNode(), NewAnd.getNode());
-    NewAnd.getNode()->setNodeId(Shift.getNode()->getNodeId());
-  }
-  if (NewShift.getNode()->getNodeId() == -1 ||
-      NewShift.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), NewShift.getNode());
-    NewShift.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, NewAnd);
+  InsertDAGNode(DAG, N, NewShift);
    DAG.ReplaceAllUsesWith(N, NewShift);
  
    AM.Scale = 1 << ShiftAmt;
@@ -912,7 +898,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
    APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
                                                 MaskLZ);
    APInt KnownZero, KnownOne;
-  DAG.ComputeMaskedBits(X, MaskedHighBits, KnownZero, KnownOne);
-  if (MaskedHighBits != KnownZero) return true;
+  DAG.ComputeMaskedBits(X, KnownZero, KnownOne); // All bits, not just mask.
+  if ((KnownZero & MaskedHighBits) != MaskedHighBits) return true;
  
    // We've identified a pattern that can be transformed into a single shift
@@ -922,11 +908,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
      assert(X.getValueType() != VT);
      // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
      SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
-    if (NewX.getNode()->getNodeId() == -1 ||
-        NewX.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-      DAG.RepositionNode(N.getNode(), NewX.getNode());
-      NewX.getNode()->setNodeId(N.getNode()->getNodeId());
-    }
+    InsertDAGNode(DAG, N, NewX);
      X = NewX;
    }
    DebugLoc DL = N.getDebugLoc();
@@ -934,26 +916,16 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
    SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
    SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
    SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
-  if (NewSRLAmt.getNode()->getNodeId() == -1 ||
-      NewSRLAmt.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), NewSRLAmt.getNode());
-    NewSRLAmt.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
-  if (NewSRL.getNode()->getNodeId() == -1 ||
-      NewSRL.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), NewSRL.getNode());
-    NewSRL.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
-  if (NewSHLAmt.getNode()->getNodeId() == -1 ||
-      NewSHLAmt.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), NewSHLAmt.getNode());
-    NewSHLAmt.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
-  if (NewSHL.getNode()->getNodeId() == -1 ||
-      NewSHL.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-    DAG.RepositionNode(N.getNode(), NewSHL.getNode());
-    NewSHL.getNode()->setNodeId(N.getNode()->getNodeId());
-  }
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewSRLAmt);
+  InsertDAGNode(DAG, N, NewSRL);
+  InsertDAGNode(DAG, N, NewSHLAmt);
+  InsertDAGNode(DAG, N, NewSHL);
    DAG.ReplaceAllUsesWith(N, NewSHL);
  
    AM.Scale = 1 << AMShiftAmt;
@@ -1180,16 +1152,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
      AM.Scale = 1;
  
      // Insert the new nodes into the topological ordering.
-    if (Zero.getNode()->getNodeId() == -1 ||
-        Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-      CurDAG->RepositionNode(N.getNode(), Zero.getNode());
-      Zero.getNode()->setNodeId(N.getNode()->getNodeId());
-    }
-    if (Neg.getNode()->getNodeId() == -1 ||
-        Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
-      CurDAG->RepositionNode(N.getNode(), Neg.getNode());
-      Neg.getNode()->setNodeId(N.getNode()->getNodeId());
-    }
+    InsertDAGNode(*CurDAG, N, Zero);
+    InsertDAGNode(*CurDAG, N, Neg);
      return false;
    }
  
@@ -1691,7 +1655,7 @@ enum AtomicSz {
    AtomicSzEnd
  };
  
-static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
    {
      X86::LOCK_OR8mi,
      X86::LOCK_OR8mr,
@@ -1885,6 +1849,142 @@ static bool HasNoSignedComparisonUses(SDNode *N) {
    return true;
  }
  
+/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
+/// is suitable for doing the {load; increment or decrement; store} to modify
+/// transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, 
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the load?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // are there other uses of the loaded value than the inc or dec?
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();    
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && 
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+  
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+                                   MVT::Other, &ChainOps[0], ChainOps.size());
+  }
+  if (!ChainCheck)
+    return false;
+
+  return true;
+}
+
+/// getFusedLdStOpcode - Get the X86 memory-operand opcode (DEC*m / INC*m) for
+/// a fused load-op-store of type LdVT. Opc must be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  if (Opc == X86ISD::DEC) {
+    if (LdVT == MVT::i64) return X86::DEC64m;
+    if (LdVT == MVT::i32) return X86::DEC32m;
+    if (LdVT == MVT::i16) return X86::DEC16m;
+    if (LdVT == MVT::i8)  return X86::DEC8m;
+  } else {
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+    if (LdVT == MVT::i64) return X86::INC64m;
+    if (LdVT == MVT::i32) return X86::INC32m;
+    if (LdVT == MVT::i16) return X86::INC16m;
+    if (LdVT == MVT::i8)  return X86::INC8m;
+  }
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// SelectGather - Customized ISel for GATHER operations.
+/// Returns the new machine node, or null if the scale is not a constant.
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+  // Operands 2-6 of Gather: VSrc, Base, VIdx, VMask, Scale (0 is the chain)
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!Scale)
+    return 0;
+
+  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+                                   MVT::Other);
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+                          Disp, Segment, VMask, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           VTs, Ops, array_lengthof(Ops));
+  // Node has 2 outputs: VDst and MVT::Other.
+  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+  // of ResNode.
+  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+  return ResNode;
+}
+
  SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
    EVT NVT = Node->getValueType(0);
    unsigned Opc, MOpc;
@@ -1900,23 +2000,82 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
  
    switch (Opcode) {
    default: break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_avx2_gather_d_pd:
+    case Intrinsic::x86_avx2_gather_d_pd_256:
+    case Intrinsic::x86_avx2_gather_q_pd:
+    case Intrinsic::x86_avx2_gather_q_pd_256:
+    case Intrinsic::x86_avx2_gather_d_ps:
+    case Intrinsic::x86_avx2_gather_d_ps_256:
+    case Intrinsic::x86_avx2_gather_q_ps:
+    case Intrinsic::x86_avx2_gather_q_ps_256:
+    case Intrinsic::x86_avx2_gather_d_q:
+    case Intrinsic::x86_avx2_gather_d_q_256:
+    case Intrinsic::x86_avx2_gather_q_q:
+    case Intrinsic::x86_avx2_gather_q_q_256:
+    case Intrinsic::x86_avx2_gather_d_d:
+    case Intrinsic::x86_avx2_gather_d_d_256:
+    case Intrinsic::x86_avx2_gather_q_d:
+    case Intrinsic::x86_avx2_gather_q_d_256: {
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Impossible intrinsic");
+      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
+      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
+      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
+      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
+      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
+      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
+      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
+      }
+      SDNode *RetVal = SelectGather(Node, Opc);
+      if (RetVal)
+        // We already called ReplaceUses inside SelectGather.
+        return NULL;
+      break;
+    }
+    }
+    break;
+  }
    case X86ISD::GlobalBaseReg:
      return getGlobalBaseReg();
  
+
    case X86ISD::ATOMOR64_DAG:
-    return SelectAtomic64(Node, X86::ATOMOR6432);
    case X86ISD::ATOMXOR64_DAG:
-    return SelectAtomic64(Node, X86::ATOMXOR6432);
    case X86ISD::ATOMADD64_DAG:
-    return SelectAtomic64(Node, X86::ATOMADD6432);
    case X86ISD::ATOMSUB64_DAG:
-    return SelectAtomic64(Node, X86::ATOMSUB6432);
    case X86ISD::ATOMNAND64_DAG:
-    return SelectAtomic64(Node, X86::ATOMNAND6432);
    case X86ISD::ATOMAND64_DAG:
-    return SelectAtomic64(Node, X86::ATOMAND6432);
-  case X86ISD::ATOMSWAP64_DAG:
-    return SelectAtomic64(Node, X86::ATOMSWAP6432);
+  case X86ISD::ATOMSWAP64_DAG: {
+    unsigned Opc;
+    switch (Opcode) {
+    default: llvm_unreachable("Impossible intrinsic");
+    case X86ISD::ATOMOR64_DAG:   Opc = X86::ATOMOR6432;   break;
+    case X86ISD::ATOMXOR64_DAG:  Opc = X86::ATOMXOR6432;  break;
+    case X86ISD::ATOMADD64_DAG:  Opc = X86::ATOMADD6432;  break;
+    case X86ISD::ATOMSUB64_DAG:  Opc = X86::ATOMSUB6432;  break;
+    case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+    case X86ISD::ATOMAND64_DAG:  Opc = X86::ATOMAND6432;  break;
+    case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+    }
+    SDNode *RetVal = SelectAtomic64(Node, Opc);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
  
    case ISD::ATOMIC_LOAD_ADD: {
      SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
@@ -2005,7 +2164,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
      return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
                                  getI8Imm(ShlVal));
-    break;
    }
    case X86ISD::UMUL: {
      SDValue N0 = Node->getOperand(0);
@@ -2076,7 +2234,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      }
  
      SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
-                                            N0, SDValue()).getValue(1);
+                                          N0, SDValue()).getValue(1);
  
      if (foldedLoad) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
@@ -2116,7 +2274,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      // Copy the low half of the result, if it is needed.
      if (!SDValue(Node, 0).use_empty()) {
        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                                LoReg, NVT, InFlag);
+                                              LoReg, NVT, InFlag);
        InFlag = Result.getValue(2);
        ReplaceUses(SDValue(Node, 0), Result);
        DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
@@ -2307,7 +2465,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
  
          // On x86-32, only the ABCD registers have 8-bit subregisters.
          if (!Subtarget->is64Bit()) {
-          TargetRegisterClass *TRC = 0;
+          const TargetRegisterClass *TRC;
            switch (N0.getValueType().getSimpleVT().SimpleTy) {
            case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
            case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
@@ -2336,7 +2494,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
          SDValue Reg = N0.getNode()->getOperand(0);
  
          // Put the value in an ABCD register.
-        TargetRegisterClass *TRC = 0;
+        const TargetRegisterClass *TRC;
          switch (N0.getValueType().getSimpleVT().SimpleTy) {
          case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
@@ -2393,9 +2551,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      break;
    }
    case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
      // The DEC64m tablegen pattern is currently not able to match the case where
-    // the EFLAGS on the original DEC are used.
-    // we'll need to improve tablegen to allow flags to be transferred from a
+    // the EFLAGS on the original DEC are used. (This also applies to 
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
      // node in the pattern to the result node.  probably with a new keyword
      // for example, we have this
      // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
@@ -2405,42 +2567,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
      //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
      //   (transferrable EFLAGS)]>;
+
      StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
-    SDValue Chain = StoreNode->getOperand(0);
      SDValue StoredVal = StoreNode->getOperand(1);
-    SDValue Address = StoreNode->getOperand(2);
-    SDValue Undef = StoreNode->getOperand(3);
-
-    if (StoreNode->getMemOperand()->getSize() != 8 ||
-        Undef->getOpcode() != ISD::UNDEF ||
-        Chain->getOpcode() != ISD::LOAD ||
-        StoredVal->getOpcode() != X86ISD::DEC ||
-        StoredVal.getResNo() != 0 ||
-        StoredVal->getOperand(0).getNode() != Chain.getNode())
-      break;
-
-    //OPC_CheckPredicate, 1, // Predicate_nontemporalstore
-    if (StoreNode->isNonTemporal())
-      break;
+    unsigned Opc = StoredVal->getOpcode();
  
-    LoadSDNode *LoadNode = cast<LoadSDNode>(Chain.getNode());
-    if (LoadNode->getOperand(1) != Address ||
-        LoadNode->getOperand(2) != Undef)
+    LoadSDNode *LoadNode = 0;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
        break;
  
-    if (!ISD::isNormalLoad(LoadNode))
-      break;
-
-    if (!ISD::isNormalStore(StoreNode))
-      break;
-
-    // check load chain has only one use (from the store)
-    if (!Chain.hasOneUse())
-      break;
-
-    // Merge the input chains if they are not intra-pattern references.
-    SDValue InputChain = LoadNode->getOperand(0);
-
      SDValue Base, Scale, Index, Disp, Segment;
      if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
                      Base, Scale, Index, Disp, Segment))
@@ -2450,7 +2587,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
      MemOp[0] = StoreNode->getMemOperand();
      MemOp[1] = LoadNode->getMemOperand();
      const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
-    MachineSDNode *Result = CurDAG->getMachineNode(X86::DEC64m,
+    EVT LdVT = LoadNode->getMemoryVT();    
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
                                                     Node->getDebugLoc(),
                                                     MVT::i32, MVT::Other, Ops,
                                                     array_lengthof(Ops));
@@ -2501,6 +2640,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
  /// X86-specific DAG, ready for instruction scheduling.
  ///
  FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
-                                     llvm::CodeGenOpt::Level OptLevel) {
+                                     CodeGenOpt::Level OptLevel) {
    return new X86DAGToDAGISel(TM, OptLevel);
  }