Remove all local variables from X86SelectionDAGInfo, the DAG has

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 0ed30f5a6f054a13fcc543b3b36b767822e94cde..24cc828aca100147848d8ab288052d7a3a61fd7a 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -22,6 +22,7 @@
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
  #include "llvm/ADT/VariadicFunction.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -177,29 +178,26 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
    return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
  }
  
-static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  bool is64Bit = Subtarget->is64Bit();
-
-  if (Subtarget->isTargetMacho()) {
-    if (is64Bit)
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getArch() == Triple::x86_64)
        return new X86_64MachoTargetObjectFile();
      return new TargetLoweringObjectFileMachO();
    }
  
-  if (Subtarget->isTargetLinux())
+  if (TT.isOSLinux())
      return new X86LinuxTargetObjectFile();
-  if (Subtarget->isTargetELF())
+  if (TT.isOSBinFormatELF())
      return new TargetLoweringObjectFileELF();
-  if (Subtarget->isTargetKnownWindowsMSVC())
+  if (TT.isKnownWindowsMSVCEnvironment())
      return new X86WindowsTargetObjectFile();
-  if (Subtarget->isTargetCOFF())
+  if (TT.isOSBinFormatCOFF())
      return new TargetLoweringObjectFileCOFF();
    llvm_unreachable("unknown subtarget type");
  }
  
  X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(TM)) {
+  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
    Subtarget = &TM.getSubtarget<X86Subtarget>();
    X86ScalarSSEf64 = Subtarget->hasSSE2();
    X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -1037,6 +1035,10 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
  
      setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
+
+    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
    }
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
@@ -1065,11 +1067,14 @@ void X86TargetLowering::resetOperationActions() {
      // FIXME: Do we need to handle scalar-to-vector here?
      setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
  
-    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors. We don't need to custom lower
+    // some vselects for now.
      setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
  
      // i8 and i16 vectors are custom , because the source register and source
      // source memory operand types are not the same width.  f32 vectors are
@@ -1185,10 +1190,10 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
      setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
  
-    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
  
      setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
@@ -1233,6 +1238,7 @@ void X86TargetLowering::resetOperationActions() {
        setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
        setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
  
+      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
        setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
      } else {
        setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
@@ -1520,6 +1526,15 @@ void X86TargetLowering::resetOperationActions() {
      }
    }
  
+  if (Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
    // We have target-specific dag combine patterns for the following nodes:
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1544,6 +1559,7 @@ void X86TargetLowering::resetOperationActions() {
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SINT_TO_FP);
    setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    if (Subtarget->is64Bit())
      setTargetDAGCombine(ISD::MUL);
    setTargetDAGCombine(ISD::XOR);
@@ -2288,22 +2304,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
      InVals.push_back(ArgValue);
    }
  
-  // The x86-64 ABIs require that for returning structs by value we copy
-  // the sret argument into %rax/%eax (depending on ABI) for the return.
-  // Win32 requires us to put the sret argument to %eax as well.
-  // Save the argument into a virtual register so that we can access it
-  // from the return points.
-  if (MF.getFunction()->hasStructRetAttr() &&
-      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
-    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
-    unsigned Reg = FuncInfo->getSRetReturnReg();
-    if (!Reg) {
-      MVT PtrTy = getPointerTy();
-      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
-      FuncInfo->setSRetReturnReg(Reg);
+  if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      // The x86-64 ABIs require that for returning structs by value we copy
+      // the sret argument into %rax/%eax (depending on ABI) for the return.
+      // Win32 requires us to put the sret argument to %eax as well.
+      // Save the argument into a virtual register so that we can access it
+      // from the return points.
+      if (Ins[i].Flags.isSRet()) {
+        unsigned Reg = FuncInfo->getSRetReturnReg();
+        if (!Reg) {
+          MVT PtrTy = getPointerTy();
+          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+          FuncInfo->setSRetReturnReg(Reg);
+        }
+        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+        break;
+      }
      }
-    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
    }
  
    unsigned StackSize = CCInfo.getNextStackOffset();
@@ -3945,14 +3964,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
  
    unsigned CorrectPosV1 = 0;
    unsigned CorrectPosV2 = 0;
-  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+    if (Mask[i] == -1) {
+      ++CorrectPosV1;
+      ++CorrectPosV2;
+      continue;
+    }
+
      if (Mask[i] == i)
        ++CorrectPosV1;
      else if (Mask[i] == i + 4)
        ++CorrectPosV2;
+  }
  
    if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
-    // We have 3 elements from one vector, and one from another.
+    // We have 3 elements (undefs count as elements from any vector) from one
+    // vector, and one from another.
      return true;
  
    return false;
@@ -4678,11 +4705,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
    return getInsertVINSERTImmediate(N, 256);
  }
  
+/// isZero - Returns true if V is a constant integer whose value is zero.
+static bool isZero(SDValue V) {
+  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V))
+    return Const->isNullValue();
+  return false;
+}
+
  /// isZeroNode - Returns true if Elt is a constant zero or a floating point
  /// constant +0.0.
  bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
    if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
      return CFP->getValueAPF().isPosZero();
    return false;
@@ -5425,6 +5458,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
    return V;
  }
  
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+///
+/// Tries to express the build_vector as a single X86ISD::INSERTPS of a
+/// v4f32/v4i32 vector V with itself. This is attempted only with SSE4.1 and
+/// only when every operand that is neither undef nor zero is an
+/// EXTRACT_VECTOR_ELT of the same V with a constant index, and at most one of
+/// those operands ends up in a position different from the one it was
+/// extracted from. Returns an empty SDValue when the pattern does not apply.
+///
+/// NonZeros is presumably a bitmask with one bit per non-zero operand (it is
+/// computed by the caller); its complement is used as the zero mask of the
+/// INSERTPS immediate. NumZero and TLI are currently unused.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+                                     unsigned NonZeros, unsigned NumNonZero,
+                                     unsigned NumZero, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget,
+                                     const TargetLowering &TLI) {
+  // The caller is expected to guarantee there is at least one non-zero
+  // element, otherwise this scan would run past the last operand.
+  unsigned FirstNonZeroIdx = 0;
+  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+         X86::isZeroNode(FirstNonZero)) {
+    ++FirstNonZeroIdx;
+    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  }
+
+  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+    return SDValue();
+
+  SDValue V = FirstNonZero.getOperand(0);
+  MVT VVT = V.getSimpleValueType();
+  if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
+    return SDValue();
+
+  // "Dst" below is the index the element was extracted from in V; "Idx" is
+  // the position it occupies in the build_vector.
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+    SDValue Elem = Op.getOperand(Idx);
+    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+      continue;
+
+    // TODO: What else can be here? Deal with it.
+    // The extract index must be a constant as well: the cast<> below would
+    // otherwise fail on an extract with a variable index.
+    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elem.getOperand(1)))
+      return SDValue();
+
+    // TODO: Some optimizations are still possible here
+    // ex: Getting one element from a vector, and the rest from another.
+    if (Elem.getOperand(0) != V)
+      return SDValue();
+
+    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+    if (Dst == Idx)
+      ++CorrectIdx;
+    else if (IncorrectIdx == -1U) {
+      IncorrectIdx = Idx;
+      IncorrectDst = Dst;
+    } else
+      // There was already one element with an incorrect index.
+      // We can't optimize this case to an insertps.
+      return SDValue();
+  }
+
+  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+    SDLoc dl(Op);
+    EVT VT = Op.getSimpleValueType();
+    // INSERTPS immediate: bits 7:6 select the source element, bits 5:4 the
+    // destination slot and bits 3:0 the elements to zero.
+    unsigned ElementMoveMask = 0;
+    if (IncorrectIdx == -1U)
+      // Everything is already in place: move an element onto itself.
+      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+    else
+      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+    SDValue InsertpsMask =
+        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  }
+
+  return SDValue();
+}
+
  /// getVShift - Return a vector logical shift node.
  ///
  static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -6175,6 +6281,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      if (V.getNode()) return V;
    }
  
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4) {
+    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+                                      NumZero, DAG, Subtarget, *this);
+    if (V.getNode())
+      return V;
+  }
+
    // If element VT is == 32 bits, turn it into a number of shuffles.
    SmallVector<SDValue, 8> V(NumElems);
    if (NumElems == 4 && NumZero > 0) {
@@ -6320,38 +6434,30 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
    return LowerAVXCONCAT_VECTORS(Op, DAG);
  }
  
-// Try to lower a shuffle node into a simple blend instruction.
-static SDValue
-LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
-                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  MVT VT = SVOp->getSimpleValueType(0);
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+                        bool hasInt256, unsigned *MaskOut = nullptr) {
    MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
  
    // There is no blend with immediate in AVX-512.
    if (VT.is512BitVector())
-    return SDValue();
+    return false;
  
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
+  if (!hasSSE41 || EltVT == MVT::i8)
+    return false;
+  if (!hasInt256 && VT == MVT::v16i16)
+    return false;
  
-  // Check the mask for BLEND and build the value.
    unsigned MaskValue = 0;
+  unsigned NumElems = VT.getVectorNumElements();
    // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems-1)/8 + 1;
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
    unsigned NumElemsInLane = NumElems / NumLanes;
  
    // Blend for v16i16 should be symetric for the both lanes.
    for (unsigned i = 0; i < NumElemsInLane; ++i) {
  
-    int SndLaneEltIdx = (NumLanes == 2) ?
-      SVOp->getMaskElt(i + NumElemsInLane) : -1;
-    int EltIdx = SVOp->getMaskElt(i);
+    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+    int EltIdx = MaskVals[i];
  
      if ((EltIdx < 0 || EltIdx == (int)i) &&
          (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
@@ -6360,11 +6466,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
      if (((unsigned)EltIdx == (i + NumElems)) &&
          (SndLaneEltIdx < 0 ||
           (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
-      MaskValue |= (1<<i);
+      MaskValue |= (1 << i);
      else
-      return SDValue();
+      return false;
    }
  
+  if (MaskOut)
+    *MaskOut = MaskValue;
+  return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// ShuffleVectorSDNode.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          unsigned MaskValue,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+                     Subtarget->hasInt256() && "Trying to lower a "
+                                               "VECTOR_SHUFFLE to a Blend but "
+                                               "with the wrong mask"));
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+
    // Convert i32 vectors to floating point if it is not AVX2.
    // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
    MVT BlendVT = VT;
@@ -6612,7 +6741,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
        ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
        NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                    NewV.getOperand(0),
@@ -6636,7 +6765,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
        ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
        NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                    NewV.getOperand(0),
@@ -6867,6 +6996,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
    unsigned Scale;
    switch (VT.SimpleTy) {
    default: llvm_unreachable("Unexpected!");
+  case MVT::v2i64:
+  case MVT::v2f64:
+           return SDValue(SVOp, 0);
    case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
    case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
    case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
@@ -7302,6 +7434,23 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }
  
+/// Build a scalar load of a single element of the vector loaded by Load.
+/// The new load reuses Load's chain and reads from Load's address advanced by
+/// Index times the store size of the vector element type.
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT EltVT = Load->getSimpleValueType(0).getVectorElementType();
+
+  // Advance the original address to the requested element.
+  SDValue BasePtr = Load->getOperand(1);
+  MVT PtrVT = BasePtr.getSimpleValueType();
+  SDValue EltOffset = DAG.getConstant(Index * EltVT.getStoreSize(), PtrVT);
+  SDValue EltPtr = DAG.getNode(ISD::ADD, dl, PtrVT, BasePtr, EltOffset);
+
+  // The memory operand is derived from the original one, resized to a single
+  // element.
+  return DAG.getLoad(EltVT, dl, Load->getChain(), EltPtr,
+                     DAG.getMachineFunction().getMachineMemOperand(
+                         Load->getMemOperand(), 0, EltVT.getStoreSize()));
+}
+
  // It is only safe to call this function if isINSERTPSMask is true for
  // this shufflevector mask.
  static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7310,10 +7459,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
    // v4f32 or when copying a member from one v4f32 to another.
    // We also use it for transferring i32 from one register to another,
    // since it simply copies the same bits.
-  // If we're transfering an i32 from memory to a specific element in a
+  // If we're transferring an i32 from memory to a specific element in a
    // register, we output a generic DAG that will match the PINSRD
    // instruction.
-  // TODO: Optimize for AVX cases too (VINSERTPS)
    MVT VT = SVOp->getSimpleValueType(0);
    MVT EVT = VT.getVectorElementType();
    SDValue V1 = SVOp->getOperand(0);
@@ -7322,8 +7470,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
    assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
           "unsupported vector type for insertps/pinsrd");
  
-  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i < 4; });
+  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+  auto FromV2Predicate = [](const int &i) { return i >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
  
    SDValue From;
    SDValue To;
@@ -7331,32 +7480,27 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
    if (FromV1 == 1) {
      From = V1;
      To = V2;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i < 4; }) -
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
                  Mask.begin();
    } else {
+    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+           "More than one element from V1 and from V2, or no elements from one "
+           "of the vectors. This case should not have returned true from "
+           "isINSERTPSMask");
      From = V2;
      To = V1;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i >= 4; }) -
-                Mask.begin();
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
    }
  
    if (MayFoldLoad(From)) {
      // Trivial case, when From comes from a load and is only used by the
      // shuffle. Make it use insertps from the vector that we need from that
      // load.
-    SDValue Addr = From.getOperand(1);
-    SDValue NewAddr =
-        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                    Addr.getSimpleValueType()));
-
-    LoadSDNode *Load = cast<LoadSDNode>(From);
      SDValue NewLoad =
-        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                    DAG.getMachineFunction().getMachineMemOperand(
-                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
  
      if (EVT == MVT::f32) {
        // Create this as a scalar to vector to match the instruction pattern.
@@ -7468,9 +7612,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                       DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
  }
  
-static SDValue
-NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
-                       SelectionDAG &DAG) {
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
@@ -7495,31 +7638,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
  
    // If the shuffle can be profitably rewritten as a narrower shuffle, then
    // do it!
-  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
-      VT == MVT::v16i16 || VT == MVT::v32i8) {
+  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+      VT == MVT::v32i8) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
      if (NewOp.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
-  } else if ((VT == MVT::v4i32 ||
-             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
      // FIXME: Figure out a cleaner way to do this.
-    // Try to make use of movq to zero out the top part.
      if (ISD::isBuildVectorAllZeros(V2.getNode())) {
        SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
        if (NewOp.getNode()) {
          MVT NewVT = NewOp.getSimpleValueType();
          if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                                 NewVT, true, false))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
-                              DAG, Subtarget, dl);
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+                              dl);
        }
      } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
        SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
        if (NewOp.getNode()) {
          MVT NewVT = NewOp.getSimpleValueType();
          if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
-                              DAG, Subtarget, dl);
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+                              dl);
        }
      }
    }
@@ -7792,9 +7933,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                  V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
  
-  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
-  if (BlendOp.getNode())
-    return BlendOp;
+  unsigned MaskValue;
+  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
+                  &MaskValue))
+    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
  
    if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
      return getINSERTPS(SVOp, dl, DAG);
@@ -7864,6 +8006,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    return SDValue();
  }
  
+// Derive a blend immediate from a constant vselect condition.
+// The argument must be a BUILD_VECTOR whose operands are all constants or
+// undefs, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is true.
+// On success, bit i of MaskValue is set when element i takes the second
+// blend operand. Returns false when the two lanes of the condition disagree
+// on some element and therefore cannot share one immediate.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned OperandCount = BuildVector->getNumOperands();
+  // There are 2 lanes if (OperandCount > 8), and 1 lane otherwise.
+  unsigned NumLanes = (OperandCount - 1) / 8 + 1;
+  unsigned LaneWidth = OperandCount / NumLanes;
+
+  // Tri-state view of one condition element: -1 when it is not a constant
+  // (undef), 0 for a constant zero and 1 for any other constant.
+  auto CondState = [](SDValue Elt) -> int {
+    return isa<ConstantSDNode>(Elt) ? !isZero(Elt) : -1;
+  };
+
+  // Blend for v16i16 should be symmetric for both lanes.
+  for (unsigned i = 0; i != LaneWidth; ++i) {
+    int First = CondState(BuildVector->getOperand(i));
+    int Second = First;
+    if (NumLanes == 2)
+      Second = CondState(BuildVector->getOperand(i + LaneWidth));
+
+    // Pick whichever lane constrains this element; an undef lane defers to
+    // the other one, two defined lanes have to agree.
+    int Chosen;
+    if (First == Second || Second < 0)
+      Chosen = First;
+    else if (First < 0)
+      Chosen = Second;
+    else
+      return false;
+
+    // A non-zero condition selects the first argument, a zero condition the
+    // second. The immediate encodes the first argument as 0 and the second
+    // as 1, so the condition is inverted here.
+    MaskValue |= !Chosen << i;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+//
+// This succeeds only when SSE4.1 is available, the type is not a 512-bit
+// vector, has no i8 elements, is not v16i16 without hasInt256(), and the
+// condition is a build_vector of constants/undefs that
+// BUILD_VECTORtoBlendMask can turn into an immediate. Returns an empty
+// SDValue otherwise.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    // The operands must be cast to the blend type, not to VT (which they
+    // already have), so that they match the type of the BLENDI node.
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  // Cast the result back to the type the vselect produced.
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+// Custom lowering entry point for VSELECT: use a blend with immediate when
+// possible, otherwise either hand the node back for expansion or keep it.
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Blend = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (Blend.getNode())
+    return Blend;
+
+  // v8i16 and v16i16 vselects were previously set to Expand, not Legal or
+  // Custom. Returning an empty SDValue makes them fall through to Expand
+  // after the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  if (VT == MVT::v8i16 || VT == MVT::v16i16)
+    return SDValue();
+
+  // No "Blend with immediate" node could be created. The node is still
+  // legal as it is, but a blendv* instruction will have to be emitted.
+  return Op;
+}
+
  static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
@@ -8724,10 +8969,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      // Windows 64bit: gs:0x58
      // Windows 32bit: fs:__tls_array
  
-    // If GV is an alias then use the aliasee for determining
-    // thread-localness.
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GV = GA->getAliasedGlobal();
      SDLoc dl(GA);
      SDValue Chain = DAG.getEntryNode();
  
@@ -9921,7 +10162,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
      // If we have a constant logical shift that's only used in a comparison
      // against zero turn it into an equivalent AND. This allows turning it into
      // a TEST instruction later.
-    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
          isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
        EVT VT = Op.getValueType();
        unsigned BitWidth = VT.getSizeInBits();
@@ -10120,7 +10361,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
          unsigned AndBitWidth = And.getValueSizeInBits();
          if (BitWidth > AndBitWidth) {
            APInt Zeros, Ones;
-          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+          DAG.computeKnownBits(Op0, Zeros, Ones);
            if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
              return SDValue();
          }
@@ -10649,11 +10890,6 @@ static bool isX86LogicalCmp(SDValue Op) {
    return false;
  }
  
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
  static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
    if (V.getOpcode() != ISD::TRUNCATE)
      return false;
@@ -11257,8 +11493,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    }
  
    if (addTest) {
-    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
+    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    Cond = EmitTest(Cond, X86Cond, dl, DAG);
    }
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11527,6 +11764,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
                                            SelectionDAG &DAG) {
    MVT ElementType = VT.getVectorElementType();
  
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
    // Check for ShiftAmt >= element width
    if (ShiftAmt >= ElementType.getSizeInBits()) {
      if (Opc == X86ISD::VSRAI)
@@ -12322,27 +12563,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                             SDValue Base, SDValue Index,
-                             SDValue ScaleOp, SDValue Chain,
-                             const X86Subtarget * Subtarget) {
-  SDLoc dl(Op);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
-  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
-  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
-                             Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
-  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
-  SDValue Segment = DAG.getRegister(0, MVT::i32);
-  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
-  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
-  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
-  return DAG.getMergeValues(RetOps, dl);
-}
-
-static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                SDValue Src, SDValue Mask, SDValue Base,
                                SDValue Index, SDValue ScaleOp, SDValue Chain,
                                const X86Subtarget * Subtarget) {
@@ -12352,7 +12572,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
    EVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
    SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -12365,8 +12590,8 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
  }
  
  static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                              SDValue Src, SDValue Base, SDValue Index,
-                              SDValue ScaleOp, SDValue Chain) {
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
    SDLoc dl(Op);
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
    assert(C && "Invalid scale type");
@@ -12375,29 +12600,39 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    SDValue Segment = DAG.getRegister(0, MVT::i32);
    EVT MaskVT = MVT::getVectorVT(MVT::i1,
                               Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
    SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
    SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
    return SDValue(Res, 1);
  }
  
-static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                               SDValue Src, SDValue Mask, SDValue Base,
-                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,  // Builds the machine node Opc from a mask, an address (Base/Scale/Index) and a chain.
+                               SDValue Mask, SDValue Base, SDValue Index,
+                               SDValue ScaleOp, SDValue Chain) {  // ScaleOp must be a ConstantSDNode (asserted below).
    SDLoc dl(Op);
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
    assert(C && "Invalid scale type");
    SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
    SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
    SDValue Segment = DAG.getRegister(0, MVT::i32);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
-                             Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
-  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
-  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
-  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
-  return SDValue(Res, 1);
+  EVT MaskVT =  // One i1 lane per element of the Index vector.
+    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);  // Null unless Mask is a constant node.
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);  // Constant mask: emit it as a target constant of type MaskVT.
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);  // Otherwise bitcast the mask value to MaskVT.
+  // The machine node is created with a single MVT::Other result, which is returned.
+  SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+  return SDValue(Res, 0);
  }
  
  // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
@@ -12463,27 +12698,120 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getMergeValues(Results, DL);
  }
  
+enum IntrinsicType {  // Lowering category of an intrinsic; LowerINTRINSIC_W_CHAIN switches on this value.
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+};
+
+struct IntrinsicData {  // Lowering record for one intrinsic: its category plus up to two opcodes.
+  IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+    :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+  IntrinsicType Type;  // Category that selects the lowering path.
+  unsigned      Opc0;  // Primary opcode.
+  unsigned      Opc1;  // Second opcode; in the table built in this file only PREFETCH entries set it, all others pass 0.
+};
+
+std::map < unsigned, IntrinsicData> IntrMap;  // Intrinsic ID -> lowering data; filled by InitIntinsicsMap().
+static void InitIntinsicsMap() {  // Populates IntrMap on the first call; later calls return immediately.
+  static bool Initialized = false;  // NOTE(review): plain flag with no synchronization visible here -- confirm this is never reached concurrently.
+  if (Initialized) 
+    return;
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,  // NOTE(review): same key as the insert just above; std::map::insert keeps the first entry, so this statement has no effect.
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+  // AVX-512 scatter intrinsics (SCATTER entries).
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+                                IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+  // Gather/scatter prefetch intrinsics (PREFETCH entries): Opc0 holds the PF0 opcode, Opc1 the PF1 opcode.
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+                                                        X86::VGATHERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+                                                        X86::VGATHERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+                                                        X86::VGATHERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+                                                        X86::VGATHERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+                                                        X86::VSCATTERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+                                                        X86::VSCATTERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+                                                        X86::VSCATTERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+                                                        X86::VSCATTERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,  // All rdrand widths share the X86ISD::RDRAND opcode.
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,  // All rdseed widths share the X86ISD::RDSEED opcode.
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+                                IntrinsicData(XTEST,  X86ISD::XTEST,  0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,  // rdtsc and rdtscp share the RDTSC category and differ only in Opc0.
+                                IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+                                IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
+  Initialized = true;
+}
+
  static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
-  SDLoc dl(Op);
+  InitIntinsicsMap();
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
+  std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+  if (itr == IntrMap.end())
+    return SDValue();
  
-  // RDRAND/RDSEED intrinsics.
-  case Intrinsic::x86_rdrand_16:
-  case Intrinsic::x86_rdrand_32:
-  case Intrinsic::x86_rdrand_64:
-  case Intrinsic::x86_rdseed_16:
-  case Intrinsic::x86_rdseed_32:
-  case Intrinsic::x86_rdseed_64: {
-    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
-                       IntNo == Intrinsic::x86_rdseed_32 ||
-                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
-                                                            X86ISD::RDRAND;
+  SDLoc dl(Op);
+  IntrinsicData Intr = itr->second;
+  switch(Intr.Type) {
+  case RDSEED:
+  case RDRAND: {
      // Emit the node with the right value type.
      SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
-    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+    SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
  
      // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
      // Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12499,162 +12827,49 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                         SDValue(Result.getNode(), 2));
    }
-  //int_gather(index, base, scale);
-  case Intrinsic::x86_avx512_gather_qpd_512:
-  case Intrinsic::x86_avx512_gather_qps_512:
-  case Intrinsic::x86_avx512_gather_dpd_512:
-  case Intrinsic::x86_avx512_gather_qpi_512:
-  case Intrinsic::x86_avx512_gather_qpq_512:
-  case Intrinsic::x86_avx512_gather_dpq_512:
-  case Intrinsic::x86_avx512_gather_dps_512:
-  case Intrinsic::x86_avx512_gather_dpi_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
-    case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
-    case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
-    }
-    SDValue Chain = Op.getOperand(0);
-    SDValue Index = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
-    SDValue Scale = Op.getOperand(4);
-    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
-  }
-  //int_gather_mask(v1, mask, index, base, scale);
-  case Intrinsic::x86_avx512_gather_qps_mask_512:
-  case Intrinsic::x86_avx512_gather_qpd_mask_512:
-  case Intrinsic::x86_avx512_gather_dpd_mask_512:
-  case Intrinsic::x86_avx512_gather_dps_mask_512:
-  case Intrinsic::x86_avx512_gather_qpi_mask_512:
-  case Intrinsic::x86_avx512_gather_qpq_mask_512:
-  case Intrinsic::x86_avx512_gather_dpi_mask_512:
-  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_gather_qps_mask_512:
-      Opc = X86::VGATHERQPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpd_mask_512:
-      Opc = X86::VGATHERQPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpd_mask_512:
-      Opc = X86::VGATHERDPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dps_mask_512:
-      Opc = X86::VGATHERDPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpi_mask_512:
-      Opc = X86::VPGATHERQDZrm; break;
-    case Intrinsic::x86_avx512_gather_qpq_mask_512:
-      Opc = X86::VPGATHERQQZrm; break;
-    case Intrinsic::x86_avx512_gather_dpi_mask_512:
-      Opc = X86::VPGATHERDDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpq_mask_512:
-      Opc = X86::VPGATHERDQZrm; break;
-    }
+  case GATHER: {
+  //gather(v1, mask, index, base, scale);
      SDValue Chain = Op.getOperand(0);
      SDValue Src   = Op.getOperand(2);
-    SDValue Mask  = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(3);
      SDValue Index = Op.getOperand(4);
-    SDValue Base  = Op.getOperand(5);
+    SDValue Mask  = Op.getOperand(5);
      SDValue Scale = Op.getOperand(6);
-    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+    return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
                            Subtarget);
    }
-  //int_scatter(base, index, v1, scale);
-  case Intrinsic::x86_avx512_scatter_qpd_512:
-  case Intrinsic::x86_avx512_scatter_qps_512:
-  case Intrinsic::x86_avx512_scatter_dpd_512:
-  case Intrinsic::x86_avx512_scatter_qpi_512:
-  case Intrinsic::x86_avx512_scatter_qpq_512:
-  case Intrinsic::x86_avx512_scatter_dpq_512:
-  case Intrinsic::x86_avx512_scatter_dps_512:
-  case Intrinsic::x86_avx512_scatter_dpi_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_scatter_qpd_512:
-      Opc = X86::VSCATTERQPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qps_512:
-      Opc = X86::VSCATTERQPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpd_512:
-      Opc = X86::VSCATTERDPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_dps_512:
-      Opc = X86::VSCATTERDPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpi_512:
-      Opc = X86::VPSCATTERQDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpq_512:
-      Opc = X86::VPSCATTERQQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpq_512:
-      Opc = X86::VPSCATTERDQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpi_512:
-      Opc = X86::VPSCATTERDDZmr; break;
-    }
-    SDValue Chain = Op.getOperand(0);
-    SDValue Base  = Op.getOperand(2);
-    SDValue Index = Op.getOperand(3);
-    SDValue Src   = Op.getOperand(4);
-    SDValue Scale = Op.getOperand(5);
-    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
-  }
-  //int_scatter_mask(base, mask, index, v1, scale);
-  case Intrinsic::x86_avx512_scatter_qps_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
-  case Intrinsic::x86_avx512_scatter_dps_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_scatter_qpd_mask_512:
-      Opc = X86::VSCATTERQPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qps_mask_512:
-      Opc = X86::VSCATTERQPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpd_mask_512:
-      Opc = X86::VSCATTERDPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_dps_mask_512:
-      Opc = X86::VSCATTERDPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpi_mask_512:
-      Opc = X86::VPSCATTERQDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpq_mask_512:
-      Opc = X86::VPSCATTERQQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpq_mask_512:
-      Opc = X86::VPSCATTERDQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpi_mask_512:
-      Opc = X86::VPSCATTERDDZmr; break;
-    }
+  case SCATTER: {
+  //scatter(base, mask, index, v1, scale);
      SDValue Chain = Op.getOperand(0);
      SDValue Base  = Op.getOperand(2);
      SDValue Mask  = Op.getOperand(3);
      SDValue Index = Op.getOperand(4);
      SDValue Src   = Op.getOperand(5);
      SDValue Scale = Op.getOperand(6);
-    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+    return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
+  case PREFETCH: {
+    SDValue Hint = Op.getOperand(6);
+    unsigned HintVal;
+    if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
+        (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
+      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Mask  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
    }
-  // Read Time Stamp Counter (RDTSC).
-  case Intrinsic::x86_rdtsc:
-  // Read Time Stamp Counter and Processor ID (RDTSCP).
-  case Intrinsic::x86_rdtscp: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
-    case Intrinsic::x86_rdtsc:
-      Opc = X86ISD::RDTSC_DAG; break;
-    case Intrinsic::x86_rdtscp:
-      Opc = X86ISD::RDTSCP_DAG; break;
-    }
+  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+  case RDTSC: {
      SmallVector<SDValue, 2> Results;
-    getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results);
+    getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
      return DAG.getMergeValues(Results, dl);
    }
    // XTEST intrinsics.
-  case Intrinsic::x86_xtest: {
+  case XTEST: {
      SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
      SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -12665,6 +12880,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                         Ret, SDValue(InTrans.getNode(), 1));
    }
    }
+  llvm_unreachable("Unknown Intrinsic Type");
  }
  
  SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -12717,6 +12933,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    return FrameAddr;
  }
  
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName,  // Maps "esp" / "rsp" to X86::ESP / X86::RSP.
+                                              EVT VT) const {  // VT is not read anywhere in this body.
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("esp", X86::ESP)
+                       .Case("rsp", X86::RSP)
+                       .Default(0);  // 0 marks a name that matched neither case.
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");  // Any other name is reported as a fatal error.
+}
+
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                       SelectionDAG &DAG) const {
    const X86RegisterInfo *RegInfo =
@@ -13182,6 +13411,58 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  }
  
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {  // Lowers a 128-bit integer div/rem node to a runtime library call (Win64 only, asserted below).
+  assert(Subtarget->isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  RTLIB::Libcall LC;
+  bool isSigned;
+  switch (Op->getOpcode()) {  // Choose the libcall and its signedness from the node's opcode.
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
+  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
+  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
+  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
+  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
+  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+  }
+
+  SDLoc dl(Op);
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {  // Each operand is spilled to its own stack slot and passed by pointer.
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);  // Stack temporary requested with a minimum alignment of 16.
+    Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),  // Stores are threaded through InChain so the call is chained after them.
+                           false, false, 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Ty = PointerType::get(ArgTy,0);  // The argument's IR type is a pointer to the 128-bit integer type.
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC),
+               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),  // The call's return type is declared as v2i64.
+               Callee, &Args, 0)
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);  // Bitcast the v2i64 call result back to the node's 128-bit integer type.
+}
+
  static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
    SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
@@ -14017,6 +14298,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG) {
    MVT SrcVT = Op.getOperand(0).getSimpleValueType();
    MVT DstVT = Op.getSimpleValueType();
+
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (DstVT != MVT::f64)
+      // This conversion needs to be expanded.
+      return SDValue();
+
+    SDValue InVec = Op->getOperand(0);
+    SDLoc dl(Op);
+    unsigned NumElts = SrcVT.getVectorNumElements();
+    EVT SVT = SrcVT.getVectorElementType();
+
+    // Widen the vector in input in the case of MVT::v2i32.
+    // Example: from MVT::v2i32 to MVT::v4i32.
+    SmallVector<SDValue, 16> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+                                 DAG.getIntPtrConstant(i)));
+
+    // Explicitly mark the extra elements as Undef.
+    SDValue Undef = DAG.getUNDEF(SVT);
+    for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
+      Elts.push_back(Undef);
+
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+                       DAG.getIntPtrConstant(0));
+  }
+
    assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
           Subtarget->hasMMX() && "Unexpected custom BITCAST");
    assert((DstVT == MVT::i64 ||
@@ -14132,12 +14444,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
    Type *RetTy = isF64
      ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
      : (Type*)VectorType::get(ArgTy, 4);
-  TargetLowering::
-    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
-                         false, false, false, false, 0,
-                         CallingConv::C, /*isTaillCall=*/false,
-                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
-                         Callee, Args, DAG, dl);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  
    if (isF64)
@@ -14166,6 +14477,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
    case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
    case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
    case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
    case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
    case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -14302,6 +14614,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::SUBE:
      // We don't want to expand or promote these.
      return;
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    Results.push_back(V);
+    return;
+  }
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -14462,8 +14784,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
      return;
    }
-  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_LOAD: {
      ReplaceATOMIC_LOAD(N, Results, DAG);
+    return;
+  }
+  case ISD::BITCAST: {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    EVT DstVT = N->getValueType(0);
+    EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+    if (SrcVT != MVT::f64 ||
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+      return;
+
+    unsigned NumElts = DstVT.getVectorNumElements();
+    EVT SVT = DstVT.getVectorElementType();
+    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   MVT::v2f64, N->getOperand(0));
+    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+
+    SmallVector<SDValue, 8> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+                                   ToVecInt, DAG.getIntPtrConstant(i)));
+
+    Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+  }
    }
  }
  
@@ -14823,7 +15170,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    if (VT.getSizeInBits() == 64)
      return false;
  
-  // FIXME: pshufb, blends, shifts.
+  // If this is a single-input shuffle with no 128 bit lane crossings we can
+  // lower it into pshufb.
+  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+      (SVT.is256BitVector() && Subtarget->hasInt256())) {
+    bool isLegal = true;
+    for (unsigned I = 0, E = M.size(); I != E; ++I) {
+      if (M[I] >= (int)SVT.getVectorNumElements() ||
+          ShuffleCrosses128bitLane(SVT, I, M[I])) {
+        isLegal = false;
+        break;
+      }
+    }
+    if (isLegal)
+      return true;
+  }
+
+  // FIXME: blends, shifts.
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
@@ -14835,7 +15198,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
            isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
            isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
-          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
+          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
  }
  
  bool
@@ -16914,11 +17278,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  //                           X86 Optimization Hooks
  //===----------------------------------------------------------------------===//
  
-void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       APInt &KnownZero,
-                                                       APInt &KnownOne,
-                                                       const SelectionDAG &DAG,
-                                                       unsigned Depth) const {
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
    unsigned BitWidth = KnownZero.getBitWidth();
    unsigned Opc = Op.getOpcode();
    assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -17140,6 +17504,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget *Subtarget) {
    SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
  
    // Don't create instructions with illegal types after legalize types has run.
@@ -17152,6 +17518,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
        N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::BITCAST) {
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+    
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD :
+      case ISD::FADD :
+      case ISD::SUB :
+      case ISD::FSUB :
+      case ISD::MUL :
+      case ISD::FMUL :
+        CanFold = true;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+      }
+    }
+  }
+
    // Only handle 128 wide vector from here on.
    if (!VT.is128BitVector())
      return SDValue();
@@ -17442,6 +17859,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
    return std::make_pair(Opc, NeedSplit);
  }
  
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
  /// nodes.
  static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -17784,7 +18246,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  
        // Another special case: If C was a sign bit, the sub has been
        // canonicalized into a xor.
-      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+      // FIXME: Would it be better to use computeKnownBits to determine whether
        //        it's safe to decanonicalize the xor?
        // x s< 0 ? x^C : 0 --> subus x, C
        if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
@@ -17950,7 +18412,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
    // to simplify previous instructions.
    if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
-      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+      !DCI.isBeforeLegalize() &&
+      // We explicitly check against v8i16 and v16i16 because, although
+      // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care of in a later
+      // condition.
+      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+       VT != MVT::v8i16)) {
      unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
  
      // Don't optimize vector selects that map to mask-registers.
@@ -17977,6 +18445,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        DCI.CommitTargetLoweringOpt(TLO);
    }
  
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512 bit vector in a machine without AVX512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
    return SDValue();
  }
  
@@ -18284,6 +18769,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
+                                                const X86Subtarget *Subtarget) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();
+  // SSE/AVX/AVX2 blend intrinsics.
+  case Intrinsic::x86_avx2_pblendvb:
+  case Intrinsic::x86_avx2_pblendw:
+  case Intrinsic::x86_avx2_pblendd_128:
+  case Intrinsic::x86_avx2_pblendd_256:
+    // Don't try to simplify this intrinsic if we don't have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_avx_blend_pd_256:
+  case Intrinsic::x86_avx_blend_ps_256:
+  case Intrinsic::x86_avx_blendv_pd_256:
+  case Intrinsic::x86_avx_blendv_ps_256:
+    // Don't try to simplify this intrinsic if we don't have AVX.
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_sse41_pblendw:
+  case Intrinsic::x86_sse41_blendpd:
+  case Intrinsic::x86_sse41_blendps:
+  case Intrinsic::x86_sse41_blendvps:
+  case Intrinsic::x86_sse41_blendvpd:
+  case Intrinsic::x86_sse41_pblendvb: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    SDValue Mask = N->getOperand(3);
+
+    // Don't try to simplify this intrinsic if we don't have SSE4.1.
+    if (!Subtarget->hasSSE41())
+      return SDValue();
+
+    // fold (blend A, A, Mask) -> A
+    if (Op0 == Op1)
+      return Op0;
+    // fold (blend A, B, allZeros) -> A
+    if (ISD::isBuildVectorAllZeros(Mask.getNode()))
+      return Op0;
+    // fold (blend A, B, allOnes) -> B
+    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+      return Op1;
+    
+    // Simplify the case where the mask is a constant i32 value.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
+      if (C->isNullValue())
+        return Op0;
+      if (C->isAllOnesValue())
+        return Op1;
+    }
+  }
+
+  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    EVT VT = Op0.getValueType();
+    assert(VT.isVector() && "Expected a vector type!");
+
+    if (isa<BuildVectorSDNode>(Op1))
+      Op1 = Op1.getOperand(0);
+
+    if (!isa<ConstantSDNode>(Op1))
+      return SDValue();
+
+    EVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+
+    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+    const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+    uint64_t ShAmt = C.getZExtValue();
+
+    // Don't try to convert this shift into a ISD::SRA if the shift
+    // count is bigger than or equal to the element size.
+    if (ShAmt >= SVTBits)
+      return SDValue();
+
+    // Trivial case: if the shift count is zero, then fold this
+    // into the first operand.
+    if (ShAmt == 0)
+      return Op0;
+
+    // Replace this packed shift intrinsic with a target independent
+    // shift dag node.
+    SDValue Splat = DAG.getConstant(C, VT);
+    return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+  }
+  }
+}
+
  /// PerformMulCombine - Optimize a single multiply with constant into two
  /// in order to implement it with two cheaper instructions, e.g.
  /// LEA + SHL, LEA + LEA.
@@ -19804,6 +20389,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // Extract the countS bits from the immediate so we can get the proper
+    // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, interps doesn't use
+    // countS and just gets an f32 from that address.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
  // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
  // as "sbb reg,reg", since it can be extended without zext and produces
  // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20083,7 +20695,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
    case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
-  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
    case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
    case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
    case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
@@ -20104,6 +20717,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERM2X128:
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
    case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
    }
  
    return SDValue();