Allow the third argument for the subi family to be an expression.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index ebacc425df0434c04550c2c15bb378dab0286992..782ba5f7a78a8a3dbbb2b64397d367bf45a4cbc8 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -44,11 +44,13 @@
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Target/TargetOptions.h"
  #include <bitset>
+#include <numeric>
  #include <cctype>
  using namespace llvm;
  
@@ -56,6 +58,17 @@ using namespace llvm;
  
  STATISTIC(NumTailCalls, "Number of tail calls");
  
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(false),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
+static cl::opt<bool> ExperimentalVectorShuffleLowering(
+    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    cl::desc("Enable an experimental vector shuffle lowering code path."),
+    cl::Hidden);
+
  // Forward declarations.
  static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                         SDValue V2);
@@ -242,7 +255,7 @@ void X86TargetLowering::resetOperationActions() {
    else
      setSchedulingPreference(Sched::RegPressure);
    const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
    // Bypass expensive divides on Atom when compiling with O2
@@ -502,6 +515,25 @@ void X86TargetLowering::resetOperationActions() {
      }
    }
  
+  // Special handling for half-precision floating point conversions.
+  // If we don't have F16C support, then lower half float conversions
+  // into library calls.
+  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+  }
+
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
    if (Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    } else {
@@ -585,21 +617,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    }
  
-  if (!Subtarget->is64Bit()) {
-    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
-  }
-
    if (Subtarget->hasCmpxchg16b()) {
      setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    }
@@ -642,8 +659,7 @@ void X86TargetLowering::resetOperationActions() {
    setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
    setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
  
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                     MVT::i64 : MVT::i32, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
  
    if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
      // f32 and f64 use SSE.
@@ -872,7 +888,12 @@ void X86TargetLowering::resetOperationActions() {
                            (MVT::SimpleValueType)InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+
+    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
+    // we have to deal with them whether we ask for Expansion or not. Setting
+    // Expand causes its own optimisation problems though, so leave them legal.
+    if (VT.getVectorElementType() == MVT::i1)
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    }
  
    // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -989,6 +1010,22 @@ void X86TargetLowering::resetOperationActions() {
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      }
  
+    // We support custom legalizing of sext and anyext loads for specific
+    // memory vector types which we can load as a scalar (or sequence of
+    // scalars) and extend in-register to a legal 128-bit vector type. For sext
+    // loads these must work with a single scalar load.
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
+    if (Subtarget->is64Bit()) {
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
+    }
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
@@ -1084,6 +1121,12 @@ void X86TargetLowering::resetOperationActions() {
      // some vselects for now.
      setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
  
+    // SSE41 brings specific instructions for doing vector sign extend even in
+    // cases where we don't have SRA.
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+
      // i8 and i16 vectors are custom , because the source register and source
      // source memory operand types are not the same width.  f32 vectors are
      // custom since the immediate controlling the insert encodes additional
@@ -1483,6 +1526,11 @@ void X86TargetLowering::resetOperationActions() {
      }
    }// has  AVX-512
  
+  if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+  }
+
    // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
    // of this type with custom code.
    for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1596,6 +1644,22 @@ void X86TargetLowering::resetOperationActions() {
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
+// This has so far only been implemented for 64-bit MachO.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
+         Subtarget->is64Bit();
+}
+
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
  EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    if (!VT.isVector())
      return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1710,9 +1774,10 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  }
  
  bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                 unsigned,
-                                                 bool *Fast) const {
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
    if (Fast)
      *Fast = Subtarget->isUnalignedMemAccessFast();
    return true;
@@ -1835,8 +1900,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                          const SmallVectorImpl<ISD::OutputArg> &Outs,
                          LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, RetCC_X86);
  }
  
@@ -1855,8 +1919,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
  
    SDValue Flag;
@@ -1902,8 +1965,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
  
      // Returns in ST0/ST1 are handled specially: these are pushed as operands to
      // the RET instruction and handled by the FP Stackifier.
-    if (VA.getLocReg() == X86::ST0 ||
-        VA.getLocReg() == X86::ST1) {
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
        // If this is a copy from an xmm register to ST(0), use an FPExtend to
        // change the value to the FP stack register class.
        if (isScalarFPTypeInSSEReg(VA.getValVT()))
@@ -1999,8 +2062,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
    return true;
  }
  
-MVT
-X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+EVT
+X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
    MVT ReturnMVT;
    // TODO: Is this also valid on 32-bit?
@@ -2009,7 +2072,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
    else
      ReturnMVT = MVT::i32;
  
-  MVT MinVT = getRegisterType(ReturnMVT);
+  EVT MinVT = getRegisterType(Context, ReturnMVT);
    return VT.bitsLT(MinVT) ? MinVT : VT;
  }
  
@@ -2026,8 +2089,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
    bool Is64Bit = Subtarget->is64Bit();
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
  
    // Copy all of the result registers out of their specified physreg.
@@ -2041,33 +2104,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        report_fatal_error("SSE register return with SSE disabled");
      }
  
-    SDValue Val;
-
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a CopyFromReg is not good enough - the copy instruction may be eliminated
-    // if the return value is not used. We use the FpPOP_RETVAL instruction
-    // instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
-      SDValue Ops[] = { Chain, InFlag };
-      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
-                                         MVT::Other, MVT::Glue, Ops), 1);
-      Val = Chain.getValue(0);
-
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register.
-      if (CopyVT != VA.getValVT())
-        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                          // This truncation won't change the value.
-                          DAG.getIntPtrConstant(1));
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                 CopyVT, InFlag).getValue(1);
-      Val = Chain.getValue(0);
-    }
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT()))
+      CopyVT = MVT::f80;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               CopyVT, InFlag).getValue(1);
+    SDValue Val = Chain.getValue(0);
+
+    if (CopyVT != VA.getValVT())
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
+
      InFlag = Chain.getValue(2);
      InVals.push_back(Val);
    }
@@ -2235,8 +2286,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  
    // Allocate shadow area for Win64
    if (IsWin64)
@@ -2280,6 +2330,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
+      else if (RegVT == MVT::v32i1)
+        RC = &X86::VK32RegClass;
+      else if (RegVT == MVT::v64i1)
+        RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");
  
@@ -2399,7 +2453,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          TotalNumXMMRegs = 0;
  
        if (IsWin64) {
-        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+        const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
          // Get to the caller-allocated home save location.  Add 8 to account
          // for the return address.
          int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2598,8 +2652,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  
    // Allocate shadow area for Win64
    if (IsWin64)
@@ -2639,8 +2692,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    // arguments passed in memory when using inalloca.
    if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
      NumBytesToPush = 0;
-    assert(ArgLocs.back().getLocMemOffset() == 0 &&
-           "an inalloca argument must be the only memory argument");
+    if (!ArgLocs.back().isMemLoc())
+      report_fatal_error("cannot use inalloca attribute on a register "
+                         "parameter");
+    if (ArgLocs.back().getLocMemOffset() != 0)
+      report_fatal_error("any parameter with the inalloca attribute must be "
+                         "the only memory argument");
    }
  
    if (!IsSibcall)
@@ -2659,8 +2716,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // Walk the register/memloc assignments, inserting copies/loads.  In the case
    // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      // Skip inalloca arguments, they have already been written.
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2956,7 +3013,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                    RegsToPass[i].second.getValueType()));
  
    // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
    const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3027,7 +3084,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  //  If a tail called function callee has more arguments than the caller the
  //  caller needs to make sure that there is room to move the RETADDR to. This is
  //  achieved by reserving an area the size of the argument delta right after the
-//  original REtADDR, but before the saved framepointer or the spilled registers
+//  original RETADDR, but before the saved framepointer or the spilled registers
  //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
  //  stack layout:
  //    arg1
@@ -3047,9 +3104,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                 SelectionDAG& DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetMachine &TM = MF.getTarget();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
    unsigned StackAlignment = TFI.getStackAlignment();
    uint64_t AlignMask = StackAlignment - 1;
    int64_t Offset = StackSize;
@@ -3162,8 +3219,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  
    // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
    // emit a special epilogue.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    if (RegInfo->needsStackRealignment(MF))
      return false;
  
@@ -3191,8 +3248,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        return false;
  
      SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
  
      CCInfo.AnalyzeCallOperands(Outs, CC_X86);
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3212,12 +3269,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    }
    if (Unused) {
      SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
-                   DAG.getTarget(), RVLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
+                   *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
        CCValAssign &VA = RVLocs[i];
-      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
          return false;
      }
    }
@@ -3226,13 +3283,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    // results are returned in the same way as what the caller expects.
    if (!CCMatch) {
      SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    DAG.getTarget(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
      CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
  
      SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    DAG.getTarget(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
      CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
  
      if (RVLocs1.size() != RVLocs2.size())
@@ -3258,8 +3315,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      // Check if stack adjustment is needed. For now, do not do this if any
      // argument is passed on the stack.
      SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
  
      // Allocate shadow area for Win64
      if (IsCalleeWin64)
@@ -3276,7 +3333,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        MachineFrameInfo *MFI = MF.getFrameInfo();
        const MachineRegisterInfo *MRI = &MF.getRegInfo();
        const X86InstrInfo *TII =
-          static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
+          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
          SDValue Arg = OutVals[i];
@@ -3346,6 +3403,7 @@ static bool MayFoldIntoStore(SDValue Op) {
  static bool isTargetShuffle(unsigned Opcode) {
    switch(Opcode) {
    default: return false;
+  case X86ISD::PSHUFB:
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
@@ -3401,6 +3459,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
    switch(Opc) {
    default: llvm_unreachable("Unknown x86 shuffle node");
    case X86ISD::PALIGNR:
+  case X86ISD::VALIGN:
    case X86ISD::SHUFP:
    case X86ISD::VPERM2X128:
      return DAG.getNode(Opc, dl, VT, V1, V2,
@@ -3427,8 +3486,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
  
  SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    int ReturnAddrIndex = FuncInfo->getRAIndex();
  
@@ -3739,16 +3798,12 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
    return true;
  }
  
-/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
-                          const X86Subtarget *Subtarget) {
-  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
-      (VT.is256BitVector() && !Subtarget->hasInt256()))
-    return false;
-
+/// \brief Return true if the mask specifies a shuffle of elements that is
+/// suitable for input to intralane (palignr) or interlane (valign) vector
+/// right-shift.
+static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
    unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
+  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
    unsigned NumLaneElts = NumElts/NumLanes;
  
    // Do not handle 64-bit element shuffles with palignr.
@@ -3812,6 +3867,28 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
    return true;
  }
  
+/// \brief Return true if the node specifies a shuffle of elements that is
+/// suitable for input to PALIGNR.
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+      (VT.is256BitVector() && !Subtarget->hasInt256()))
+    // FIXME: Add AVX512BW.
+    return false;
+
+  return isAlignrMask(Mask, VT, false);
+}
+
+/// \brief Return true if the node specifies a shuffle of elements that is
+/// suitable for input to VALIGN.
+static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
+                          const X86Subtarget *Subtarget) {
+  // FIXME: Add AVX512VL.
+  if (!VT.is512BitVector() || !Subtarget->hasAVX512())
+    return false;
+  return isAlignrMask(Mask, VT, true);
+}
+
  /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
  /// the two vector operands have swapped position.
  static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
@@ -4636,11 +4713,13 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
    return Mask;
  }
  
-/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
-static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+/// \brief Return the appropriate immediate to shuffle the specified
+/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
+/// VALIGN (if Interlane is true) instructions.
+static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
+                                           bool InterLane) {
    MVT VT = SVOp->getSimpleValueType(0);
-  unsigned EltSize = VT.is512BitVector() ? 1 :
+  unsigned EltSize = InterLane ? 1 :
      VT.getVectorElementType().getSizeInBits() >> 3;
  
    unsigned NumElts = VT.getVectorNumElements();
@@ -4661,6 +4740,19 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
    return (Val - i) * EltSize;
  }
  
+/// \brief Return the appropriate immediate to shuffle the specified
+/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  return getShuffleAlignrImmediate(SVOp, false);
+}
+
+/// \brief Return the appropriate immediate to shuffle the specified
+/// VECTOR_SHUFFLE mask with the VALIGN instruction.
+static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
+  return getShuffleAlignrImmediate(SVOp, true);
+}
+
+
  static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
    assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
    if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -4735,28 +4827,6 @@ bool X86::isZeroNode(SDValue Elt) {
    return false;
  }
  
-/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
-/// their permute mask.
-static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
-                                    SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> MaskVec;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = SVOp->getMaskElt(i);
-    if (Idx >= 0) {
-      if (Idx < (int)NumElems)
-        Idx += NumElems;
-      else
-        Idx -= NumElems;
-    }
-    MaskVec.push_back(Idx);
-  }
-  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
-                              SVOp->getOperand(0), &MaskVec[0]);
-}
-
  /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
  /// match movhlps. The lower half elements should come from upper half of
  /// V1 (and in order), and the upper half elements should come from the upper
@@ -4842,19 +4912,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
    return true;
  }
  
-/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same.
-static bool isSplatVector(SDNode *N) {
-  if (N->getOpcode() != ISD::BUILD_VECTOR)
-    return false;
-
-  SDValue SplatValue = N->getOperand(0);
-  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
-    if (N->getOperand(i) != SplatValue)
-      return false;
-  return true;
-}
-
  /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
  /// to an zero vector.
  /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
@@ -5106,30 +5163,38 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
  }
  
  /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
-/// target specific opcode. Returns true if the Mask could be calculated.
-/// Sets IsUnary to true if only uses one source.
+/// target specific opcode. Returns true if the Mask could be calculated. Sets
+/// IsUnary to true if only uses one source. Note that this will set IsUnary for
+/// shuffles which use a single input multiple times, and in those cases it will
+/// adjust the mask to only have indices within that single input.
  static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                   SmallVectorImpl<int> &Mask, bool &IsUnary) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue ImmN;
  
    IsUnary = false;
+  bool IsFakeUnary = false;
    switch(N->getOpcode()) {
    case X86ISD::SHUFP:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKH:
      DecodeUNPCKHMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKL:
      DecodeUNPCKLMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::PALIGNR:
      ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5151,6 +5216,67 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
      DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
      IsUnary = true;
      break;
+  case X86ISD::PSHUFB: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    while (MaskNode->getOpcode() == ISD::BITCAST)
+      MaskNode = MaskNode->getOperand(0);
+
+    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+      // If we have a build-vector, then things are easy.
+      EVT VT = MaskNode.getValueType();
+      assert(VT.isVector() &&
+             "Can't produce a non-vector with a build_vector!");
+      if (!VT.isInteger())
+        return false;
+
+      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+      SmallVector<uint64_t, 32> RawMask;
+      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+        auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+        if (!CN)
+          return false;
+        APInt MaskElement = CN->getAPIntValue();
+
+        // We now have to decode the element which could be any integer size and
+        // extract each byte of it.
+        for (int j = 0; j < NumBytesPerElement; ++j) {
+          // Note that this is x86 and so always little endian: the low byte is
+          // the first byte of the mask.
+          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+          MaskElement = MaskElement.lshr(8);
+        }
+      }
+      DecodePSHUFBMask(RawMask, Mask);
+      break;
+    }
+
+    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+    if (!MaskLoad)
+      return false;
+
+    SDValue Ptr = MaskLoad->getBasePtr();
+    if (Ptr->getOpcode() == X86ISD::Wrapper)
+      Ptr = Ptr->getOperand(0);
+
+    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+      return false;
+
+    if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+      // FIXME: Support AVX-512 here.
+      if (!C->getType()->isVectorTy() ||
+          (C->getNumElements() != 16 && C->getNumElements() != 32))
+        return false;
+
+      assert(C->getType()->isVectorTy() && "Expected a vector constant.");
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
+
+    return false;
+  }
    case X86ISD::VPERMI:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5183,6 +5309,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
    default: llvm_unreachable("unknown target shuffle node");
    }
  
+  // If we have a fake unary shuffle, the shuffle mask is spread across two
+  // inputs that are actually the same node. Re-map the mask to always point
+  // into the first input.
+  if (IsFakeUnary)
+    for (int &M : Mask)
+      if (M >= (int)Mask.size())
+        M -= Mask.size();
+
    return true;
  }
  
@@ -5763,18 +5897,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
        return SDValue();
  
      case ISD::BUILD_VECTOR: {
-      // The BUILD_VECTOR node must be a splat.
-      if (!isSplatVector(Op.getNode()))
+      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+      BitVector UndefElements;
+      SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+      // We need a splat of a single value to use broadcast, and it doesn't
+      // make any sense if the value is only in one element of the vector.
+      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
          return SDValue();
  
-      Ld = Op.getOperand(0);
+      Ld = Splat;
        ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
-                     Ld.getOpcode() == ISD::ConstantFP);
+                       Ld.getOpcode() == ISD::ConstantFP);
  
-      // The suspected load node has several users. Make sure that all
-      // of its users are from the BUILD_VECTOR node.
-      // Constants may have multiple users.
-      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+      // Make sure that all of the users of a non-constant load are from the
+      // BUILD_VECTOR node.
+      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
          return SDValue();
        break;
      }
@@ -6875,3620 +7013,4985 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
    return LowerAVXCONCAT_VECTORS(Op, DAG);
  }
  
-static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
-                        bool hasInt256, unsigned *MaskOut = nullptr) {
-  MVT EltVT = VT.getVectorElementType();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return false;
-
-  if (!hasSSE41 || EltVT == MVT::i8)
-    return false;
-  if (!hasInt256 && VT == MVT::v16i16)
-    return false;
-
-  unsigned MaskValue = 0;
-  unsigned NumElems = VT.getVectorNumElements();
-  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems - 1) / 8 + 1;
-  unsigned NumElemsInLane = NumElems / NumLanes;
-
-  // Blend for v16i16 should be symetric for the both lanes.
-  for (unsigned i = 0; i < NumElemsInLane; ++i) {
-
-    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
-    int EltIdx = MaskVals[i];
  
-    if ((EltIdx < 0 || EltIdx == (int)i) &&
-        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
-      continue;
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
  
-    if (((unsigned)EltIdx == (i + NumElems)) &&
-        (SndLaneEltIdx < 0 ||
-         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
-      MaskValue |= (1 << i);
-    else
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
+/// in-place shuffle are 'no-op's.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (Mask[i] != -1 && Mask[i] != i)
        return false;
-  }
+  return true;
+}
  
-  if (MaskOut)
-    *MaskOut = MaskValue;
+/// \brief Helper function to classify a mask as a single-input mask.
+///
+/// This isn't a generic single-input test because in the vector shuffle
+/// lowering we canonicalize single inputs to be the first input operand. This
+/// means we can more quickly test for a single input by only checking whether
+/// an input from the second operand exists. We also assume that the size of
+/// mask corresponds to the size of the input vectors which isn't true in the
+/// fully general case.
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+  for (int M : Mask)
+    if (M >= (int)Mask.size())
+      return false;
    return true;
  }
  
-// Try to lower a shuffle node into a simple blend instruction.
-// This function assumes isBlendMask returns true for this
-// SuffleVectorSDNode
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
-                                          unsigned MaskValue,
-                                          const X86Subtarget *Subtarget,
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
-                     Subtarget->hasInt256() && "Trying to lower a "
-                                               "VECTOR_SHUFFLE to a Blend but "
-                                               "with the wrong mask"));
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
-  MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
-  }
+  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
  
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+  unsigned Imm = 0;
+  Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+  Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+  Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+  Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+  return DAG.getConstant(Imm, MVT::i8);
  }
  
-/// In vector type \p VT, return true if the element at index \p InputIdx
-/// falls on a different 128-bit lane than \p OutputIdx.
-static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
-                                     unsigned OutputIdx) {
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
-}
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  
-/// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
-/// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
-/// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
-/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
-/// zero.
-static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
-                         SelectionDAG &DAG) {
-  MVT VT = V1.getSimpleValueType();
-  assert(VT.is128BitVector() || VT.is256BitVector());
+  if (isSingleInputShuffleMask(Mask)) {
+    // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction..
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+    return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
  
-  MVT EltVT = VT.getVectorElementType();
-  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
-  unsigned NumElts = VT.getVectorNumElements();
+  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+  return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
+                     DAG.getConstant(SHUFPDMask, MVT::i8));
+}
  
-  SmallVector<SDValue, 32> PshufbMask;
-  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
-    int InputIdx = MaskVals[OutputIdx];
-    unsigned InputByteIdx;
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    // We have to map the mask as it is actually a v4i32 shuffle instruction.
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+    int WidenedMask[4] = {
+        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, MVT::v2i64,
+        DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
+                    getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
+  }
+
+  // We implement this with SHUFPD which is pretty lame because it will likely
+  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+  // However, all the alternatives are still more cycles and newer chips don't
+  // have this problem. It would be really nice if x86 had better shuffles here.
+  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                     DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  SDValue LowV = V1, HighV = V2;
+  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0)
+    // Straight shuffle of a single input vector. We pass the input vector to
+    // both operands to simulate this with a SHUFPS.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  if (NumV2Elements == 1) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+    // Compute the index adjacent to V2Index and in the same half by toggling
+    // the low bit.
+    int V2AdjIndex = V2Index ^ 1;
+
+    if (Mask[V2AdjIndex] == -1) {
+      // Handles all the cases where we have a single V2 element and an undef.
+      // This will only ever happen in the high lanes because we commute the
+      // vector otherwise.
+      if (V2Index < 2)
+        std::swap(LowV, HighV);
+      NewMask[V2Index] -= 4;
+    } else {
+      // Handle the case where the V2 element ends up adjacent to a V1 element.
+      // To make this work, blend them together as the first step.
+      int V1Index = V2AdjIndex;
+      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now proceed to reconstruct the final blend as we have the necessary
+      // high or low half formed.
+      if (V2Index < 2) {
+        LowV = V2;
+        HighV = V1;
+      } else {
+        HighV = V2;
+      }
+      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+    }
+  } else if (NumV2Elements == 2) {
+    if (Mask[0] < 4 && Mask[1] < 4) {
+      // Handle the easy case where we have V1 in the low lanes and V2 in the
+      // high lanes. We never see this reversed because we sort the shuffle.
+      NewMask[2] -= 4;
+      NewMask[3] -= 4;
+    } else {
+      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+      // trying to place elements directly, just blend them and set up the final
+      // shuffle to place them.
  
-    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
-      InputByteIdx = 0x80;
-    else {
-      // Cross lane is not allowed.
-      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
-        return SDValue();
-      InputByteIdx = InputIdx * EltSizeInBytes;
-      // Index is an byte offset within the 128-bit lane.
-      InputByteIdx &= 0xf;
-    }
+      // The first two blend mask elements are for V1, the second two are for
+      // V2.
+      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+                          Mask[2] < 4 ? Mask[2] : Mask[3],
+                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
  
-    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
-      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
-      if (InputByteIdx != 0x80)
-        ++InputByteIdx;
+      // Now we do a normal shuffle of V1 by giving V1 as both operands to
+      // a blend.
+      LowV = HighV = V1;
+      NewMask[0] = Mask[0] < 4 ? 0 : 2;
+      NewMask[1] = Mask[0] < 4 ? 2 : 0;
+      NewMask[2] = Mask[2] < 4 ? 1 : 3;
+      NewMask[3] = Mask[2] < 4 ? 3 : 1;
      }
    }
-
-  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
-  if (ShufVT != VT)
-    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
-  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
  }
  
-// v8i16 shuffles - Prefer shuffles in the following order:
-// 1. [all]   pshuflw, pshufhw, optional move
-// 2. [ssse3] 1 x pshufb
-// 3. [ssse3] 2 x pshufb + 1 x por
-// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-static SDValue
-LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
-                         SelectionDAG &DAG) {
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  SmallVector<int, 8> MaskVals;
-
-  // Determine if more than 1 of the words in each of the low and high quadwords
-  // of the result come from the same quadword of one of the two inputs.  Undef
-  // mask values count as coming from any quadword, for better codegen.
-  //
-  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
-  // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
-  unsigned LoQuad[] = { 0, 0, 0, 0 };
-  unsigned HiQuad[] = { 0, 0, 0, 0 };
-  // Indices of quads used.
-  std::bitset<4> InputQuads;
-  for (unsigned i = 0; i < 8; ++i) {
-    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
-    int EltIdx = SVOp->getMaskElt(i);
-    MaskVals.push_back(EltIdx);
-    if (EltIdx < 0) {
-      ++Quad[0];
-      ++Quad[1];
-      ++Quad[2];
-      ++Quad[3];
-      continue;
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  if (isSingleInputShuffleMask(Mask))
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // We implement this with SHUFPS because it can blend from two vectors.
+  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+  // up the inputs, bypassing domain shift penalties that we would encur if we
+  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+  // relevant.
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
+                     DAG.getVectorShuffle(
+                         MVT::v4f32, DL,
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+static SDValue lowerV8I16SingleInputVectorShuffle(
+    SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+  SmallVector<int, 4> LoInputs;
+  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+               [](int M) { return M >= 0; });
+  std::sort(LoInputs.begin(), LoInputs.end());
+  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+  SmallVector<int, 4> HiInputs;
+  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+               [](int M) { return M >= 0; });
+  std::sort(HiInputs.begin(), HiInputs.end());
+  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+  int NumLToL =
+      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+  int NumHToL = LoInputs.size() - NumLToL;
+  int NumLToH =
+      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+  int NumHToH = HiInputs.size() - NumLToH;
+  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+  // such inputs we can swap two of the dwords across the half mark and end up
+  // with <=2 inputs to each half in each half. Once there, we can fall through
+  // to the generic code below. For example:
+  //
+  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+  //
+  // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
+  // and 2-2.
+  auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
+                          int ThreeInputHalfSum, int OneInputHalfOffset) {
+    // Compute the index of dword with only one word among the three inputs in
+    // a half by taking the sum of the half with three inputs and subtracting
+    // the sum of the actual three inputs. The difference is the remaining
+    // slot.
+    int DWordA = (ThreeInputHalfSum -
+                  std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
+                 2;
+    int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
+
+    int PSHUFDMask[] = {0, 1, 2, 3};
+    PSHUFDMask[DWordA] = DWordB;
+    PSHUFDMask[DWordB] = DWordA;
+    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+    // Adjust the mask to match the new locations of A and B.
+    for (int &M : Mask)
+      if (M != -1 && M/2 == DWordA)
+        M = 2 * DWordB + M % 2;
+      else if (M != -1 && M/2 == DWordB)
+        M = 2 * DWordA + M % 2;
+
+    // Recurse back into this routine to re-compute state now that this isn't
+    // a 3 and 1 problem.
+    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+                                Mask);
+  };
+  if (NumLToL == 3 && NumHToL == 1)
+    return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
+  else if (NumLToL == 1 && NumHToL == 3)
+    return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
+  else if (NumLToH == 1 && NumHToH == 3)
+    return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
+  else if (NumLToH == 3 && NumHToH == 1)
+    return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
+
+  // At this point there are at most two inputs to the low and high halves from
+  // each half. That means the inputs can always be grouped into dwords and
+  // those dwords can then be moved to the correct half with a dword shuffle.
+  // We use at most one low and one high word shuffle to collect these paired
+  // inputs into dwords, and finally a dword shuffle to place them.
+  int PSHUFLMask[4] = {-1, -1, -1, -1};
+  int PSHUFHMask[4] = {-1, -1, -1, -1};
+  int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+  // First fix the masks for all the inputs that are staying in their
+  // original halves. This will then dictate the targets of the cross-half
+  // shuffles.
+  auto fixInPlaceInputs =
+      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+                    MutableArrayRef<int> SourceHalfMask,
+                    MutableArrayRef<int> HalfMask, int HalfOffset) {
+    if (InPlaceInputs.empty())
+      return;
+    if (InPlaceInputs.size() == 1) {
+      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+          InPlaceInputs[0] - HalfOffset;
+      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+      return;
      }
-    ++Quad[EltIdx / 4];
-    InputQuads.set(EltIdx / 4);
-  }
-
-  int BestLoQuad = -1;
-  unsigned MaxQuad = 1;
-  for (unsigned i = 0; i < 4; ++i) {
-    if (LoQuad[i] > MaxQuad) {
-      BestLoQuad = i;
-      MaxQuad = LoQuad[i];
+    if (IncomingInputs.empty()) {
+      // Just fix all of the in place inputs.
+      for (int Input : InPlaceInputs) {
+        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+        PSHUFDMask[Input / 2] = Input / 2;
+      }
+      return;
      }
-  }
  
-  int BestHiQuad = -1;
-  MaxQuad = 1;
-  for (unsigned i = 0; i < 4; ++i) {
-    if (HiQuad[i] > MaxQuad) {
-      BestHiQuad = i;
-      MaxQuad = HiQuad[i];
-    }
-  }
+    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+        InPlaceInputs[0] - HalfOffset;
+    // Put the second input next to the first so that they are packed into
+    // a dword. We find the adjacent index by toggling the low bit.
+    int AdjIndex = InPlaceInputs[0] ^ 1;
+    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+  };
+  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
+
+  // Now gather the cross-half inputs and place them into a free dword of
+  // their target half.
+  // FIXME: This operation could almost certainly be simplified dramatically to
+  // look more like the 3-1 fixing operation.
+  auto moveInputsToRightHalf = [&PSHUFDMask](
+      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+      int DestOffset) {
+    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+      return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+    };
+    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+                                               int Word) {
+      int LowWord = Word & ~1;
+      int HighWord = Word | 1;
+      return isWordClobbered(SourceHalfMask, LowWord) ||
+             isWordClobbered(SourceHalfMask, HighWord);
+    };
  
-  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
-  // of the two input vectors, shuffle them into one input vector so only a
-  // single pshufb instruction is necessary. If there are more than 2 input
-  // quads, disable the next transformation since it does not help SSSE3.
-  bool V1Used = InputQuads[0] || InputQuads[1];
-  bool V2Used = InputQuads[2] || InputQuads[3];
-  if (Subtarget->hasSSSE3()) {
-    if (InputQuads.count() == 2 && V1Used && V2Used) {
-      BestLoQuad = InputQuads[0] ? 0 : 1;
-      BestHiQuad = InputQuads[2] ? 2 : 3;
-    }
-    if (InputQuads.count() > 2) {
-      BestLoQuad = -1;
-      BestHiQuad = -1;
-    }
-  }
+    if (IncomingInputs.empty())
+      return;
  
-  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
-  // the shuffle mask.  If a quad is scored as -1, that means that it contains
-  // words from all 4 input quadwords.
-  SDValue NewV;
-  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
-    int MaskV[] = {
-      BestLoQuad < 0 ? 0 : BestLoQuad,
-      BestHiQuad < 0 ? 1 : BestHiQuad
-    };
-    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
-                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
-                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
-    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
+    if (ExistingInputs.empty()) {
+      // Map any dwords with inputs from them into the right half.
+      for (int Input : IncomingInputs) {
+        // If the source half mask maps over the inputs, turn those into
+        // swaps and use the swapped lane.
+        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+                Input - SourceOffset;
+            // We have to swap the uses in our half mask in one sweep.
+            for (int &M : HalfMask)
+              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
+                M = Input;
+              else if (M == Input)
+                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+          } else {
+            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+                       Input - SourceOffset &&
+                   "Previous placement doesn't match!");
+          }
+          // Note that this correctly re-maps both when we do a swap and when
+          // we observe the other side of the swap above. We rely on that to
+          // avoid swapping the members of the input list directly.
+          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+        }
  
-    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
-    // source words for the shuffle, to aid later transformations.
-    bool AllWordsInNewV = true;
-    bool InOrder[2] = { true, true };
-    for (unsigned i = 0; i != 8; ++i) {
-      int idx = MaskVals[i];
-      if (idx != (int)i)
-        InOrder[i/4] = false;
-      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
-        continue;
-      AllWordsInNewV = false;
-      break;
+        // Map the input's dword into the correct half.
+        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+        else
+          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+                     Input / 2 &&
+                 "Previous placement doesn't match!");
+      }
+
+      // And just directly shift any other-half mask elements to be same-half
+      // as we will have mirrored the dword containing the element into the
+      // same position within that half.
+      for (int &M : HalfMask)
+        if (M >= SourceOffset && M < SourceOffset + 4) {
+          M = M - SourceOffset + DestOffset;
+          assert(M >= 0 && "This should never wrap below zero!");
+        }
+      return;
      }
  
-    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
-    if (AllWordsInNewV) {
-      for (int i = 0; i != 8; ++i) {
-        int idx = MaskVals[i];
-        if (idx < 0)
-          continue;
-        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
-        if ((idx != i) && idx < 4)
-          pshufhw = false;
-        if ((idx != i) && idx > 3)
-          pshuflw = false;
+    // Ensure we have the input in a viable dword of its current half. This
+    // is particularly tricky because the original position may be clobbered
+    // by inputs being moved and *staying* in that half.
+    if (IncomingInputs.size() == 1) {
+      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        int InputFixed = std::find(std::begin(SourceHalfMask),
+                                   std::end(SourceHalfMask), -1) -
+                         std::begin(SourceHalfMask) + SourceOffset;
+        SourceHalfMask[InputFixed - SourceOffset] =
+            IncomingInputs[0] - SourceOffset;
+        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+                     InputFixed);
+        IncomingInputs[0] = InputFixed;
        }
-      V1 = NewV;
-      V2Used = false;
-      BestLoQuad = 0;
-      BestHiQuad = 1;
-    }
+    } else if (IncomingInputs.size() == 2) {
+      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        // We have two non-adjacent or clobbered inputs we need to extract from
+        // the source half. To do this, we need to map them into some adjacent
+        // dword slot in the source mask.
+        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+                              IncomingInputs[1] - SourceOffset};
+
+        // If there is a free slot in the source half mask adjacent to one of
+        // the inputs, place the other input in it. We use (Index XOR 1) to
+        // compute an adjacent index.
+        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+            SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+                   SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+          InputsFixed[0] = InputsFixed[1] ^ 1;
+        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
+                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+          // The two inputs are in the same DWord but it is clobbered and the
+          // adjacent DWord isn't used at all. Move both inputs to the free
+          // slot.
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+        } else {
+          // The only way we hit this point is if there is no clobbering
+          // (because there are no off-half inputs to this half) and there is no
+          // free slot adjacent to one of the inputs. In this case, we have to
+          // swap an input with a non-input.
+          for (int i = 0; i < 4; ++i)
+            assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+                   "We can't handle any clobbers here!");
+          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+                 "Cannot have adjacent inputs here!");
+
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+          // We also have to update the final source mask in this case because
+          // it may need to undo the above swap.
+          for (int &M : FinalSourceHalfMask)
+            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+              M = InputsFixed[1] + SourceOffset;
+            else if (M == InputsFixed[1] + SourceOffset)
+              M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        }
  
-    // If we've eliminated the use of V2, and the new mask is a pshuflw or
-    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
-    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
-      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
-      unsigned TargetMask = 0;
-      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
-                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
-                             getShufflePSHUFLWImmediate(SVOp);
-      V1 = NewV.getOperand(0);
-      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+        // Point everything at the fixed inputs.
+        for (int &M : HalfMask)
+          if (M == IncomingInputs[0])
+            M = InputsFixed[0] + SourceOffset;
+          else if (M == IncomingInputs[1])
+            M = InputsFixed[1] + SourceOffset;
+
+        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
+      }
+    } else {
+      llvm_unreachable("Unhandled input size!");
      }
-  }
  
-  // Promote splats to a larger type which usually leads to more efficient code.
-  // FIXME: Is this true if pshufb is available?
-  if (SVOp->isSplat())
-    return PromoteSplat(SVOp, DAG);
+    // Now hoist the DWord down to the right half.
+    int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
+    assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+    for (int &M : HalfMask)
+      for (int Input : IncomingInputs)
+        if (M == Input)
+          M = FreeDWord * 2 + Input % 2;
+  };
+  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
+                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
+  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
+                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+  // Now enact all the shuffles we've computed to move the inputs into their
+  // target half.
+  if (!isNoopShuffleMask(PSHUFLMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
+  if (!isNoopShuffleMask(PSHUFHMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
+  if (!isNoopShuffleMask(PSHUFDMask))
+    V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                    DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                                DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+  // At this point, each half should contain all its inputs, and we can then
+  // just shuffle them into their final position.
+  assert(std::count_if(LoMask.begin(), LoMask.end(),
+                       [](int M) { return M >= 4; }) == 0 &&
+         "Failed to lift all the high half inputs to the low mask!");
+  assert(std::count_if(HiMask.begin(), HiMask.end(),
+                       [](int M) { return M >= 0 && M < 4; }) == 0 &&
+         "Failed to lift all the low half inputs to the high mask!");
+
+  // Do a half shuffle for the low mask.
+  if (!isNoopShuffleMask(LoMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(LoMask, DAG));
+
+  // Do a half shuffle with the high mask after shifting its values down.
+  for (int &M : HiMask)
+    if (M >= 0)
+      M -= 4;
+  if (!isNoopShuffleMask(HiMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+                    getV4X86ShuffleImm8ForMask(HiMask, DAG));
  
-  // If we have SSSE3, and all words of the result are from 1 input vector,
-  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
-  // is present, fall back to case 4.
-  if (Subtarget->hasSSSE3()) {
-    SmallVector<SDValue,16> pshufbMask;
+  return V;
+}
  
-    // If we have elements from both input vectors, set the high bit of the
-    // shuffle mask element to zero out elements that come from V2 in the V1
-    // mask, and elements that come from V1 in the V2 mask, so that the two
-    // results can be OR'd together.
-    bool TwoInputs = V1Used && V2Used;
-    V1 = getPSHUFB(MaskVals, V1, dl, DAG);
-    if (!TwoInputs)
-      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+/// \brief Detect whether the mask pattern should be lowered through
+/// interleaving.
+///
+/// This essentially tests whether viewing the mask as an interleaving of two
+/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
+/// lowering it through interleaving is a significantly better strategy.
+static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
+  int NumEvenInputs[2] = {0, 0};
+  int NumOddInputs[2] = {0, 0};
+  int NumLoInputs[2] = {0, 0};
+  int NumHiInputs[2] = {0, 0};
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
  
-    // Calculate the shuffle mask for the second input, shuffle it, and
-    // OR it with the first shuffled input.
-    CommuteVectorShuffleMask(MaskVals, 8);
-    V2 = getPSHUFB(MaskVals, V2, dl, DAG);
-    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
-    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-  }
+    int InputIdx = Mask[i] >= Size;
  
-  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
-  // and update MaskVals with new element order.
-  std::bitset<8> InOrder;
-  if (BestLoQuad >= 0) {
-    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
-    for (int i = 0; i != 4; ++i) {
-      int idx = MaskVals[i];
-      if (idx < 0) {
-        InOrder.set(i);
-      } else if ((idx / 4) == BestLoQuad) {
-        MaskV[i] = idx & 3;
-        InOrder.set(i);
-      }
-    }
-    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
-                                &MaskV[0]);
+    if (i < Size / 2)
+      ++NumLoInputs[InputIdx];
+    else
+      ++NumHiInputs[InputIdx];
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
-                                  NewV.getOperand(0),
-                                  getShufflePSHUFLWImmediate(SVOp), DAG);
-    }
+    if ((i % 2) == 0)
+      ++NumEvenInputs[InputIdx];
+    else
+      ++NumOddInputs[InputIdx];
    }
  
-  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
-  // and update MaskVals with the new element order.
-  if (BestHiQuad >= 0) {
-    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
-    for (unsigned i = 4; i != 8; ++i) {
-      int idx = MaskVals[i];
-      if (idx < 0) {
-        InOrder.set(i);
-      } else if ((idx / 4) == BestHiQuad) {
-        MaskV[i] = (idx & 3) + 4;
-        InOrder.set(i);
+  // The minimum number of cross-input results for both the interleaved and
+  // split cases. If interleaving results in fewer cross-input results, return
+  // true.
+  int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
+                                    NumEvenInputs[0] + NumOddInputs[1]);
+  int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
+                              NumLoInputs[0] + NumHiInputs[1]);
+  return InterleavedCrosses < SplitCrosses;
+}
+
+/// \brief Blend two v8i16 vectors using a naive unpack strategy.
+///
+/// This strategy only works when the inputs from each vector fit into a single
+/// half of that vector, and generally there are not so many inputs as to leave
+/// the in-place shuffles required highly constrained (and thus expensive). It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+                                                 SDValue V2,
+                                                 MutableArrayRef<int> Mask,
+                                                 const X86Subtarget *Subtarget,
+                                                 SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+  for (int i = 0; i < 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 4)
+      LoV1Inputs.push_back(i);
+    else if (Mask[i] >= 4 && Mask[i] < 8)
+      HiV1Inputs.push_back(i);
+    else if (Mask[i] >= 8 && Mask[i] < 12)
+      LoV2Inputs.push_back(i);
+    else if (Mask[i] >= 12)
+      HiV2Inputs.push_back(i);
+
+  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+  (void)NumV1Inputs;
+  (void)NumV2Inputs;
+  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+                     HiV1Inputs.size() + HiV2Inputs.size();
+
+  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+                              ArrayRef<int> HiInputs, bool MoveToLo,
+                              int MaskOffset) {
+    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+    if (BadInputs.empty())
+      return V;
+
+    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+    int MoveOffset = MoveToLo ? 0 : 4;
+
+    if (GoodInputs.empty()) {
+      for (int BadInput : BadInputs) {
+        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+      }
+    } else {
+      if (GoodInputs.size() == 2) {
+        // If the low inputs are spread across two dwords, pack them into
+        // a single dword.
+        MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
+      } else {
+        // Otherwise pin the good inputs.
+        for (int GoodInput : GoodInputs)
+          MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
        }
-    }
-    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
-                                &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
-      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
-      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
-                                  NewV.getOperand(0),
-                                  getShufflePSHUFHWImmediate(SVOp), DAG);
+      if (BadInputs.size() == 2) {
+        // If we have two bad inputs then there may be either one or two good
+        // inputs fixed in place. Find a fixed input, and then find the *other*
+        // two adjacent indices by using modular arithmetic.
+        int GoodMaskIdx =
+            std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
+                         [](int M) { return M >= 0; }) -
+            std::begin(MoveMask);
+        int MoveMaskIdx =
+            ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
+        assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
+        assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
+        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+        MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+        Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
+      } else {
+        assert(BadInputs.size() == 1 && "All sizes handled");
+        int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
+                                    std::end(MoveMask), -1) -
+                          std::begin(MoveMask);
+        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+      }
      }
-  }
  
-  // In case BestHi & BestLo were both -1, which means each quadword has a word
-  // from each of the four input quadwords, calculate the InOrder bitvector now
-  // before falling through to the insert/extract cleanup.
-  if (BestLoQuad == -1 && BestHiQuad == -1) {
-    NewV = V1;
-    for (int i = 0; i != 8; ++i)
-      if (MaskVals[i] < 0 || MaskVals[i] == i)
-        InOrder.set(i);
-  }
+    return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+                                MoveMask);
+  };
+  V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
+                        /*MaskOffset*/ 0);
+  V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
+                        /*MaskOffset*/ 8);
  
-  // The other elements are put in the right place using pextrw and pinsrw.
-  for (unsigned i = 0; i != 8; ++i) {
-    if (InOrder[i])
-      continue;
-    int EltIdx = MaskVals[i];
-    if (EltIdx < 0)
-      continue;
-    SDValue ExtOp = (EltIdx < 8) ?
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
-                  DAG.getIntPtrConstant(EltIdx)) :
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
-                  DAG.getIntPtrConstant(EltIdx - 8));
-    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
-                       DAG.getIntPtrConstant(i));
-  }
-  return NewV;
+  // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
+  // cross-half traffic in the final shuffle.
+
+  // Munge the mask to be a single-input mask after the unpack merges the
+  // results.
+  for (int &M : Mask)
+    if (M != -1)
+      M = 2 * (M % 4) + (M / 8);
+
+  return DAG.getVectorShuffle(
+      MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+                                  DL, MVT::v8i16, V1, V2),
+      DAG.getUNDEF(MVT::v8i16), Mask);
  }
  
-/// \brief v16i16 shuffles
+/// \brief Generic lowering of 8-lane i16 shuffles.
  ///
-/// FIXME: We only support generation of a single pshufb currently.  We can
-/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
-/// well (e.g 2 x pshufb + 1 x por).
-static SDValue
-LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+                        OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+  MutableArrayRef<int> Mask(MaskStorage);
  
-  if (V2.getOpcode() != ISD::UNDEF)
-    return SDValue();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  
-  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
-  return getPSHUFB(MaskVals, V1, dl, DAG);
-}
+  auto isV1 = [](int M) { return M >= 0 && M < 8; };
+  auto isV2 = [](int M) { return M >= 8; };
  
-// v16i8 shuffles - Prefer shuffles in the following order:
-// 1. [ssse3] 1 x pshufb
-// 2. [ssse3] 2 x pshufb + 1 x por
-// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
-static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
-                                        const X86Subtarget* Subtarget,
-                                        SelectionDAG &DAG) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  ArrayRef<int> MaskVals = SVOp->getMask();
+  int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
+  int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
  
-  // Promote splats to a larger type which usually leads to more efficient code.
-  // FIXME: Is this true if pshufb is available?
-  if (SVOp->isSplat())
-    return PromoteSplat(SVOp, DAG);
+  if (NumV2Inputs == 0)
+    return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
  
-  // If we have SSSE3, case 1 is generated when all result bytes come from
-  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
-  // present, fall back to case 3.
+  assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
+                            "to be V1-input shuffles.");
  
-  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (Subtarget->hasSSSE3()) {
-    SmallVector<SDValue,16> pshufbMask;
+  if (NumV1Inputs + NumV2Inputs <= 4)
+    return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
  
-    // If all result elements are from one input vector, then only translate
-    // undef mask values to 0x80 (zero out result) in the pshufb mask.
-    //
-    // Otherwise, we have elements from both input vectors, and must zero out
-    // elements that come from V2 in the first mask, and V1 in the second mask
-    // so that we can OR them together.
-    for (unsigned i = 0; i != 16; ++i) {
-      int EltIdx = MaskVals[i];
-      if (EltIdx < 0 || EltIdx >= 16)
-        EltIdx = 0x80;
-      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+  // Check whether an interleaving lowering is likely to be more efficient.
+  // This isn't perfect but it is a strong heuristic that tends to work well on
+  // the kinds of shuffles that show up in practice.
+  //
+  // FIXME: Handle 1x, 2x, and 4x interleaving.
+  if (shouldLowerAsInterleaving(Mask)) {
+    // FIXME: Figure out whether we should pack these into the low or high
+    // halves.
+
+    int EMask[8], OMask[8];
+    for (int i = 0; i < 4; ++i) {
+      EMask[i] = Mask[2*i];
+      OMask[i] = Mask[2*i + 1];
+      EMask[i + 4] = -1;
+      OMask[i + 4] = -1;
      }
-    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, pshufbMask));
  
-    // As PSHUFB will zero elements with negative indices, it's safe to ignore
-    // the 2nd operand if it's undefined or zero.
-    if (V2.getOpcode() == ISD::UNDEF ||
-        ISD::isBuildVectorAllZeros(V2.getNode()))
-      return V1;
+    SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
+    SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
  
-    // Calculate the shuffle mask for the second input, shuffle it, and
-    // OR it with the first shuffled input.
-    pshufbMask.clear();
-    for (unsigned i = 0; i != 16; ++i) {
-      int EltIdx = MaskVals[i];
-      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
-      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
-    }
-    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, pshufbMask));
-    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
    }
  
-  // No SSSE3 - Calculate in place words and then fix all out of place words
-  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
-  // the 16 different words that comprise the two doublequadword input vectors.
-  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
-  SDValue NewV = V1;
-  for (int i = 0; i != 8; ++i) {
-    int Elt0 = MaskVals[i*2];
-    int Elt1 = MaskVals[i*2+1];
+  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  
-    // This word of the result is all undef, skip it.
-    if (Elt0 < 0 && Elt1 < 0)
-      continue;
+  for (int i = 0; i < 4; ++i) {
+    LoBlendMask[i] = Mask[i];
+    HiBlendMask[i] = Mask[i + 4];
+  }
  
-    // This word of the result is already in the correct place, skip it.
-    if ((Elt0 == i*2) && (Elt1 == i*2+1))
-      continue;
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+  LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
+  HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
  
-    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
-    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
-    SDValue InsElt;
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+}
  
-    // If Elt0 and Elt1 are defined, are consecutive, and can be load
-    // using a single extract together, load it and store it.
-    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
-      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
-                           DAG.getIntPtrConstant(Elt1 / 2));
-      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
-                        DAG.getIntPtrConstant(i));
+/// \brief Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
+///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
+///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
+///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
+  // Figure out whether we're looping over two inputs or just one.
+  bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // The modulus for the shuffle vector entries is based on whether this is
+  // a single input or not.
+  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+         "We should only be called with masks with a power-of-2 size!");
+
+  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+  // and 2^3 simultaneously. This is because we may have ambiguity with
+  // partially undef inputs.
+  bool ViableForN[3] = {true, true, true};
+
+  for (int i = 0, e = Mask.size(); i < e; ++i) {
+    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+    // want.
+    if (Mask[i] == -1)
        continue;
-    }
  
-    // If Elt1 is defined, extract it from the appropriate source.  If the
-    // source byte is not also odd, shift the extracted word left 8 bits
-    // otherwise clear the bottom 8 bits if we need to do an or.
-    if (Elt1 >= 0) {
-      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
-                           DAG.getIntPtrConstant(Elt1 / 2));
-      if ((Elt1 & 1) == 0)
-        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
-                             DAG.getConstant(8,
-                                  TLI.getShiftAmountTy(InsElt.getValueType())));
-      else if (Elt0 >= 0)
-        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
-                             DAG.getConstant(0xFF00, MVT::i16));
-    }
-    // If Elt0 is defined, extract it from the appropriate source.  If the
-    // source byte is not also even, shift the extracted word right 8 bits. If
-    // Elt1 was also defined, OR the extracted values together before
-    // inserting them in the result.
-    if (Elt0 >= 0) {
-      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
-                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
-      if ((Elt0 & 1) != 0)
-        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
-                              DAG.getConstant(8,
-                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
-      else if (Elt1 >= 0)
-        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
-                             DAG.getConstant(0x00FF, MVT::i16));
-      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
-                         : InsElt0;
-    }
-    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
-                       DAG.getIntPtrConstant(i));
+    bool IsAnyViable = false;
+    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+      if (ViableForN[j]) {
+        uint64_t N = j + 1;
+
+        // The shuffle mask must be equal to (i * 2^N) % M.
+        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+          IsAnyViable = true;
+        else
+          ViableForN[j] = false;
+      }
+    // Early exit if we exhaust the possible powers of two.
+    if (!IsAnyViable)
+      break;
    }
-  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
+
+  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+    if (ViableForN[j])
+      return j + 1;
+
+  // Return 0 as there is no viable power of two.
+  return 0;
  }
  
-// v32i8 shuffles - Translate to VPSHUFB if possible.
-static
-SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
-                                 const X86Subtarget *Subtarget,
-                                 SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  SDLoc dl(SVOp);
-  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+  int MaskStorage[16] = {
+      OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
+      OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
+      OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
+      OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
+  MutableArrayRef<int> Mask(MaskStorage);
+  MutableArrayRef<int> LoMask = Mask.slice(0, 8);
+  MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != Mask[i + 1])
+          return false;
+      }
+      return true;
+    };
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int I : InPlaceInputs) {
+        PreDupI16Shuffle[I/2] = I/2;
+        LaneMap[I] = I;
+      }
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
  
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
-  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
  
-  // VPSHUFB may be generated if
-  // (1) one of input vector is undefined or zeroinitializer.
-  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
-  // And (2) the mask indexes don't cross the 128-bit lane.
-  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
-      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
-    return SDValue();
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+      }
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != -1)
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+      }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
+  }
  
-  if (V1IsAllZero && !V2IsAllZero) {
-    CommuteVectorShuffleMask(MaskVals, 32);
-    V1 = V2;
+  // Check whether an interleaving lowering is likely to be more efficient.
+  // This isn't perfect but it is a strong heuristic that tends to work well on
+  // the kinds of shuffles that show up in practice.
+  //
+  // FIXME: We need to handle other interleaving widths (i16, i32, ...).
+  if (shouldLowerAsInterleaving(Mask)) {
+    // FIXME: Figure out whether we should pack these into the low or high
+    // halves.
+
+    int EMask[16], OMask[16];
+    for (int i = 0; i < 8; ++i) {
+      EMask[i] = Mask[2*i];
+      OMask[i] = Mask[2*i + 1];
+      EMask[i + 8] = -1;
+      OMask[i + 8] = -1;
+    }
+
+    SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
+    SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
+
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+  }
+
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+  // want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently don't
+  // handle those well here.
+  if (Subtarget->hasSSSE3()) {
+    SDValue V1Mask[16];
+    SDValue V2Mask[16];
+    for (int i = 0; i < 16; ++i)
+      if (Mask[i] == -1) {
+        V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+      } else {
+        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+        V2Mask[i] =
+            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+      }
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+    if (isSingleInputShuffleMask(Mask))
+      return V1; // Single inputs are easy.
+
+    // Otherwise, blend the two.
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
    }
-  return getPSHUFB(MaskVals, V1, dl, DAG);
+
+  // Check whether a compaction lowering can be done. This handles shuffles
+  // which take every Nth element for some even N. See the helper function for
+  // details.
+  //
+  // We special case these as they can be particularly efficiently handled with
+  // the PACKUSB instruction on x86 and they show up in common patterns of
+  // rearranging bytes to truncate wide elements.
+  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+    // NumEvenDrops is the power of two stride of the elements. Another way of
+    // thinking about it is that we need to drop the even elements this many
+    // times to get the original input.
+    bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+    // First we need to zero all the dropped bytes.
+    assert(NumEvenDrops <= 3 &&
+           "No support for dropping even elements more than 3 times.");
+    // We use the mask type to pick which bytes are preserved based on how many
+    // elements are dropped.
+    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+    SDValue ByteClearMask =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
+                    DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
+    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+    if (!IsSingleInput)
+      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+    // Now pack things back together.
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+    for (int i = 1; i < NumEvenDrops; ++i) {
+      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+    }
+
+    return Result;
+  }
+
+  int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+  auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
+                            MutableArrayRef<int> V1HalfBlendMask,
+                            MutableArrayRef<int> V2HalfBlendMask) {
+    for (int i = 0; i < 8; ++i)
+      if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
+        V1HalfBlendMask[i] = HalfMask[i];
+        HalfMask[i] = i;
+      } else if (HalfMask[i] >= 16) {
+        V2HalfBlendMask[i] = HalfMask[i] - 16;
+        HalfMask[i] = i + 8;
+      }
+  };
+  buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
+  buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+
+  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+                             MutableArrayRef<int> HiBlendMask) {
+    SDValue V1, V2;
+    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+    // them out and avoid using UNPCK{L,H} to extract the elements of V as
+    // i16s.
+    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
+        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; })) {
+      // Use a mask to drop the high bytes.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+                       DAG.getConstant(0x00FF, MVT::v8i16));
+
+      // This will be a single vector shuffle instead of a blend so nuke V2.
+      V2 = DAG.getUNDEF(MVT::v8i16);
+
+      // Squash the masks to point directly into V1.
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half into
+      // V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+    return std::make_pair(BlendedLo, BlendedHi);
+  };
+  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
+  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
  }
  
-/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
-/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
-/// done when every pair / quad of shuffle mask elements point to elements in
-/// the right sequence. e.g.
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
-static
-SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
-                                 SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  SDLoc dl(SVOp);
-  unsigned NumElems = VT.getVectorNumElements();
-  MVT NewVT;
-  unsigned Scale;
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
    switch (VT.SimpleTy) {
-  default: llvm_unreachable("Unexpected!");
    case MVT::v2i64:
+    return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
    case MVT::v2f64:
-           return SDValue(SVOp, 0);
-  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
-  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
-  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
-  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
-  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
-  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
-  }
+    return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4i32:
+    return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4f32:
+    return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i16:
+    return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16i8:
+    return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
  
-  SmallVector<int, 8> MaskVec;
-  for (unsigned i = 0; i != NumElems; i += Scale) {
-    int StartIdx = -1;
-    for (unsigned j = 0; j != Scale; ++j) {
-      int EltIdx = SVOp->getMaskElt(i+j);
-      if (EltIdx < 0)
-        continue;
-      if (StartIdx < 0)
-        StartIdx = (EltIdx / Scale);
-      if (EltIdx != (int)(StartIdx*Scale + j))
-        return SDValue();
-    }
-    MaskVec.push_back(StartIdx);
+  default:
+    llvm_unreachable("Unimplemented!");
    }
+}
  
-  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
-  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
-  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+/// \brief Tiny helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+static bool canWidenShuffleElements(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+    if (Mask[i] % 2 != 0 || Mask[i] + 1 != Mask[i+1])
+      return false;
+
+  return true;
  }
  
-/// getVZextMovL - Return a zero-extending vector move low node.
+/// \brief Top-level lowering for x86 vector shuffles.
  ///
-static SDValue getVZextMovL(MVT VT, MVT OpVT,
-                            SDValue SrcOp, SelectionDAG &DAG,
-                            const X86Subtarget *Subtarget, SDLoc dl) {
-  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    LoadSDNode *LD = nullptr;
-    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
-      LD = dyn_cast<LoadSDNode>(SrcOp);
-    if (!LD) {
-      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
-      // instead.
-      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
-      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
-          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
-          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
-        // PR2108
-        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
-        return DAG.getNode(ISD::BITCAST, dl, VT,
-                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   OpVT,
-                                                   SrcOp.getOperand(0)
-                                                          .getOperand(0))));
-      }
-    }
-  }
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                  SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc dl(Op);
  
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
-                                 DAG.getNode(ISD::BITCAST, dl,
-                                             OpVT, SrcOp)));
-}
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
  
-/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
-/// which could not be matched by any known target speficic shuffle
-static SDValue
-LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
  
-  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
-  if (NewOp.getNode())
-    return NewOp;
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef)
+    for (int M : Mask)
+      if (M >= NumElements) {
+        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+        for (int &M : NewMask)
+          if (M >= NumElements)
+            M = -1;
+        return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+      }
  
-  MVT VT = SVOp->getSimpleValueType(0);
+  // For integer vector shuffles, try to collapse them into a shuffle of fewer
+  // lanes but wider integers. We cap this to not form integers larger than i64
+  // but it might be interesting to form i128 integers to handle flipping the
+  // low and high halves of AVX 256-bit vectors.
+  if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+      canWidenShuffleElements(Mask)) {
+    SmallVector<int, 8> NewMask;
+    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+      NewMask.push_back(Mask[i] / 2);
+    MVT NewVT =
+        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+                         VT.getVectorNumElements() / 2);
+    V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+  }
  
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned NumLaneElems = NumElems / 2;
+  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+  for (int M : SVOp->getMask())
+    if (M < 0)
+      ++NumUndefElements;
+    else if (M < NumElements)
+      ++NumV1Elements;
+    else
+      ++NumV2Elements;
  
-  SDLoc dl(SVOp);
-  MVT EltVT = VT.getVectorElementType();
-  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
-  SDValue Output[2];
+  // Commute the shuffle as needed such that more elements come from V1 than
+  // V2. This allows us to match the shuffle pattern strictly on how many
+  // elements come from V1 without handling the symmetric cases.
+  if (NumV2Elements > NumV1Elements)
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
-  SmallVector<int, 16> Mask;
-  for (unsigned l = 0; l < 2; ++l) {
-    // Build a shuffle mask for the output, discovering on the fly which
-    // input vectors to use as shuffle operands (recorded in InputUsed).
-    // If building a suitable shuffle vector proves too hard, then bail
-    // out with UseBuildVector set.
-    bool UseBuildVector = false;
-    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
-    unsigned LaneStart = l * NumLaneElems;
-    for (unsigned i = 0; i != NumLaneElems; ++i) {
-      // The mask element.  This indexes into the input.
-      int Idx = SVOp->getMaskElt(i+LaneStart);
-      if (Idx < 0) {
-        // the mask element does not index into any input vector.
-        Mask.push_back(-1);
-        continue;
-      }
+  // When the number of V1 and V2 elements are the same, try to minimize the
+  // number of uses of V2 in the low half of the vector.
+  if (NumV1Elements == NumV2Elements) {
+    int LowV1Elements = 0, LowV2Elements = 0;
+    for (int M : SVOp->getMask().slice(0, NumElements / 2))
+      if (M >= NumElements)
+        ++LowV2Elements;
+      else if (M >= 0)
+        ++LowV1Elements;
+    if (LowV2Elements > LowV1Elements)
+      return DAG.getCommutedVectorShuffle(*SVOp);
+  }
  
-      // The input vector this mask element indexes into.
-      int Input = Idx / NumLaneElems;
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.getSizeInBits() == 128)
+    return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
  
-      // Turn the index into an offset from the start of the input vector.
-      Idx -= Input * NumLaneElems;
+  llvm_unreachable("Unimplemented!");
+}
  
-      // Find or create a shuffle vector operand to hold this input.
-      unsigned OpNo;
-      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
-        if (InputUsed[OpNo] == Input)
-          // This input vector is already an operand.
-          break;
-        if (InputUsed[OpNo] < 0) {
-          // Create a new operand for this input vector.
-          InputUsed[OpNo] = Input;
-          break;
-        }
-      }
  
-      if (OpNo >= array_lengthof(InputUsed)) {
-        // More than two input vectors used!  Give up on trying to create a
-        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
-        UseBuildVector = true;
-        break;
-      }
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This code is the legacy code handling vector shuffles until the above
+// replaces its functionality and performance.
+//===----------------------------------------------------------------------===//
  
-      // Add the mask index for the new shuffle vector.
-      Mask.push_back(Idx + OpNo * NumLaneElems);
-    }
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+                        bool hasInt256, unsigned *MaskOut = nullptr) {
+  MVT EltVT = VT.getVectorElementType();
  
-    if (UseBuildVector) {
-      SmallVector<SDValue, 16> SVOps;
-      for (unsigned i = 0; i != NumLaneElems; ++i) {
-        // The mask element.  This indexes into the input.
-        int Idx = SVOp->getMaskElt(i+LaneStart);
-        if (Idx < 0) {
-          SVOps.push_back(DAG.getUNDEF(EltVT));
-          continue;
-        }
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return false;
  
-        // The input vector this mask element indexes into.
-        int Input = Idx / NumElems;
+  if (!hasSSE41 || EltVT == MVT::i8)
+    return false;
+  if (!hasInt256 && VT == MVT::v16i16)
+    return false;
  
-        // Turn the index into an offset from the start of the input vector.
-        Idx -= Input * NumElems;
+  unsigned MaskValue = 0;
+  unsigned NumElems = VT.getVectorNumElements();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
  
-        // Extract the vector element by hand.
-        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
-                                    SVOp->getOperand(Input),
-                                    DAG.getIntPtrConstant(Idx)));
-      }
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
  
-      // Construct the output using a BUILD_VECTOR.
-      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
-    } else if (InputUsed[0] < 0) {
-      // No input vectors were used! The result is undefined.
-      Output[l] = DAG.getUNDEF(NVT);
-    } else {
-      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
-                                        (InputUsed[0] % 2) * NumLaneElems,
-                                        DAG, dl);
-      // If only one input was used, use an undefined vector for the other.
-      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
-        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
-                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
-      // At least one input vector was used. Create a new shuffle vector.
-      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
-    }
+    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+    int EltIdx = MaskVals[i];
  
-    Mask.clear();
+    if ((EltIdx < 0 || EltIdx == (int)i) &&
+        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) &&
+        (SndLaneEltIdx < 0 ||
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1 << i);
+    else
+      return false;
    }
  
-  // Concatenate the result back
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+  if (MaskOut)
+    *MaskOut = MaskValue;
+  return true;
  }
  
-/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
-/// 4 elements, and match them with several different shuffle types.
-static SDValue
-LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// SuffleVectorSDNode
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          unsigned MaskValue,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+                     Subtarget->hasInt256() && "Trying to lower a "
+                                               "VECTOR_SHUFFLE to a Blend but "
+                                               "with the wrong mask"));
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    SDLoc dl(SVOp);
-  MVT VT = SVOp->getSimpleValueType(0);
-
-  assert(VT.is128BitVector() && "Unsupported vector size");
-
-  std::pair<int, int> Locs[4];
-  int Mask1[] = { -1, -1, -1, -1 };
-  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
+  unsigned NumElems = VT.getVectorNumElements();
  
-  unsigned NumHi = 0;
-  unsigned NumLo = 0;
-  for (unsigned i = 0; i != 4; ++i) {
-    int Idx = PermMask[i];
-    if (Idx < 0) {
-      Locs[i] = std::make_pair(-1, -1);
-    } else {
-      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
-      if (Idx < 4) {
-        Locs[i] = std::make_pair(0, NumLo);
-        Mask1[NumLo] = Idx;
-        NumLo++;
-      } else {
-        Locs[i] = std::make_pair(1, NumHi);
-        if (2+NumHi < 4)
-          Mask1[2+NumHi] = Idx;
-        NumHi++;
-      }
-    }
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
    }
  
-  if (NumLo <= 2 && NumHi <= 2) {
-    // If no more than two elements come from either vector. This can be
-    // implemented with two shuffles. First shuffle gather the elements.
-    // The second shuffle, which takes the first shuffle as both of its
-    // vector operands, put the elements into the right order.
-    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
  
-    int Mask2[] = { -1, -1, -1, -1 };
+/// In vector type \p VT, return true if the element at index \p InputIdx
+/// falls on a different 128-bit lane than \p OutputIdx.
+static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
+                                     unsigned OutputIdx) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
+}
  
-    for (unsigned i = 0; i != 4; ++i)
-      if (Locs[i].first != -1) {
-        unsigned Idx = (i < 2) ? 0 : 4;
-        Idx += Locs[i].first * 2 + Locs[i].second;
-        Mask2[i] = Idx;
-      }
+/// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
+/// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
+/// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
+/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
+/// zero.
+static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
+                         SelectionDAG &DAG) {
+  MVT VT = V1.getSimpleValueType();
+  assert(VT.is128BitVector() || VT.is256BitVector());
  
-    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
-  }
+  MVT EltVT = VT.getVectorElementType();
+  unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
+  unsigned NumElts = VT.getVectorNumElements();
  
-  if (NumLo == 3 || NumHi == 3) {
-    // Otherwise, we must have three elements from one vector, call it X, and
-    // one element from the other, call it Y.  First, use a shufps to build an
-    // intermediate vector with the one element from Y and the element from X
-    // that will be in the same half in the final destination (the indexes don't
-    // matter). Then, use a shufps to build the final vector, taking the half
-    // containing the element from Y from the intermediate, and the other half
-    // from X.
-    if (NumHi == 3) {
-      // Normalize it so the 3 elements come from V1.
-      CommuteVectorShuffleMask(PermMask, 4);
-      std::swap(V1, V2);
+  SmallVector<SDValue, 32> PshufbMask;
+  for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
+    int InputIdx = MaskVals[OutputIdx];
+    unsigned InputByteIdx;
+
+    if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
+      InputByteIdx = 0x80;
+    else {
+      // Cross lane is not allowed.
+      if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
+        return SDValue();
+      InputByteIdx = InputIdx * EltSizeInBytes;
+      // Index is an byte offset within the 128-bit lane.
+      InputByteIdx &= 0xf;
      }
  
-    // Find the element from V2.
-    unsigned HiIndex;
-    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
-      int Val = PermMask[HiIndex];
-      if (Val < 0)
-        continue;
-      if (Val >= 4)
-        break;
+    for (unsigned j = 0; j < EltSizeInBytes; ++j) {
+      PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
+      if (InputByteIdx != 0x80)
+        ++InputByteIdx;
      }
+  }
  
-    Mask1[0] = PermMask[HiIndex];
-    Mask1[1] = -1;
-    Mask1[2] = PermMask[HiIndex^1];
-    Mask1[3] = -1;
-    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+  MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
+  if (ShufVT != VT)
+    V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
+  return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
+}
  
-    if (HiIndex >= 2) {
-      Mask1[0] = PermMask[0];
-      Mask1[1] = PermMask[1];
-      Mask1[2] = HiIndex & 1 ? 6 : 4;
-      Mask1[3] = HiIndex & 1 ? 4 : 6;
-      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+                         SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  SmallVector<int, 8> MaskVals;
+
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
+  //
+  // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
+  // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
+  unsigned LoQuad[] = { 0, 0, 0, 0 };
+  unsigned HiQuad[] = { 0, 0, 0, 0 };
+  // Indices of quads used.
+  std::bitset<4> InputQuads;
+  for (unsigned i = 0; i < 8; ++i) {
+    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+    int EltIdx = SVOp->getMaskElt(i);
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
+      continue;
      }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
+  }
  
-    Mask1[0] = HiIndex & 1 ? 2 : 0;
-    Mask1[1] = HiIndex & 1 ? 0 : 2;
-    Mask1[2] = PermMask[2];
-    Mask1[3] = PermMask[3];
-    if (Mask1[2] >= 0)
-      Mask1[2] += 4;
-    if (Mask1[3] >= 0)
-      Mask1[3] += 4;
-    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+  int BestLoQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
+    }
    }
  
-  // Break it into (shuffle shuffle_hi, shuffle_lo).
-  int LoMask[] = { -1, -1, -1, -1 };
-  int HiMask[] = { -1, -1, -1, -1 };
+  int BestHiQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
+    }
+  }
  
-  int *MaskPtr = LoMask;
-  unsigned MaskIdx = 0;
-  unsigned LoIdx = 0;
-  unsigned HiIdx = 2;
-  for (unsigned i = 0; i != 4; ++i) {
-    if (i == 2) {
-      MaskPtr = HiMask;
-      MaskIdx = 1;
-      LoIdx = 0;
-      HiIdx = 2;
+  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If there are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (Subtarget->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
      }
-    int Idx = PermMask[i];
-    if (Idx < 0) {
-      Locs[i] = std::make_pair(-1, -1);
-    } else if (Idx < 4) {
-      Locs[i] = std::make_pair(MaskIdx, LoIdx);
-      MaskPtr[LoIdx] = Idx;
-      LoIdx++;
-    } else {
-      Locs[i] = std::make_pair(MaskIdx, HiIdx);
-      MaskPtr[HiIdx] = Idx;
-      HiIdx++;
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
      }
    }
  
-  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
-  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
-  int MaskOps[] = { -1, -1, -1, -1 };
-  for (unsigned i = 0; i != 4; ++i)
-    if (Locs[i].first != -1)
-      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
-  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
-}
-
-static bool MayFoldVectorLoad(SDValue V) {
-  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
-
-  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
-    V = V.getOperand(0);
-  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
-      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
-    // BUILD_VECTOR (load), undef
-    V = V.getOperand(0);
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, that means that it contains
+  // words from all 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    int MaskV[] = {
+      BestLoQuad < 0 ? 0 : BestLoQuad,
+      BestHiQuad < 0 ? 1 : BestHiQuad
+    };
+    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
  
-  return MayFoldLoad(V);
-}
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    bool InOrder[2] = { true, true };
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx != (int)i)
+        InOrder[i/4] = false;
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
+    }
  
-static
-SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
  
-  // Canonizalize to v2f64.
-  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
-                                          V1, DAG));
-}
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
+      unsigned TargetMask = 0;
+      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
+                             getShufflePSHUFLWImmediate(SVOp);
+      V1 = NewV.getOperand(0);
+      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+    }
+  }
  
-static
-SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
-                        bool HasSSE2) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
  
-  assert(VT != MVT::v2i64 && "unsupported shuffle type");
+  // If we have SSSE3, and all words of the result are from 1 input vector,
+  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
+  // is present, fall back to case 4.
+  if (Subtarget->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
  
-  if (HasSSE2 && VT == MVT::v2f64)
-    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
-
-  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
-}
-
-static
-SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-
-  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
-         "unsupported shuffle type");
-
-  if (V2.getOpcode() == ISD::UNDEF)
-    V2 = V1;
-
-  // v4i32 or v4f32
-  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
-}
-
-static
-SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-  unsigned NumElems = VT.getVectorNumElements();
+    // If we have elements from both input vectors, set the high bit of the
+    // shuffle mask element to zero out elements that come from V2 in the V1
+    // mask, and elements that come from V1 in the V2 mask, so that the two
+    // results can be OR'd together.
+    bool TwoInputs = V1Used && V2Used;
+    V1 = getPSHUFB(MaskVals, V1, dl, DAG);
+    if (!TwoInputs)
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  
-  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
-  // operand of these instructions is only memory, so check if there's a
-  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
-  // same masks.
-  bool CanFoldLoad = false;
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    CommuteVectorShuffleMask(MaskVals, 8);
+    V2 = getPSHUFB(MaskVals, V2, dl, DAG);
+    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  }
  
-  // Trivial case, when V2 comes from a load.
-  if (MayFoldVectorLoad(V2))
-    CanFoldLoad = true;
+  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+  // and update MaskVals with new element order.
+  std::bitset<8> InOrder;
+  if (BestLoQuad >= 0) {
+    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
+    for (int i = 0; i != 4; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestLoQuad) {
+        MaskV[i] = idx & 3;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
  
-  // When V1 is a load, it can be folded later into a store in isel, example:
-  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
-  //    turns into:
-  //  (MOVLPSmr addr:$src1, VR128:$src2)
-  // So, recognize this potential and also use MOVLPS or MOVLPD
-  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
-    CanFoldLoad = true;
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFLWImmediate(SVOp), DAG);
+    }
+  }
  
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  if (CanFoldLoad) {
-    if (HasSSE2 && NumElems == 2)
-      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
+  // and update MaskVals with the new element order.
+  if (BestHiQuad >= 0) {
+    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
+    for (unsigned i = 4; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestHiQuad) {
+        MaskV[i] = (idx & 3) + 4;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
  
-    if (NumElems == 4)
-      // If we don't care about the second element, proceed to use movss.
-      if (SVOp->getMaskElt(1) != -1)
-        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFHWImmediate(SVOp), DAG);
+    }
    }
  
-  // movl and movlp will both match v2i64, but v2i64 is never matched by
-  // movl earlier because we make it strict to avoid messing with the movlp load
-  // folding logic (see the code above getMOVLP call). Match it here then,
-  // this is horrible, but will stay like this until we move all shuffle
-  // matching to x86 specific nodes. Note that for the 1st condition all
-  // types are matched with movsd.
-  if (HasSSE2) {
-    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
-    // as to remove this logic from here, as much as possible
-    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
-      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+  // In case BestHi & BestLo were both -1, which means each quadword has a word
+  // from each of the four input quadwords, calculate the InOrder bitvector now
+  // before falling through to the insert/extract cleanup.
+  if (BestLoQuad == -1 && BestHiQuad == -1) {
+    NewV = V1;
+    for (int i = 0; i != 8; ++i)
+      if (MaskVals[i] < 0 || MaskVals[i] == i)
+        InOrder.set(i);
    }
  
-  assert(VT != MVT::v4i32 && "unsupported shuffle type");
-
-  // Invert the operand order and use SHUFPS to match it.
-  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
-                              getShuffleSHUFImmediate(SVOp), DAG);
+  // The other elements are put in the right place using pextrw and pinsrw.
+  for (unsigned i = 0; i != 8; ++i) {
+    if (InOrder[i])
+      continue;
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    SDValue ExtOp = (EltIdx < 8) ?
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx)) :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+                  DAG.getIntPtrConstant(EltIdx - 8));
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+                       DAG.getIntPtrConstant(i));
+  }
+  return NewV;
  }
  
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+/// \brief v16i16 shuffles
+///
+/// FIXME: We only support generation of a single pshufb currently.  We can
+/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
+/// well (e.g 2 x pshufb + 1 x por).
+static SDValue
+LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
  
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
+  if (V2.getOpcode() != ISD::UNDEF)
+    return SDValue();
+
+  SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+  return getPSHUFB(MaskVals, V1, dl, DAG);
  }
  
-// It is only safe to call this function if isINSERTPSMask is true for
-// this shufflevector mask.
-static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
-                           SelectionDAG &DAG) {
-  // Generate an insertps instruction when inserting an f32 from memory onto a
-  // v4f32 or when copying a member from one v4f32 to another.
-  // We also use it for transferring i32 from one register to another,
-  // since it simply copies the same bits.
-  // If we're transferring an i32 from memory to a specific element in a
-  // register, we output a generic DAG that will match the PINSRD
-  // instruction.
-  MVT VT = SVOp->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                        const X86Subtarget* Subtarget,
+                                        SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
-  auto Mask = SVOp->getMask();
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "unsupported vector type for insertps/pinsrd");
+  SDLoc dl(SVOp);
+  ArrayRef<int> MaskVals = SVOp->getMask();
  
-  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
-  auto FromV2Predicate = [](const int &i) { return i >= 4; };
-  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
  
-  SDValue From;
-  SDValue To;
-  unsigned DestIndex;
-  if (FromV1 == 1) {
-    From = V1;
-    To = V2;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
-                Mask.begin();
-  } else {
-    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
-           "More than one element from V1 and from V2, or no elements from one "
-           "of the vectors. This case should not have returned true from "
-           "isINSERTPSMask");
-    From = V2;
-    To = V1;
-    DestIndex =
-        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
-  }
+  // If we have SSSE3, case 1 is generated when all result bytes come from
+  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
+  // present, fall back to case 3.
  
-  unsigned SrcIndex = Mask[DestIndex] % 4;
-  if (MayFoldLoad(From)) {
-    // Trivial case, when From comes from a load and is only used by the
-    // shuffle. Make it use insertps from the vector that we need from that
-    // load.
-    SDValue NewLoad =
-        NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
-    if (!NewLoad.getNode())
-      return SDValue();
+  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+  if (Subtarget->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
  
-    if (EVT == MVT::f32) {
-      // Create this as a scalar to vector to match the instruction pattern.
-      SDValue LoadScalarToVector =
-          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
-      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
-      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
-                         InsertpsMask);
-    } else { // EVT == MVT::i32
-      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
-      // instruction, to match the PINSRD instruction, which loads an i32 to a
-      // certain vector element.
-      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
-                         DAG.getConstant(DestIndex, MVT::i32));
+    // If all result elements are from one input vector, then only translate
+    // undef mask values to 0x80 (zero out result) in the pshufb mask.
+    //
+    // Otherwise, we have elements from both input vectors, and must zero out
+    // elements that come from V2 in the first mask, and V1 in the second mask
+    // so that we can OR them together.
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 0 || EltIdx >= 16)
+        EltIdx = 0x80;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
      }
-  }
-
-  // Vector-element-to-vector
-  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
-  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
-}
-
-// Reduce a vector shuffle to zext.
-static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
-  // PMOVZX is only available from SSE41.
-  if (!Subtarget->hasSSE41())
-    return SDValue();
-
-  MVT VT = Op.getSimpleValueType();
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, pshufbMask));
  
-  // Only AVX2 support 256-bit vector integer extending.
-  if (!Subtarget->hasInt256() && VT.is256BitVector())
-    return SDValue();
+    // As PSHUFB will zero elements with negative indices, it's safe to ignore
+    // the 2nd operand if it's undefined or zero.
+    if (V2.getOpcode() == ISD::UNDEF ||
+        ISD::isBuildVectorAllZeros(V2.getNode()))
+      return V1;
  
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDLoc DL(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  unsigned NumElems = VT.getVectorNumElements();
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, pshufbMask));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
  
-  // Extending is an unary operation and the element type of the source vector
-  // won't be equal to or larger than i64.
-  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
-      VT.getVectorElementType() == MVT::i64)
-    return SDValue();
+  // No SSSE3 - Calculate in place words and then fix all out of place words
+  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+  SDValue NewV = V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
  
-  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
-  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
-  while ((1U << Shift) < NumElems) {
-    if (SVOp->getMaskElt(1U << Shift) == 1)
-      break;
-    Shift += 1;
-    // The maximal ratio is 8, i.e. from i8 to i64.
-    if (Shift > 3)
-      return SDValue();
-  }
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
  
-  // Check the shuffle mask.
-  unsigned Mask = (1U << Shift) - 1;
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int EltIdx = SVOp->getMaskElt(i);
-    if ((i & Mask) != 0 && EltIdx != -1)
-      return SDValue();
-    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
-      return SDValue();
-  }
+    // This word of the result is already in the correct place, skip it.
+    if ((Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
  
-  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
-  MVT NeVT = MVT::getIntegerVT(NBits);
-  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
  
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
-    return SDValue();
+    // If Elt0 and Elt1 are defined, are consecutive, and can be load
+    // using a single extract together, load it and store it.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                        DAG.getIntPtrConstant(i));
+      continue;
+    }
  
-  // Simplify the operand as it's prepared to be fed into shuffle.
-  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
-  if (V1.getOpcode() == ISD::BITCAST &&
-      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0).getOperand(0)
-        .getSimpleValueType().getSizeInBits() == SignificantBits) {
-    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
-    ConstantSDNode *CIdx =
-      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
-    // If it's foldable, i.e. normal load with single use, we will let code
-    // selection to fold it. Otherwise, we will short the conversion sequence.
-    if (CIdx && CIdx->getZExtValue() == 0 &&
-        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      MVT FullVT = V.getSimpleValueType();
-      MVT V1VT = V1.getSimpleValueType();
-      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
-        // The "ext_vec_elt" node is wider than the result node.
-        // In this case we should extract subvector from V.
-        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
-        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
-                                        FullVT.getVectorNumElements()/Ratio);
-        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
-                        DAG.getIntPtrConstant(0));
-      }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits
+    // otherwise clear the bottom 8 bits if we need to do an or.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8,
+                                  TLI.getShiftAmountTy(InsElt.getValueType())));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
      }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits. If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8,
+                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                             DAG.getConstant(0x00FF, MVT::i16));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
+    }
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
    }
-
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
  }
  
-static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
-                                      SelectionDAG &DAG) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-
-  if (isZeroShuffle(SVOp))
-    return getZeroVector(VT, Subtarget, DAG, dl);
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
  
-  // Handle splat operations
-  if (SVOp->isSplat()) {
-    // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
-    if (Broadcast.getNode())
-      return Broadcast;
-  }
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
  
-  // Check integer expanding shuffles.
-  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
-  if (NewOp.getNode())
-    return NewOp;
+  // VPSHUFB may be generated if
+  // (1) one of input vector is undefined or zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // And (2) the mask indexes don't cross the 128-bit lane.
+  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
  
-  // If the shuffle can be profitably rewritten as a narrower shuffle, then
-  // do it!
-  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
-      VT == MVT::v32i8) {
-    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-    if (NewOp.getNode())
-      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
-  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
-    // FIXME: Figure out a cleaner way to do this.
-    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-      if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getSimpleValueType();
-        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
-                               NewVT, true, false))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
-                              dl);
-      }
-    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
-      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
-      if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getSimpleValueType();
-        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
-                              dl);
-      }
-    }
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
    }
-  return SDValue();
+  return getPSHUFB(MaskVals, V1, dl, DAG);
  }
  
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDLoc dl(SVOp);
    unsigned NumElems = VT.getVectorNumElements();
-  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
-  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-  bool V1IsSplat = false;
-  bool V2IsSplat = false;
-  bool HasSSE2 = Subtarget->hasSSE2();
-  bool HasFp256    = Subtarget->hasFp256();
-  bool HasInt256   = Subtarget->hasInt256();
-  MachineFunction &MF = DAG.getMachineFunction();
-  bool OptForSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  MVT NewVT;
+  unsigned Scale;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected!");
+  case MVT::v2i64:
+  case MVT::v2f64:
+           return SDValue(SVOp, 0);
+  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
+  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
+  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
+  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
+  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
+  }
  
-  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NumElems; i += Scale) {
+    int StartIdx = -1;
+    for (unsigned j = 0; j != Scale; ++j) {
+      int EltIdx = SVOp->getMaskElt(i+j);
+      if (EltIdx < 0)
+        continue;
+      if (StartIdx < 0)
+        StartIdx = (EltIdx / Scale);
+      if (EltIdx != (int)(StartIdx*Scale + j))
+        return SDValue();
+    }
+    MaskVec.push_back(StartIdx);
+  }
  
-  if (V1IsUndef && V2IsUndef)
-    return DAG.getUNDEF(VT);
+  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
+  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
  
-  // When we create a shuffle node we put the UNDEF node to second operand,
-  // but in some cases the first operand may be transformed to UNDEF.
-  // In this case we should just commute the node.
-  if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
+                            SDValue SrcOp, SelectionDAG &DAG,
+                            const X86Subtarget *Subtarget, SDLoc dl) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = nullptr;
+    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BITCAST, dl, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   OpVT,
+                                                   SrcOp.getOperand(0)
+                                                          .getOperand(0))));
+      }
+    }
+  }
  
-  // Vector shuffle lowering takes 3 steps:
-  //
-  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
-  //    narrowing and commutation of operands should be handled.
-  // 2) Matching of shuffles with known shuffle masks to x86 target specific
-  //    shuffle nodes.
-  // 3) Rewriting of unmatched masks into new generic shuffle operations,
-  //    so the shuffle can be broken into other shuffles and the legalizer can
-  //    try the lowering again.
-  //
-  // The general idea is that no vector_shuffle operation should be left to
-  // be matched during isel, all of them must be converted to a target specific
-  // node here.
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                 DAG.getNode(ISD::BITCAST, dl,
+                                             OpVT, SrcOp)));
+}
  
-  // Normalize the input vectors. Here splats, zeroed vectors, profitable
-  // narrowing and commutation of operands should be handled. The actual code
-  // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
    if (NewOp.getNode())
      return NewOp;
  
-  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
-
-  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
-  // unpckh_undef). Only use pshufd if speed is more important than size.
-  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
-  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
-      V2IsUndef && MayFoldVectorLoad(V1))
-    return getMOVDDup(Op, dl, V1, DAG);
-
-  if (isMOVHLPS_v_undef_Mask(M, VT))
-    return getMOVHighToLow(Op, dl, DAG);
-
-  // Use to match splats
-  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
-      (VT == MVT::v2f64 || VT == MVT::v2i64))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
-  if (isPSHUFDMask(M, VT)) {
-    // The actual implementation will match the mask in the if above and then
-    // during isel it can match several different instructions, not only pshufd
-    // as its name says, sad but true, emulate the behavior for now...
-    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
-      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+  MVT VT = SVOp->getSimpleValueType(0);
  
-    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLaneElems = NumElems / 2;
  
-    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
-      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
+  SDLoc dl(SVOp);
+  MVT EltVT = VT.getVectorElementType();
+  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+  SDValue Output[2];
  
-    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
-                                  DAG);
+  SmallVector<int, 16> Mask;
+  for (unsigned l = 0; l < 2; ++l) {
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with UseBuildVector set.
+    bool UseBuildVector = false;
+    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+    unsigned LaneStart = l * NumLaneElems;
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      // The mask element.  This indexes into the input.
+      int Idx = SVOp->getMaskElt(i+LaneStart);
+      if (Idx < 0) {
+        // the mask element does not index into any input vector.
+        Mask.push_back(-1);
+        continue;
+      }
  
-    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
-                                TargetMask, DAG);
-  }
+      // The input vector this mask element indexes into.
+      int Input = Idx / NumLaneElems;
  
-  if (isPALIGNRMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
-                                getShufflePALIGNRImmediate(SVOp),
-                                DAG);
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NumLaneElems;
  
-  // Check if this can be converted into a logical shift.
-  bool isLeft = false;
-  unsigned ShAmt = 0;
-  SDValue ShVal;
-  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
-  if (isShift && ShVal.hasOneUse()) {
-    // If the shifted value has multiple uses, it may be cheaper to use
-    // v_set0 + movlhps or movhlps, etc.
-    MVT EltVT = VT.getVectorElementType();
-    ShAmt *= EltVT.getSizeInBits();
-    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
-  }
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input)
+          // This input vector is already an operand.
+          break;
+        if (InputUsed[OpNo] < 0) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
  
-  if (isMOVLMask(M, VT)) {
-    if (ISD::isBuildVectorAllZeros(V1.getNode()))
-      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
-    if (!isMOVLPMask(M, VT)) {
-      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
-        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
  
-      if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+      // Add the mask index for the new shuffle vector.
+      Mask.push_back(Idx + OpNo * NumLaneElems);
      }
-  }
-
-  // FIXME: fold these into legal mask.
-  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
-    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
  
-  if (isMOVHLPSMask(M, VT))
-    return getMOVHighToLow(Op, dl, DAG);
+    if (UseBuildVector) {
+      SmallVector<SDValue, 16> SVOps;
+      for (unsigned i = 0; i != NumLaneElems; ++i) {
+        // The mask element.  This indexes into the input.
+        int Idx = SVOp->getMaskElt(i+LaneStart);
+        if (Idx < 0) {
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
  
-  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+        // The input vector this mask element indexes into.
+        int Input = Idx / NumElems;
  
-  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NumElems;
  
-  if (isMOVLPMask(M, VT))
-    return getMOVLP(Op, dl, DAG, HasSSE2);
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    SVOp->getOperand(Input),
+                                    DAG.getIntPtrConstant(Idx)));
+      }
  
-  if (ShouldXformToMOVHLPS(M, VT) ||
-      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return CommuteVectorShuffle(SVOp, DAG);
+      // Construct the output using a BUILD_VECTOR.
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
+    } else if (InputUsed[0] < 0) {
+      // No input vectors were used! The result is undefined.
+      Output[l] = DAG.getUNDEF(NVT);
+    } else {
+      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+                                        (InputUsed[0] % 2) * NumLaneElems,
+                                        DAG, dl);
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+      // At least one input vector was used. Create a new shuffle vector.
+      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+    }
  
-  if (isShift) {
-    // No better options. Use a vshldq / vsrldq.
-    MVT EltVT = VT.getVectorElementType();
-    ShAmt *= EltVT.getSizeInBits();
-    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+    Mask.clear();
    }
  
-  bool Commuted = false;
-  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
-  // 1,1,1,1 -> v8i16 though.
-  V1IsSplat = isSplatVector(V1.getNode());
-  V2IsSplat = isSplatVector(V2.getNode());
-
-  // Canonicalize the splat or undef, if present, to be on the RHS.
-  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
-    CommuteVectorShuffleMask(M, NumElems);
-    std::swap(V1, V2);
-    std::swap(V1IsSplat, V2IsSplat);
-    Commuted = true;
-  }
+  // Concatenate the result back
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+}
  
-  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
-    // Shuffling low element of v1 into undef, just return v1.
-    if (V2IsUndef)
-      return V1;
-    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
-    // the instruction selector will not match, so get a canonical MOVL with
-    // swapped operands to undo the commute.
-    return getMOVL(DAG, dl, VT, V2, V1);
-  }
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
+static SDValue
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  MVT VT = SVOp->getSimpleValueType(0);
  
-  if (isUNPCKLMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+  assert(VT.is128BitVector() && "Unsupported vector size");
  
-  if (isUNPCKHMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  std::pair<int, int> Locs[4];
+  int Mask1[] = { -1, -1, -1, -1 };
+  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
  
-  if (V2IsSplat) {
-    // Normalize mask so all entries that point to V2 points to its first
-    // element then try to match unpck{h|l} again. If match, return a
-    // new vector_shuffle with the corrected mask.p
-    SmallVector<int, 8> NewMask(M.begin(), M.end());
-    NormalizeMask(NewMask, NumElems);
-    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  unsigned NumHi = 0;
+  unsigned NumLo = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else {
+      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+      if (Idx < 4) {
+        Locs[i] = std::make_pair(0, NumLo);
+        Mask1[NumLo] = Idx;
+        NumLo++;
+      } else {
+        Locs[i] = std::make_pair(1, NumHi);
+        if (2+NumHi < 4)
+          Mask1[2+NumHi] = Idx;
+        NumHi++;
+      }
+    }
    }
  
-  if (Commuted) {
-    // Commute is back and try unpck* again.
-    // FIXME: this seems wrong.
-    CommuteVectorShuffleMask(M, NumElems);
-    std::swap(V1, V2);
-    std::swap(V1IsSplat, V2IsSplat);
-
-    if (isUNPCKLMask(M, VT, HasInt256))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
-    if (isUNPCKHMask(M, VT, HasInt256))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-  }
+  if (NumLo <= 2 && NumHi <= 2) {
+    // If no more than two elements come from either vector. This can be
+    // implemented with two shuffles. First shuffle gather the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, put the elements into the right order.
+    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
  
-  // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return CommuteVectorShuffle(SVOp, DAG);
+    int Mask2[] = { -1, -1, -1, -1 };
  
-  // The checks below are all present in isShuffleMaskLegal, but they are
-  // inlined here right now to enable us to directly emit target specific
-  // nodes, and remove one by one until they don't return Op anymore.
+    for (unsigned i = 0; i != 4; ++i)
+      if (Locs[i].first != -1) {
+        unsigned Idx = (i < 2) ? 0 : 4;
+        Idx += Locs[i].first * 2 + Locs[i].second;
+        Mask2[i] = Idx;
+      }
  
-  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
-      SVOp->getSplatIndex() == 0 && V2IsUndef) {
-    if (VT == MVT::v2f64 || VT == MVT::v2i64)
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
    }
  
-  if (isPSHUFHWMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
-                                getShufflePSHUFHWImmediate(SVOp),
-                                DAG);
-
-  if (isPSHUFLWMask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
-                                getShufflePSHUFLWImmediate(SVOp),
-                                DAG);
+  if (NumLo == 3 || NumHi == 3) {
+    // Otherwise, we must have three elements from one vector, call it X, and
+    // one element from the other, call it Y.  First, use a shufps to build an
+    // intermediate vector with the one element from Y and the element from X
+    // that will be in the same half in the final destination (the indexes don't
+    // matter). Then, use a shufps to build the final vector, taking the half
+    // containing the element from Y from the intermediate, and the other half
+    // from X.
+    if (NumHi == 3) {
+      // Normalize it so the 3 elements come from V1.
+      CommuteVectorShuffleMask(PermMask, 4);
+      std::swap(V1, V2);
+    }
  
-  unsigned MaskValue;
-  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
-                  &MaskValue))
-    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
+    // Find the element from V2.
+    unsigned HiIndex;
+    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+      int Val = PermMask[HiIndex];
+      if (Val < 0)
+        continue;
+      if (Val >= 4)
+        break;
+    }
  
-  if (isSHUFPMask(M, VT))
-    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
-                                getShuffleSHUFImmediate(SVOp), DAG);
+    Mask1[0] = PermMask[HiIndex];
+    Mask1[1] = -1;
+    Mask1[2] = PermMask[HiIndex^1];
+    Mask1[3] = -1;
+    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
  
-  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
-    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+    if (HiIndex >= 2) {
+      Mask1[0] = PermMask[0];
+      Mask1[1] = PermMask[1];
+      Mask1[2] = HiIndex & 1 ? 6 : 4;
+      Mask1[3] = HiIndex & 1 ? 4 : 6;
+      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+    }
  
-  //===--------------------------------------------------------------------===//
-  // Generate target specific nodes for 128 or 256-bit shuffles only
-  // supported in the AVX instruction set.
-  //
+    Mask1[0] = HiIndex & 1 ? 2 : 0;
+    Mask1[1] = HiIndex & 1 ? 0 : 2;
+    Mask1[2] = PermMask[2];
+    Mask1[3] = PermMask[3];
+    if (Mask1[2] >= 0)
+      Mask1[2] += 4;
+    if (Mask1[3] >= 0)
+      Mask1[3] += 4;
+    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+  }
  
-  // Handle VMOVDDUPY permutations
-  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
-    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+  // Break it into (shuffle shuffle_hi, shuffle_lo).
+  int LoMask[] = { -1, -1, -1, -1 };
+  int HiMask[] = { -1, -1, -1, -1 };
  
-  // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT)) {
-    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
-      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
-                                  getShuffleSHUFImmediate(SVOp), DAG);
-    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
-                                getShuffleSHUFImmediate(SVOp), DAG);
+  int *MaskPtr = LoMask;
+  unsigned MaskIdx = 0;
+  unsigned LoIdx = 0;
+  unsigned HiIdx = 2;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (i == 2) {
+      MaskPtr = HiMask;
+      MaskIdx = 1;
+      LoIdx = 0;
+      HiIdx = 2;
+    }
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else if (Idx < 4) {
+      Locs[i] = std::make_pair(MaskIdx, LoIdx);
+      MaskPtr[LoIdx] = Idx;
+      LoIdx++;
+    } else {
+      Locs[i] = std::make_pair(MaskIdx, HiIdx);
+      MaskPtr[HiIdx] = Idx;
+      HiIdx++;
+    }
    }
  
-  unsigned Idx;
-  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
-    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
-                              Idx*(NumElems/2), DAG, dl);
+  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+  int MaskOps[] = { -1, -1, -1, -1 };
+  for (unsigned i = 0; i != 4; ++i)
+    if (Locs[i].first != -1)
+      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
+  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
  
-  // Handle VPERM2F128/VPERM2I128 permutations
-  if (isVPERM2X128Mask(M, VT, HasFp256))
-    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
-                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
+static bool MayFoldVectorLoad(SDValue V) {
+  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
  
-  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
-    return getINSERTPS(SVOp, dl, DAG);
-
-  unsigned Imm8;
-  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
-    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
-
-  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
-      VT.is512BitVector()) {
-    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
-    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
-    SmallVector<SDValue, 16> permclMask;
-    for (unsigned i = 0; i != NumElems; ++i) {
-      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
-    }
-
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
-    if (V2IsUndef)
-      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
-      return DAG.getNode(X86ISD::VPERMV, dl, VT,
-                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
-    return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
-                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Since no target specific shuffle was selected for this generic one,
-  // lower it into other known shuffles. FIXME: this isn't true yet, but
-  // this is the plan.
-  //
+  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+    // BUILD_VECTOR (load), undef
+    V = V.getOperand(0);
  
-  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
-  if (VT == MVT::v8i16) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
+  return MayFoldLoad(V);
+}
  
-  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
+static
+SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
  
-  if (VT == MVT::v16i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
+  // Canonizalize to v2f64.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
+                                          V1, DAG));
+}
  
-  if (VT == MVT::v32i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
-    if (NewOp.getNode())
-      return NewOp;
-  }
+static
+SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
+                        bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
  
-  // Handle all 128-bit wide vectors with 4 elements, and match them with
-  // several different shuffle types.
-  if (NumElems == 4 && VT.is128BitVector())
-    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+  assert(VT != MVT::v2i64 && "unsupported shuffle type");
  
-  // Handle general 256-bit shuffles
-  if (VT.is256BitVector())
-    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
+  if (HasSSE2 && VT == MVT::v2f64)
+    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
  
-  return SDValue();
+  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
  }
  
-// This function assumes its argument is a BUILD_VECTOR of constants or
-// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
-// true.
-static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
-                                    unsigned &MaskValue) {
-  MaskValue = 0;
-  unsigned NumElems = BuildVector->getNumOperands();
-  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems - 1) / 8 + 1;
-  unsigned NumElemsInLane = NumElems / NumLanes;
+static
+SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
  
-  // Blend for v16i16 should be symetric for the both lanes.
-  for (unsigned i = 0; i < NumElemsInLane; ++i) {
-    SDValue EltCond = BuildVector->getOperand(i);
-    SDValue SndLaneEltCond =
-        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+         "unsupported shuffle type");
  
-    int Lane1Cond = -1, Lane2Cond = -1;
-    if (isa<ConstantSDNode>(EltCond))
-      Lane1Cond = !isZero(EltCond);
-    if (isa<ConstantSDNode>(SndLaneEltCond))
-      Lane2Cond = !isZero(SndLaneEltCond);
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
  
-    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
-      // Lane1Cond != 0, means we want the first argument.
-      // Lane1Cond == 0, means we want the second argument.
-      // The encoding of this argument is 0 for the first argument, 1
-      // for the second. Therefore, invert the condition.
-      MaskValue |= !Lane1Cond << i;
-    else if (Lane1Cond < 0)
-      MaskValue |= !Lane2Cond << i;
-    else
-      return false;
-  }
-  return true;
+  // v4i32 or v4f32
+  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
  }
  
-// Try to lower a vselect node into a simple blend instruction.
-static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
-                                   SelectionDAG &DAG) {
-  SDValue Cond = Op.getOperand(0);
-  SDValue LHS = Op.getOperand(1);
-  SDValue RHS = Op.getOperand(2);
-  SDLoc dl(Op);
+static
+SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
    MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
    unsigned NumElems = VT.getVectorNumElements();
  
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
+  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+  // operand of these instructions is only memory, so check if there's a
+  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
+  // same masks.
+  bool CanFoldLoad = false;
  
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
+  // Trivial case, when V2 comes from a load.
+  if (MayFoldVectorLoad(V2))
+    CanFoldLoad = true;
  
-  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
-    return SDValue();
+  // When V1 is a load, it can be folded later into a store in isel, example:
+  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+  //    turns into:
+  //  (MOVLPSmr addr:$src1, VR128:$src2)
+  // So, recognize this potential and also use MOVLPS or MOVLPD
+  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+    CanFoldLoad = true;
  
-  // Check the mask for BLEND and build the value.
-  unsigned MaskValue = 0;
-  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
-    return SDValue();
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  if (CanFoldLoad) {
+    if (HasSSE2 && NumElems == 2)
+      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
  
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
-  MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
-    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+    if (NumElems == 4)
+      // If we don't care about the second element, proceed to use movss.
+      if (SVOp->getMaskElt(1) != -1)
+        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
    }
  
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
-}
-
-SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
-  if (BlendOp.getNode())
-    return BlendOp;
-
-  // Some types for vselect were previously set to Expand, not Legal or
-  // Custom. Return an empty SDValue so we fall-through to Expand, after
-  // the Custom lowering phase.
-  MVT VT = Op.getSimpleValueType();
-  switch (VT.SimpleTy) {
-  default:
-    break;
-  case MVT::v8i16:
-  case MVT::v16i16:
-    return SDValue();
+  // movl and movlp will both match v2i64, but v2i64 is never matched by
+  // movl earlier because we make it strict to avoid messing with the movlp load
+  // folding logic (see the code above getMOVLP call). Match it here then,
+  // this is horrible, but will stay like this until we move all shuffle
+  // matching to x86 specific nodes. Note that for the 1st condition all
+  // types are matched with movsd.
+  if (HasSSE2) {
+    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+    // as to remove this logic from here, as much as possible
+    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
+      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  
-  // We couldn't create a "Blend with immediate" node.
-  // This node should still be legal, but we'll have to emit a blendv*
-  // instruction.
-  return Op;
+  assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+  // Invert the operand order and use SHUFPS to match it.
+  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+                              getShuffleSHUFImmediate(SVOp), DAG);
  }
  
-static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc dl(Op);
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
  
-  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
-    return SDValue();
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
  
-  if (VT.getSizeInBits() == 8) {
-    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
-                                  Op.getOperand(0), Op.getOperand(1));
-    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
-                                  DAG.getValueType(VT));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  }
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
  
-  if (VT.getSizeInBits() == 16) {
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
-    if (Idx == 0)
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32,
-                                                 Op.getOperand(0)),
-                                     Op.getOperand(1)));
-    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
-                                  Op.getOperand(0), Op.getOperand(1));
-    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
-                                  DAG.getValueType(VT));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  }
+  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+  auto FromV2Predicate = [](const int &i) { return i >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
  
-  if (VT == MVT::f32) {
-    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
-    // the result back to FR32 register. It's only worth matching if the
-    // result has a single use which is a store or a bitcast to i32.  And in
-    // the case of a store, it's not worth it if the index is a constant 0,
-    // because a MOVSSmr can be used instead, which is smaller and faster.
-    if (!Op.hasOneUse())
-      return SDValue();
-    SDNode *User = *Op.getNode()->use_begin();
-    if ((User->getOpcode() != ISD::STORE ||
-         (isa<ConstantSDNode>(Op.getOperand(1)) &&
-          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
-        (User->getOpcode() != ISD::BITCAST ||
-         User->getValueType(0) != MVT::i32))
-      return SDValue();
-    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
-                                              Op.getOperand(0)),
-                                              Op.getOperand(1));
-    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
-  }
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
+                Mask.begin();
  
-  if (VT == MVT::i32 || VT == MVT::i64) {
-    // ExtractPS/pextrq works with constant index.
-    if (isa<ConstantSDNode>(Op.getOperand(1)))
-      return Op;
+    // If we have 1 element from each vector, we have to check if we're
+    // changing V1's element's place. If so, we're done. Otherwise, we
+    // should assume we're changing V2's element's place and behave
+    // accordingly.
+    int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
+    assert(DestIndex <= INT32_MAX && "truncated destination index");
+    if (FromV1 == FromV2 &&
+        static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
+      From = V2;
+      To = V1;
+      DestIndex =
+          std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
+    }
+  } else {
+    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+           "More than one element from V1 and from V2, or no elements from one "
+           "of the vectors. This case should not have returned true from "
+           "isINSERTPSMask");
+    From = V2;
+    To = V1;
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
    }
-  return SDValue();
-}
-
-/// Extract one bit from mask vector, like v16i1 or v8i1.
-/// AVX-512 feature.
-SDValue
-X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Vec = Op.getOperand(0);
-  SDLoc dl(Vec);
-  MVT VecVT = Vec.getSimpleValueType();
-  SDValue Idx = Op.getOperand(1);
-  MVT EltVT = Op.getSimpleValueType();
  
-  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+  // Get an index into the source vector in the range [0,4) (the mask is
+  // in the range [0,8) because it can address V1 and V2)
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue NewLoad =
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
  
-  // variable index can't be handled in mask registers,
-  // extend vector to VR512
-  if (!isa<ConstantSDNode>(Idx)) {
-    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
-    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                              ExtVT.getVectorElementType(), Ext, Idx);
-    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
    }
  
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  const TargetRegisterClass* rc = getRegClassFor(VecVT);
-  unsigned MaxSift = rc->getSize()*8 - 1;
-  Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
-                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
-  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
-                    DAG.getConstant(MaxSift, MVT::i8));
-  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
-                       DAG.getIntPtrConstant(0));
+  // Vector-element-to-vector
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
  }
  
-SDValue
-X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue Vec = Op.getOperand(0);
-  MVT VecVT = Vec.getSimpleValueType();
-  SDValue Idx = Op.getOperand(1);
+// Reduce a vector shuffle to zext.
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+  // PMOVZX is only available from SSE41.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
  
-  if (Op.getSimpleValueType() == MVT::i1)
-    return ExtractBitFromMaskVector(Op, DAG);
+  MVT VT = Op.getSimpleValueType();
  
-  if (!isa<ConstantSDNode>(Idx)) {
-    if (VecVT.is512BitVector() ||
-        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
-         VecVT.getVectorElementType().getSizeInBits() == 32)) {
+  // Only AVX2 support 256-bit vector integer extending.
+  if (!Subtarget->hasInt256() && VT.is256BitVector())
+    return SDValue();
  
-      MVT MaskEltVT =
-        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
-      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
-                                    MaskEltVT.getSizeInBits());
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDLoc DL(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
  
-      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
-      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                getZeroVector(MaskVT, Subtarget, DAG, dl),
-                                Idx, DAG.getConstant(0, getPointerTy()));
-      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
-                        Perm, DAG.getConstant(0, getPointerTy()));
-    }
+  // Extending is an unary operation and the element type of the source vector
+  // won't be equal to or larger than i64.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
      return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1U << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1U << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
    }
  
-  // If this is a 256-bit vector result, first extract the 128-bit vector and
-  // then extract the element from the 128-bit vector.
-  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+      return SDValue();
+  }
  
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-    // Get the 128-bit vector.
-    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
-    MVT EltVT = VecVT.getVectorElementType();
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  MVT NeVT = MVT::getIntegerVT(NBits);
+  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
  
-    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
+    return SDValue();
  
-    //if (IdxVal >= NumElems/2)
-    //  IdxVal -= NumElems/2;
-    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
-                       DAG.getConstant(IdxVal, MVT::i32));
+  // Simplify the operand as it's prepared to be fed into shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0).getOperand(0)
+        .getSimpleValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    ConstantSDNode *CIdx =
+      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+    // If it's foldable, i.e. normal load with single use, we will let code
+    // selection to fold it. Otherwise, we will short the conversion sequence.
+    if (CIdx && CIdx->getZExtValue() == 0 &&
+        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+      MVT FullVT = V.getSimpleValueType();
+      MVT V1VT = V1.getSimpleValueType();
+      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
+        // The "ext_vec_elt" node is wider than the result node.
+        // In this case we should extract subvector from V.
+        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
+        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
+                                        FullVT.getVectorNumElements()/Ratio);
+        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+                        DAG.getIntPtrConstant(0));
+      }
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
+    }
    }
  
-  assert(VecVT.is128BitVector() && "Unexpected vector length");
-
-  if (Subtarget->hasSSE41()) {
-    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
-    if (Res.getNode())
-      return Res;
-  }
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
  
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    MVT VT = Op.getSimpleValueType();
-  // TODO: handle v16i8.
-  if (VT.getSizeInBits() == 16) {
-    SDValue Vec = Op.getOperand(0);
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32, Vec),
-                                     Op.getOperand(1)));
-    // Transform it so it match pextrw which produces a 32-bit result.
-    MVT EltVT = MVT::i32;
-    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
-                                  Op.getOperand(0), Op.getOperand(1));
-    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
-                                  DAG.getValueType(VT));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  }
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
  
-  if (VT.getSizeInBits() == 32) {
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
-      return Op;
+  if (isZeroShuffle(SVOp))
+    return getZeroVector(VT, Subtarget, DAG, dl);
  
-    // SHUFPS the element to the lowest double word, then movss.
-    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
-    MVT VVT = Op.getOperand(0).getSimpleValueType();
-    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
-                                       DAG.getUNDEF(VVT), Mask);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
-                       DAG.getIntPtrConstant(0));
+  // Handle splat operations
+  if (SVOp->isSplat()) {
+    // Use vbroadcast whenever the splat comes from a foldable load
+    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
+    if (Broadcast.getNode())
+      return Broadcast;
    }
  
-  if (VT.getSizeInBits() == 64) {
-    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
-    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
-    //        to match extract_elt for f64.
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-    if (Idx == 0)
-      return Op;
+  // Check integer expanding shuffles.
+  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
+  if (NewOp.getNode())
+    return NewOp;
  
-    // UNPCKHPD the element to the lowest double word, then movsd.
-    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
-    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
-    int Mask[2] = { 1, -1 };
-    MVT VVT = Op.getOperand(0).getSimpleValueType();
-    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
-                                       DAG.getUNDEF(VVT), Mask);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
-                       DAG.getIntPtrConstant(0));
+  // If the shuffle can be profitably rewritten as a narrower shuffle, then
+  // do it!
+  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+      VT == MVT::v32i8) {
+    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+    if (NewOp.getNode())
+      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
+  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
+    // FIXME: Figure out a cleaner way to do this.
+    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getSimpleValueType();
+        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
+                               NewVT, true, false))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+                              dl);
+      }
+    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getSimpleValueType();
+        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+                              dl);
+      }
+    }
    }
-
    return SDValue();
  }
  
-static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
    MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
    SDLoc dl(Op);
+  unsigned NumElems = VT.getVectorNumElements();
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+  bool HasSSE2 = Subtarget->hasSSE2();
+  bool HasFp256    = Subtarget->hasFp256();
+  bool HasInt256   = Subtarget->hasInt256();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
  
-  SDValue N0 = Op.getOperand(0);
-  SDValue N1 = Op.getOperand(1);
-  SDValue N2 = Op.getOperand(2);
+  // Check if we should use the experimental vector shuffle lowering. If so,
+  // delegate completely to that code path.
+  if (ExperimentalVectorShuffleLowering)
+    return lowerVectorShuffle(Op, Subtarget, DAG);
  
-  if (!VT.is128BitVector())
-    return SDValue();
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
  
-  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
-      isa<ConstantSDNode>(N2)) {
-    unsigned Opc;
-    if (VT == MVT::v8i16)
-      Opc = X86ISD::PINSRW;
-    else if (VT == MVT::v16i8)
-      Opc = X86ISD::PINSRB;
-    else
-      Opc = X86ISD::PINSRB;
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
  
-    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
-    // argument.
-    if (N1.getValueType() != MVT::i32)
-      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
-    if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
-    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
-  }
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
-  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
-    // Bits [7:6] of the constant are the source select.  This will always be
-    //  zero here.  The DAG Combiner may combine an extract_elt index into these
-    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
-    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
-    // Bits [5:4] of the constant are the destination select.  This is the
-    //  value of the incoming immediate.
-    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
-    //   combine either bitwise AND or insert of float 0.0 to set these bits.
-    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
-    // Create this as a scalar to vector..
-    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
-  }
+  // Vector shuffle lowering takes 3 steps:
+  //
+  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
+  //    narrowing and commutation of operands should be handled.
+  // 2) Matching of shuffles with known shuffle masks to x86 target specific
+  //    shuffle nodes.
+  // 3) Rewriting of unmatched masks into new generic shuffle operations,
+  //    so the shuffle can be broken into other shuffles and the legalizer can
+  //    try the lowering again.
+  //
+  // The general idea is that no vector_shuffle operation should be left to
+  // be matched during isel, all of them must be converted to a target specific
+  // node here.
  
-  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
-    // PINSR* works with constant index.
-    return Op;
-  }
-  return SDValue();
-}
+  // Normalize the input vectors. Here splats, zeroed vectors, profitable
+  // narrowing and commutation of operands should be handled. The actual code
+  // doesn't include all of those, work in progress...
+  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
+  if (NewOp.getNode())
+    return NewOp;
  
-/// Insert one bit to mask vector, like v16i1 or v8i1.
-/// AVX-512 feature.
-SDValue 
-X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue Vec = Op.getOperand(0);
-  SDValue Elt = Op.getOperand(1);
-  SDValue Idx = Op.getOperand(2);
-  MVT VecVT = Vec.getSimpleValueType();
+  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
  
-  if (!isa<ConstantSDNode>(Idx)) {
-    // Non constant index. Extend source and destination,
-    // insert element and then truncate the result.
-    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
-    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
-      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
-      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
-    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
-  }
+  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
+  // unpckh_undef). Only use pshufd if speed is more important than size.
+  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
-  if (Vec.getOpcode() == ISD::UNDEF)
-    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
-                       DAG.getConstant(IdxVal, MVT::i8));
-  const TargetRegisterClass* rc = getRegClassFor(VecVT);
-  unsigned MaxSift = rc->getSize()*8 - 1;
-  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
-                    DAG.getConstant(MaxSift, MVT::i8));
-  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
-                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
-  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
-}
-SDValue
-X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-  
-  if (EltVT == MVT::i1)
-    return InsertBitToMaskVector(Op, DAG);
+  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
+      V2IsUndef && MayFoldVectorLoad(V1))
+    return getMOVDDup(Op, dl, V1, DAG);
  
-  SDLoc dl(Op);
-  SDValue N0 = Op.getOperand(0);
-  SDValue N1 = Op.getOperand(1);
-  SDValue N2 = Op.getOperand(2);
+  if (isMOVHLPS_v_undef_Mask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
  
-  // If this is a 256-bit vector result, first extract the 128-bit vector,
-  // insert the element into the extracted half and then place it back.
-  if (VT.is256BitVector() || VT.is512BitVector()) {
-    if (!isa<ConstantSDNode>(N2))
-      return SDValue();
+  // Use to match splats
+  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
+      (VT == MVT::v2f64 || VT == MVT::v2i64))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
-    // Get the desired 128-bit vector half.
-    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
-    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+  if (isPSHUFDMask(M, VT)) {
+    // The actual implementation will match the mask in the if above and then
+    // during isel it can match several different instructions, not only pshufd
+    // as its name says, sad but true, emulate the behavior for now...
+    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
  
-    // Insert the element into the desired half.
-    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
-    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
  
-    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
-                    DAG.getConstant(IdxIn128, MVT::i32));
+    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
  
-    // Insert the changed part back to the 256-bit vector
-    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+                                  DAG);
+
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
+                                TargetMask, DAG);
    }
  
-  if (Subtarget->hasSSE41())
-    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+  if (isPALIGNRMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+                                getShufflePALIGNRImmediate(SVOp),
+                                DAG);
  
-  if (EltVT == MVT::i8)
-    return SDValue();
+  if (isVALIGNMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
+                                getShuffleVALIGNImmediate(SVOp),
+                                DAG);
  
-  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
-    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
-    // as its second argument.
-    if (N1.getValueType() != MVT::i32)
-      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
-    if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
-    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDValue ShVal;
+  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use
+    // v_set0 + movlhps or movhlps, etc.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
    }
-  return SDValue();
-}
-
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
-  SDLoc dl(Op);
-  MVT OpVT = Op.getSimpleValueType();
-
-  // If this is a 256-bit vector result, first insert into a 128-bit
-  // vector and then insert into the 256-bit vector.
-  if (!OpVT.is128BitVector()) {
-    // Insert into a 128-bit vector.
-    unsigned SizeFactor = OpVT.getSizeInBits()/128;
-    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
-                                 OpVT.getVectorNumElements() / SizeFactor);
  
-    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+  if (isMOVLMask(M, VT)) {
+    if (ISD::isBuildVectorAllZeros(V1.getNode()))
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+    if (!isMOVLPMask(M, VT)) {
+      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
  
-    // Insert the 128-bit vector.
-    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+      if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+    }
    }
  
-  if (OpVT == MVT::v1i64 &&
-      Op.getOperand(0).getValueType() == MVT::i64)
-    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+  // FIXME: fold these into legal mask.
+  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
  
-  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
-  assert(OpVT.is128BitVector() && "Expected an SSE type!");
-  return DAG.getNode(ISD::BITCAST, dl, OpVT,
-                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
-}
+  if (isMOVHLPSMask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
  
-// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
-// a simple subregister reference or explicit instructions to grab
-// upper bits of a vector.
-static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
-                                      SelectionDAG &DAG) {
-  SDLoc dl(Op);
-  SDValue In =  Op.getOperand(0);
-  SDValue Idx = Op.getOperand(1);
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  MVT ResVT   = Op.getSimpleValueType();
-  MVT InVT    = In.getSimpleValueType();
+  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
  
-  if (Subtarget->hasFp256()) {
-    if (ResVT.is128BitVector() &&
-        (InVT.is256BitVector() || InVT.is512BitVector()) &&
-        isa<ConstantSDNode>(Idx)) {
-      return Extract128BitVector(In, IdxVal, DAG, dl);
-    }
-    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
-        isa<ConstantSDNode>(Idx)) {
-      return Extract256BitVector(In, IdxVal, DAG, dl);
-    }
-  }
-  return SDValue();
-}
+  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
  
-// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
-// simple superregister reference or explicit instructions to insert
-// the upper bits of a vector.
-static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
-                                     SelectionDAG &DAG) {
-  if (Subtarget->hasFp256()) {
-    SDLoc dl(Op.getNode());
-    SDValue Vec = Op.getNode()->getOperand(0);
-    SDValue SubVec = Op.getNode()->getOperand(1);
-    SDValue Idx = Op.getNode()->getOperand(2);
+  if (isMOVLPMask(M, VT))
+    return getMOVLP(Op, dl, DAG, HasSSE2);
  
-    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
-         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
-        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
-        isa<ConstantSDNode>(Idx)) {
-      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-    }
+  if (ShouldXformToMOVHLPS(M, VT) ||
+      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
-    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
-        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
-        isa<ConstantSDNode>(Idx)) {
-      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-    }
+  if (isShift) {
+    // No better options. Use a vshldq / vsrldq.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
    }
-  return SDValue();
-}
-
-// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
-// one of the above mentioned nodes. It has to be wrapped because otherwise
-// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
-// be used to form addressing mode. These wrapped nodes will be selected
-// into MOV32ri.
-SDValue
-X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
-  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  
-  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
-  // global base reg.
-  unsigned char OpFlag = 0;
-  unsigned WrapperKind = X86ISD::Wrapper;
-  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
+  BitVector UndefElements;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V1IsSplat = true;
+  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
+    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+      V2IsSplat = true;
  
-  if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
-    WrapperKind = X86ISD::WrapperRIP;
-  else if (Subtarget->isPICStyleGOT())
-    OpFlag = X86II::MO_GOTOFF;
-  else if (Subtarget->isPICStyleStubPIC())
-    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  // Canonicalize the splat or undef, if present, to be on the RHS.
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = true;
+  }
  
-  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
-                                             CP->getAlignment(),
-                                             CP->getOffset(), OpFlag);
-  SDLoc DL(CP);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
-  // With PIC, the address is actually $g + Offset.
-  if (OpFlag) {
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
+  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
+    // Shuffling low element of v1 into undef, just return v1.
+    if (V2IsUndef)
+      return V1;
+    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+    // the instruction selector will not match, so get a canonical MOVL with
+    // swapped operands to undo the commute.
+    return getMOVL(DAG, dl, VT, V2, V1);
    }
  
-  return Result;
-}
+  if (isUNPCKLMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  if (isUNPCKHMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  
-  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
-  // global base reg.
-  unsigned char OpFlag = 0;
-  unsigned WrapperKind = X86ISD::Wrapper;
-  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  if (V2IsSplat) {
+    // Normalize mask so all entries that point to V2 points to its first
+    // element then try to match unpck{h|l} again. If match, return a
+    // new vector_shuffle with the corrected mask.p
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
  
-  if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
-    WrapperKind = X86ISD::WrapperRIP;
-  else if (Subtarget->isPICStyleGOT())
-    OpFlag = X86II::MO_GOTOFF;
-  else if (Subtarget->isPICStyleStubPIC())
-    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  if (Commuted) {
+    // Commute is back and try unpck* again.
+    // FIXME: this seems wrong.
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
  
-  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
-                                          OpFlag);
-  SDLoc DL(JT);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+    if (isUNPCKLMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-  // With PIC, the address is actually $g + Offset.
-  if (OpFlag)
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
+    if (isUNPCKHMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
  
-  return Result;
-}
+  // Normalize the node to match x86 shuffle ops if needed
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
-SDValue
-X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
-  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  // The checks below are all present in isShuffleMaskLegal, but they are
+  // inlined here right now to enable us to directly emit target specific
+  // nodes, and remove one by one until they don't return Op anymore.
  
-  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
-  // global base reg.
-  unsigned char OpFlag = 0;
-  unsigned WrapperKind = X86ISD::Wrapper;
-  CodeModel::Model M = DAG.getTarget().getCodeModel();
-
-  if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel)) {
-    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
-      OpFlag = X86II::MO_GOTPCREL;
-    WrapperKind = X86ISD::WrapperRIP;
-  } else if (Subtarget->isPICStyleGOT()) {
-    OpFlag = X86II::MO_GOT;
-  } else if (Subtarget->isPICStyleStubPIC()) {
-    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
-  } else if (Subtarget->isPICStyleStubNoDynamic()) {
-    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
+      SVOp->getSplatIndex() == 0 && V2IsUndef) {
+    if (VT == MVT::v2f64 || VT == MVT::v2i64)
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
    }
  
-  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+  if (isPSHUFHWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
+                                getShufflePSHUFHWImmediate(SVOp),
+                                DAG);
  
-  SDLoc DL(Op);
-  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  if (isPSHUFLWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
+                                getShufflePSHUFLWImmediate(SVOp),
+                                DAG);
  
-  // With PIC, the address is actually $g + Offset.
-  if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
-      !Subtarget->is64Bit()) {
-    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     SDLoc(), getPointerTy()),
-                         Result);
-  }
+  unsigned MaskValue;
+  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
+                  &MaskValue))
+    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
  
-  // For symbols that require a load from a stub to get the address, emit the
-  // load.
-  if (isGlobalStubReference(OpFlag))
-    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
-                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  if (isSHUFPMask(M, VT))
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
+                                getShuffleSHUFImmediate(SVOp), DAG);
  
-  return Result;
-}
+  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
-SDValue
-X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
-  // Create the TargetBlockAddressAddress node.
-  unsigned char OpFlags =
-    Subtarget->ClassifyBlockAddressReference();
-  CodeModel::Model M = DAG.getTarget().getCodeModel();
-  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
-  SDLoc dl(Op);
-  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
-                                             OpFlags);
+  //===--------------------------------------------------------------------===//
+  // Generate target specific nodes for 128 or 256-bit shuffles only
+  // supported in the AVX instruction set.
+  //
  
-  if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
-    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
-  else
-    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+  // Handle VMOVDDUPY permutations
+  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
  
-  // With PIC, the address is actually $g + Offset.
-  if (isGlobalRelativeToPICBase(OpFlags)) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
-                         Result);
+  // Handle VPERMILPS/D* permutations
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+                                  getShuffleSHUFImmediate(SVOp), DAG);
+    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+                                getShuffleSHUFImmediate(SVOp), DAG);
    }
  
-  return Result;
-}
-
-SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
-                                      int64_t Offset, SelectionDAG &DAG) const {
-  // Create the TargetGlobalAddress node, folding in the constant
-  // offset if it is legal.
-  unsigned char OpFlags =
-      Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
-  CodeModel::Model M = DAG.getTarget().getCodeModel();
-  SDValue Result;
-  if (OpFlags == X86II::MO_NO_FLAG &&
-      X86::isOffsetSuitableForCodeModel(Offset, M)) {
-    // A direct static reference to a global.
-    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
-    Offset = 0;
-  } else {
-    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
-  }
+  unsigned Idx;
+  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+                              Idx*(NumElems/2), DAG, dl);
  
-  if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
-    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
-  else
-    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+  // Handle VPERM2F128/VPERM2I128 permutations
+  if (isVPERM2X128Mask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
+                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
  
-  // With PIC, the address is actually $g + Offset.
-  if (isGlobalRelativeToPICBase(OpFlags)) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
-                         Result);
-  }
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
  
-  // For globals that require a load from a stub to get the address, emit the
-  // load.
-  if (isGlobalStubReference(OpFlags))
-    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
-                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  unsigned Imm8;
+  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
  
-  // If there was a non-zero offset that we didn't fold, create an explicit
-  // addition for it.
-  if (Offset != 0)
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
-                         DAG.getConstant(Offset, getPointerTy()));
+  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+      VT.is512BitVector()) {
+    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+    SmallVector<SDValue, 16> permclMask;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
+    }
  
-  return Result;
-}
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
+    if (V2IsUndef)
+      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+      return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+    return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
+  }
  
-SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
-  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
-}
+  //===--------------------------------------------------------------------===//
+  // Since no target specific shuffle was selected for this generic one,
+  // lower it into other known shuffles. FIXME: this isn't true yet, but
+  // this is the plan.
+  //
  
-static SDValue
-GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
-           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
-           unsigned char OperandFlags, bool LocalDynamic = false) {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDLoc dl(GA);
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(),
-                                           OperandFlags);
+  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+  if (VT == MVT::v8i16) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
  
-  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
-                                           : X86ISD::TLSADDR;
+  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
  
-  if (InFlag) {
-    SDValue Ops[] = { Chain,  TGA, *InFlag };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
-  } else {
-    SDValue Ops[]  = { Chain, TGA };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  if (VT == MVT::v16i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
    }
  
-  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
-  MFI->setAdjustsStack(true);
+  if (VT == MVT::v32i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
  
-  SDValue Flag = Chain.getValue(1);
-  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
-}
+  // Handle all 128-bit wide vectors with 4 elements, and match them with
+  // several different shuffle types.
+  if (NumElems == 4 && VT.is128BitVector())
+    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
  
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
-static SDValue
-LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
-  SDValue InFlag;
-  SDLoc dl(GA);  // ? function entry point might be better
-  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
-                                   DAG.getNode(X86ISD::GlobalBaseReg,
-                                               SDLoc(), PtrVT), InFlag);
-  InFlag = Chain.getValue(1);
+  // Handle general 256-bit shuffles
+  if (VT.is256BitVector())
+    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
  
-  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+  return SDValue();
  }
  
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
-static SDValue
-LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
-  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
-                    X86::RAX, X86II::MO_TLSGD);
-}
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
  
-static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
-                                           SelectionDAG &DAG,
-                                           const EVT PtrVT,
-                                           bool is64Bit) {
-  SDLoc dl(GA);
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
  
-  // Get the start address of the TLS block for this module.
-  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
-      .getInfo<X86MachineFunctionInfo>();
-  MFI->incNumLocalDynamicTLSAccesses();
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
  
-  SDValue Base;
-  if (is64Bit) {
-    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
-                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
-  } else {
-    SDValue InFlag;
-    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
-        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
-    InFlag = Chain.getValue(1);
-    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
-                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      // Lane1Cond != 0, means we want the first argument.
+      // Lane1Cond == 0, means we want the second argument.
+      // The encoding of this argument is 0 for the first argument, 1
+      // for the second. Therefore, invert the condition.
+      MaskValue |= !Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !Lane2Cond << i;
+    else
+      return false;
    }
+  return true;
+}
  
-  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
-  // of Base.
-
-  // Build x@dtpoff.
-  unsigned char OperandFlags = X86II::MO_DTPOFF;
-  unsigned WrapperKind = X86ISD::Wrapper;
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(), OperandFlags);
-  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
  
-  // Add x@dtpoff with the base.
-  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
-}
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
  
-// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
-static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                   const EVT PtrVT, TLSModel::Model model,
-                                   bool is64Bit, bool isPIC) {
-  SDLoc dl(GA);
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
  
-  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
-  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
-                                                         is64Bit ? 257 : 256));
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
  
-  SDValue ThreadPointer =
-      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
-                  MachinePointerInfo(Ptr), false, false, false, 0);
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
  
-  unsigned char OperandFlags = 0;
-  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
-  // initialexec.
-  unsigned WrapperKind = X86ISD::Wrapper;
-  if (model == TLSModel::LocalExec) {
-    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
-  } else if (model == TLSModel::InitialExec) {
-    if (is64Bit) {
-      OperandFlags = X86II::MO_GOTTPOFF;
-      WrapperKind = X86ISD::WrapperRIP;
-    } else {
-      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
-    }
-  } else {
-    llvm_unreachable("Unexpected model");
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
    }
  
-  // emit "addl x@ntpoff,%eax" (local exec)
-  // or "addl x@indntpoff,%eax" (initial exec)
-  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
-  SDValue TGA =
-      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
-                                 GA->getOffset(), OperandFlags);
-  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
  
-  if (model == TLSModel::InitialExec) {
-    if (isPIC && !is64Bit) {
-      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
-                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
-                           Offset);
-    }
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
  
-    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
-                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall-through to Expand, after
+  // the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
    }
  
-  // The address of the thread local variable is the add of the thread
-  // pointer with the offset of the variable.
-  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
+  return Op;
  }
  
-SDValue
-X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
-
-  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  const GlobalValue *GV = GA->getGlobal();
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
  
-  if (Subtarget->isTargetELF()) {
-    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+    return SDValue();
  
-    switch (model) {
-      case TLSModel::GeneralDynamic:
-        if (Subtarget->is64Bit())
-          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
-        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-      case TLSModel::LocalDynamic:
-        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
-                                           Subtarget->is64Bit());
-      case TLSModel::InitialExec:
-      case TLSModel::LocalExec:
-        return LowerToTLSExecModel(
-            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
-            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
-    }
-    llvm_unreachable("Unknown TLS model.");
+  if (VT.getSizeInBits() == 8) {
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
    }
  
-  if (Subtarget->isTargetDarwin()) {
-    // Darwin only has one model of TLS.  Lower to that.
-    unsigned char OpFlag = 0;
-    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
-                           X86ISD::WrapperRIP : X86ISD::Wrapper;
+  if (VT.getSizeInBits() == 16) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32,
+                                                 Op.getOperand(0)),
+                                     Op.getOperand(1)));
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
  
-    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
-    // global base reg.
-    bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
-                 !Subtarget->is64Bit();
-    if (PIC32)
-      OpFlag = X86II::MO_TLVP_PIC_BASE;
-    else
-      OpFlag = X86II::MO_TLVP;
-    SDLoc DL(Op);
-    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
-                                                GA->getValueType(0),
-                                                GA->getOffset(), OpFlag);
-    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  if (VT == MVT::f32) {
+    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+    // the result back to FR32 register. It's only worth matching if the
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
+    if (!Op.hasOneUse())
+      return SDValue();
+    SDNode *User = *Op.getNode()->use_begin();
+    if ((User->getOpcode() != ISD::STORE ||
+         (isa<ConstantSDNode>(Op.getOperand(1)) &&
+          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+        (User->getOpcode() != ISD::BITCAST ||
+         User->getValueType(0) != MVT::i32))
+      return SDValue();
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
+                                              Op.getOperand(0)),
+                                              Op.getOperand(1));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+  }
  
-    // With PIC32, the address is actually $g + Offset.
-    if (PIC32)
-      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
-                           DAG.getNode(X86ISD::GlobalBaseReg,
-                                       SDLoc(), getPointerTy()),
-                           Offset);
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    // ExtractPS/pextrq works with constant index.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return Op;
+  }
+  return SDValue();
+}
  
-    // Lowering the machine isd will make sure everything is in the right
-    // location.
-    SDValue Chain = DAG.getEntryNode();
-    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-    SDValue Args[] = { Chain, Offset };
-    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  MVT EltVT = Op.getSimpleValueType();
  
-    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
-    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-    MFI->setAdjustsStack(true);
+  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
  
-    // And our return value (tls address) is in the standard call return value
-    // location.
-    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
-    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
-                              Chain.getValue(1));
+  // variable index can't be handled in mask registers,
+  // extend vector to VR512
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext, Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
    }
  
-  if (Subtarget->isTargetKnownWindowsMSVC() ||
-      Subtarget->isTargetWindowsGNU()) {
-    // Just use the implicit TLS architecture
-    // Need to generate someting similar to:
-    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
-    //                                  ; from TEB
-    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
-    //   mov     rcx, qword [rdx+rcx*8]
-    //   mov     eax, .tls$:tlsvar
-    //   [rax+rcx] contains the address
-    // Windows 64bit: gs:0x58
-    // Windows 32bit: fs:__tls_array
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                    DAG.getConstant(MaxSift, MVT::i8));
+  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+                       DAG.getIntPtrConstant(0));
+}
  
-    SDLoc dl(GA);
-    SDValue Chain = DAG.getEntryNode();
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
  
-    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
-    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
-    // use its literal value of 0x2C.
-    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
-                                        ? Type::getInt8PtrTy(*DAG.getContext(),
-                                                             256)
-                                        : Type::getInt32PtrTy(*DAG.getContext(),
-                                                              257));
+  if (Op.getSimpleValueType() == MVT::i1)
+    return ExtractBitFromMaskVector(Op, DAG);
  
-    SDValue TlsArray =
-        Subtarget->is64Bit()
-            ? DAG.getIntPtrConstant(0x58)
-            : (Subtarget->isTargetWindowsGNU()
-                   ? DAG.getIntPtrConstant(0x2C)
-                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+         VecVT.getVectorElementType().getSizeInBits() == 32)) {
  
-    SDValue ThreadPointer =
-        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
-                    MachinePointerInfo(Ptr), false, false, false, 0);
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
  
-    // Load the _tls_index variable
-    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
-    if (Subtarget->is64Bit())
-      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
-                           IDX, MachinePointerInfo(), MVT::i32,
-                           false, false, 0);
-    else
-      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
-                        false, false, false, 0);
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                getZeroVector(MaskVT, Subtarget, DAG, dl),
+                                Idx, DAG.getConstant(0, getPointerTy()));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+                        Perm, DAG.getConstant(0, getPointerTy()));
+    }
+    return SDValue();
+  }
  
-    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
-                                    getPointerTy());
-    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+  // If this is a 256-bit vector result, first extract the 128-bit vector and
+  // then extract the element from the 128-bit vector.
+  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
  
-    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
-    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
-                      false, false, false, 0);
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    // Get the 128-bit vector.
+    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+    MVT EltVT = VecVT.getVectorElementType();
  
-    // Get the offset of start of .tls section
-    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                             GA->getValueType(0),
-                                             GA->getOffset(), X86II::MO_SECREL);
-    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
  
-    // The address of the thread local variable is the add of the thread
-    // pointer with the offset of the variable.
-    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+    //if (IdxVal >= NumElems/2)
+    //  IdxVal -= NumElems/2;
+    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                       DAG.getConstant(IdxVal, MVT::i32));
    }
  
-  llvm_unreachable("TLS not implemented for this target.");
-}
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
  
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
-/// and take a 2 x i32 value to shift plus a shift amount.
-static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
-  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  MVT VT = Op.getSimpleValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt  = Op.getOperand(2);
-  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
-  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
-  // during isel.
-  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
-                                  DAG.getConstant(VTBits - 1, MVT::i8));
-  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
-                                     DAG.getConstant(VTBits - 1, MVT::i8))
-                       : DAG.getConstant(0, VT);
+  if (Subtarget->hasSSE41()) {
+    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
  
-  SDValue Tmp2, Tmp3;
-  if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
-  } else {
-    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
-    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+  MVT VT = Op.getSimpleValueType();
+  // TODO: handle v16i8.
+  if (VT.getSizeInBits() == 16) {
+    SDValue Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
+    // Transform it so it match pextrw which produces a 32-bit result.
+    MVT EltVT = MVT::i32;
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
    }
  
-  // If the shift amount is larger or equal than the width of a part we can't
-  // rely on the results of shld/shrd. Insert a test and select the appropriate
-  // values for large shift amounts.
-  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
-                                DAG.getConstant(VTBits, MVT::i8));
-  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
-                             AndNode, DAG.getConstant(0, MVT::i8));
+  if (VT.getSizeInBits() == 32) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
  
-  SDValue Hi, Lo;
-  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
-  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+    // SHUFPS the element to the lowest double word, then movss.
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
  
-  if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
-  } else {
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+  if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
    }
  
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, dl);
+  return SDValue();
  }
  
-SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc dl(Op);
  
-  if (SrcVT.isVector())
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  if (!VT.is128BitVector())
      return SDValue();
  
-  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
-         "Unknown SINT_TO_FP to lower!");
+  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+      isa<ConstantSDNode>(N2)) {
+    unsigned Opc;
+    if (VT == MVT::v8i16)
+      Opc = X86ISD::PINSRW;
+    else if (VT == MVT::v16i8)
+      Opc = X86ISD::PINSRB;
+    else
+      Opc = X86ISD::PINSRB;
  
-  // These are really Legal; return the operand so the caller accepts it as
-  // Legal.
-  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
-    return Op;
-  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
-      Subtarget->is64Bit()) {
-    return Op;
+    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
+    // argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
    }
  
-  SDLoc dl(Op);
-  unsigned Size = SrcVT.getSizeInBits()/8;
-  MachineFunction &MF = DAG.getMachineFunction();
-  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
-  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                               StackSlot,
-                               MachinePointerInfo::getFixedStack(SSFI),
-                               false, false, 0);
-  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
-}
-
-SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
-                                     SDValue StackSlot,
-                                     SelectionDAG &DAG) const {
-  // Build the FILD
-  SDLoc DL(Op);
-  SDVTList Tys;
-  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
-  if (useSSE)
-    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
-  else
-    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
-
-  unsigned ByteSize = SrcVT.getSizeInBits()/8;
-
-  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
-  MachineMemOperand *MMO;
-  if (FI) {
-    int SSFI = FI->getIndex();
-    MMO =
-      DAG.getMachineFunction()
-      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
-  } else {
-    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
-    StackSlot = StackSlot.getOperand(1);
+  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+    // Bits [7:6] of the constant are the source select.  This will always be
+    //  zero here.  The DAG Combiner may combine an extract_elt index into these
+    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
+    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select.  This is the
+    //  value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
+    //   combine either bitwise AND or insert of float 0.0 to set these bits.
+    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+    // Create this as a scalar to vector..
+    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }
-  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
-  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
-                                           X86ISD::FILD, DL,
-                                           Tys, Ops, SrcVT, MMO);
  
-  if (useSSE) {
-    Chain = Result.getValue(1);
-    SDValue InFlag = Result.getValue(2);
+  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
+    // PINSR* works with constant index.
+    return Op;
+  }
+  return SDValue();
+}
  
-    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
-    // shouldn't be necessary except that RFP cannot be live across
-    // multiple blocks. When stackifier is fixed, they can be uncoupled.
-    MachineFunction &MF = DAG.getMachineFunction();
-    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
-    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
-    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-    Tys = DAG.getVTList(MVT::Other);
-    SDValue Ops[] = {
-      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
-    };
-    MachineMemOperand *MMO =
-      DAG.getMachineFunction()
-      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                            MachineMemOperand::MOStore, SSFISize, SSFISize);
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue 
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  MVT VecVT = Vec.getSimpleValueType();
  
-    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
-                                    Ops, Op.getValueType(), MMO);
-    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
-                         MachinePointerInfo::getFixedStack(SSFI),
-                         false, false, false, 0);
+  if (!isa<ConstantSDNode>(Idx)) {
+    // Non constant index. Extend source and destination,
+    // insert element and then truncate the result.
+    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
    }
  
-  return Result;
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                       DAG.getConstant(IdxVal, MVT::i8));
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift, MVT::i8));
+  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
-
-// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
-SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  // This algorithm is not obvious. Here it is what we're trying to output:
-  /*
-     movq       %rax,  %xmm0
-     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
-     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
-     #ifdef __SSE3__
-       haddpd   %xmm0, %xmm0
-     #else
-       pshufd   $0x4e, %xmm0, %xmm1
-       addpd    %xmm1, %xmm0
-     #endif
-  */
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  
+  if (EltVT == MVT::i1)
+    return InsertBitToMaskVector(Op, DAG);
  
    SDLoc dl(Op);
-  LLVMContext *Context = DAG.getContext();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
  
-  // Build some magic constants.
-  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
-  Constant *C0 = ConstantDataVector::get(*Context, CV0);
-  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+  // If this is a 256-bit vector result, first extract the 128-bit vector,
+  // insert the element into the extracted half and then place it back.
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    if (!isa<ConstantSDNode>(N2))
+      return SDValue();
  
-  SmallVector<Constant*,2> CV1;
-  CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                      APInt(64, 0x4330000000000000ULL))));
-  CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                      APInt(64, 0x4530000000000000ULL))));
-  Constant *C1 = ConstantVector::get(CV1);
-  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+    // Get the desired 128-bit vector half.
+    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
+    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
  
-  // Load the 64-bit value into an XMM register.
-  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
-                            Op.getOperand(0));
-  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
-                              MachinePointerInfo::getConstantPool(),
-                              false, false, false, 16);
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
-                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
-                              CLod0);
+    // Insert the element into the desired half.
+    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
+    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
  
-  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
-                              MachinePointerInfo::getConstantPool(),
-                              false, false, false, 16);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
-  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
-  SDValue Result;
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                    DAG.getConstant(IdxIn128, MVT::i32));
  
-  if (Subtarget->hasSSE3()) {
-    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
-    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
-  } else {
-    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
-    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
-                                           S2F, 0x4E, DAG);
-    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
-                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
-                         Sub);
+    // Insert the changed part back to the 256-bit vector
+    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
    }
  
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
-                     DAG.getIntPtrConstant(0));
+  if (Subtarget->hasSSE41())
+    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
+    // as its second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  }
+  return SDValue();
  }
  
-// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
-SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
-                                               SelectionDAG &DAG) const {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
    SDLoc dl(Op);
-  // FP constant to bias correct the final result.
-  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
-                                   MVT::f64);
+  MVT OpVT = Op.getSimpleValueType();
  
-  // Load the 32-bit value into an XMM register.
-  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                             Op.getOperand(0));
+  // If this is a 256-bit vector result, first insert into a 128-bit
+  // vector and then insert into the 256-bit vector.
+  if (!OpVT.is128BitVector()) {
+    // Insert into a 128-bit vector.
+    unsigned SizeFactor = OpVT.getSizeInBits()/128;
+    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / SizeFactor);
  
-  // Zero out the upper parts of the register.
-  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
  
-  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
-                     DAG.getIntPtrConstant(0));
+    // Insert the 128-bit vector.
+    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+  }
  
-  // Or the load with the bias.
-  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Load)),
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Bias)));
-  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
-                   DAG.getIntPtrConstant(0));
-
-  // Subtract the bias.
-  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+  if (OpVT == MVT::v1i64 &&
+      Op.getOperand(0).getValueType() == MVT::i64)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
  
-  // Handle final rounding.
-  EVT DestVT = Op.getValueType();
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getNode(ISD::BITCAST, dl, OpVT,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+}
  
-  if (DestVT.bitsLT(MVT::f64))
-    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
-                       DAG.getIntPtrConstant(0));
-  if (DestVT.bitsGT(MVT::f64))
-    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue In =  Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT ResVT   = Op.getSimpleValueType();
+  MVT InVT    = In.getSimpleValueType();
  
-  // Handle final rounding.
-  return Sub;
+  if (Subtarget->hasFp256()) {
+    if (ResVT.is128BitVector() &&
+        (InVT.is256BitVector() || InVT.is512BitVector()) &&
+        isa<ConstantSDNode>(Idx)) {
+      return Extract128BitVector(In, IdxVal, DAG, dl);
+    }
+    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      return Extract256BitVector(In, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
  }
  
-SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  SDValue N0 = Op.getOperand(0);
-  MVT SVT = N0.getSimpleValueType();
-  SDLoc dl(Op);
+// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    SDLoc dl(Op.getNode());
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue SubVec = Op.getNode()->getOperand(1);
+    SDValue Idx = Op.getNode()->getOperand(2);
  
-  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
-          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
-         "Custom UINT_TO_FP is not supported!");
+    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
  
-  MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
-  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
-                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
  }
  
-SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDValue N0 = Op.getOperand(0);
-  SDLoc dl(Op);
-
-  if (Op.getValueType().isVector())
-    return lowerUINT_TO_FP_vec(Op, DAG);
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  
-  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
-  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
-  // the optimization here.
-  if (DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
  
-  MVT SrcVT = N0.getSimpleValueType();
-  MVT DstVT = Op.getSimpleValueType();
-  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
-    return LowerUINT_TO_FP_i64(Op, DAG);
-  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
-    return LowerUINT_TO_FP_i32(Op, DAG);
-  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
-    return SDValue();
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
  
-  // Make a 64-bit buffer, and use it to build an FILD.
-  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
-  if (SrcVT == MVT::i32) {
-    SDValue WordOff = DAG.getConstant(4, getPointerTy());
-    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
-                                     getPointerTy(), StackSlot, WordOff);
-    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                                  StackSlot, MachinePointerInfo(),
-                                  false, false, 0);
-    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
-                                  OffsetSlot, MachinePointerInfo(),
-                                  false, false, 0);
-    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
-    return Fild;
+  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                             CP->getAlignment(),
+                                             CP->getOffset(), OpFlag);
+  SDLoc DL(CP);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
    }
  
-  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                               StackSlot, MachinePointerInfo(),
-                               false, false, 0);
-  // For i64 source, we need to add the appropriate power of 2 if the input
-  // was negative.  This is the same as the optimization in
-  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
-  // we must be careful to do the computation in x87 extended precision, not
-  // in SSE. (The generic code can't know it's OK to do this, or how to.)
-  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
-  MachineMemOperand *MMO =
-    DAG.getMachineFunction()
-    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                          MachineMemOperand::MOLoad, 8, 8);
+  return Result;
+}
  
-  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
-  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
-                                         MVT::i64, MMO);
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  
-  APInt FF(32, 0x5F800000ULL);
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
  
-  // Check whether the sign bit is set.
-  SDValue SignSet = DAG.getSetCC(dl,
-                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
-                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
-                                 ISD::SETLT);
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
  
-  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
-  SDValue FudgePtr = DAG.getConstantPool(
-                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
-                                         getPointerTy());
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                          OpFlag);
+  SDLoc DL(JT);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
-  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
-  SDValue Zero = DAG.getIntPtrConstant(0);
-  SDValue Four = DAG.getIntPtrConstant(4);
-  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
-                               Zero, Four);
-  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
  
-  // Load the value out, extending it from f32 to f80.
-  // FIXME: Avoid the extend by constructing the right constant pool?
-  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
-                                 FudgePtr, MachinePointerInfo::getConstantPool(),
-                                 MVT::f32, false, false, 4);
-  // Extend everything to 80 bits to force it to be done on x87.
-  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
-  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+  return Result;
  }
  
-std::pair<SDValue,SDValue>
-X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
-                                    bool IsSigned, bool IsReplace) const {
-  SDLoc DL(Op);
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  
-  EVT DstTy = Op.getValueType();
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
  
-  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
-    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
-    DstTy = MVT::i64;
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
    }
  
-  assert(DstTy.getSimpleVT() <= MVT::i64 &&
-         DstTy.getSimpleVT() >= MVT::i16 &&
-         "Unknown FP_TO_INT to lower!");
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
  
-  // These are really Legal.
-  if (DstTy == MVT::i32 &&
-      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
-    return std::make_pair(SDValue(), SDValue());
-  if (Subtarget->is64Bit() &&
-      DstTy == MVT::i64 &&
-      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
-    return std::make_pair(SDValue(), SDValue());
-
-  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
-  // stack slot, or into the FTOL runtime function.
-  MachineFunction &MF = DAG.getMachineFunction();
-  unsigned MemSize = DstTy.getSizeInBits()/8;
-  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
-  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-
-  unsigned Opc;
-  if (!IsSigned && isIntegerTypeFTOL(DstTy))
-    Opc = X86ISD::WIN_FTOL;
-  else
-    switch (DstTy.getSimpleVT().SimpleTy) {
-    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
-    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
-    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
-    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
-    }
-
-  SDValue Chain = DAG.getEntryNode();
-  SDValue Value = Op.getOperand(0);
-  EVT TheVT = Op.getOperand(0).getValueType();
-  // FIXME This causes a redundant load/store if the SSE-class value is already
-  // in memory, such as if it is on the callstack.
-  if (isScalarFPTypeInSSEReg(TheVT)) {
-    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
-    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
-                         MachinePointerInfo::getFixedStack(SSFI),
-                         false, false, 0);
-    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
-    SDValue Ops[] = {
-      Chain, StackSlot, DAG.getValueType(TheVT)
-    };
+  SDLoc DL(Op);
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                              MachineMemOperand::MOLoad, MemSize, MemSize);
-    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
-    Chain = Value.getValue(1);
-    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
-    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  // With PIC, the address is actually $g + Offset.
+  if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->is64Bit()) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     SDLoc(), getPointerTy()),
+                         Result);
    }
  
-  MachineMemOperand *MMO =
-    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                            MachineMemOperand::MOStore, MemSize, MemSize);
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
  
-  if (Opc != X86ISD::WIN_FTOL) {
-    // Build the FP_TO_INT*_IN_MEM
-    SDValue Ops[] = { Chain, Value, StackSlot };
-    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
-                                           Ops, DstTy, MMO);
-    return std::make_pair(FIST, StackSlot);
-  } else {
-    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
-      DAG.getVTList(MVT::Other, MVT::Glue),
-      Chain, Value);
-    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
-      MVT::i32, ftol.getValue(1));
-    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
-      MVT::i32, eax.getValue(2));
-    SDValue Ops[] = { eax, edx };
-    SDValue pair = IsReplace
-      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
-      : DAG.getMergeValues(Ops, DL);
-    return std::make_pair(pair, SDValue());
-  }
+  return Result;
  }
  
-static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
-                              const X86Subtarget *Subtarget) {
-  MVT VT = Op->getSimpleValueType(0);
-  SDValue In = Op->getOperand(0);
-  MVT InVT = In.getSimpleValueType();
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Create the TargetBlockAddressAddress node.
+  unsigned char OpFlags =
+    Subtarget->ClassifyBlockAddressReference();
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
    SDLoc dl(Op);
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+                                             OpFlags);
  
-  // Optimize vectors in AVX mode:
-  //
-  //   v8i16 -> v8i32
-  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
-  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
-  //   Concat upper and lower parts.
-  //
-  //   v4i32 -> v4i64
-  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
-  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
-  //   Concat upper and lower parts.
-  //
-
-  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
-      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
-      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
-    return SDValue();
-
-  if (Subtarget->hasInt256())
-    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  
-  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
-  SDValue Undef = DAG.getUNDEF(InVT);
-  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
-  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
-  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
  
-  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
-                             VT.getVectorNumElements()/2);
+  return Result;
+}
  
-  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
-  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
+                                      int64_t Offset, SelectionDAG &DAG) const {
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    // A direct static reference to a global.
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+  }
  
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
-}
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  
-static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
-                                        SelectionDAG &DAG) {
-  MVT VT = Op->getSimpleValueType(0);
-  SDValue In = Op->getOperand(0);
-  MVT InVT = In.getSimpleValueType();
-  SDLoc DL(Op);
-  unsigned int NumElts = VT.getVectorNumElements();
-  if (NumElts != 8 && NumElts != 16)
-    return SDValue();
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
  
-  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
-    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+  // For globals that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
  
-  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  // Now we have only mask extension
-  assert(InVT.getVectorElementType() == MVT::i1);
-  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
-  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
-  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
  
-  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
-  if (VT.is512BitVector())
-    return Brcst;
-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+  return Result;
  }
  
-static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
-                               SelectionDAG &DAG) {
-  if (Subtarget->hasFp256()) {
-    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
-    if (Res.getNode())
-      return Res;
-  }
-
-  return SDValue();
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
  }
  
-static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
-                                SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  SDValue In = Op.getOperand(0);
-  MVT SVT = In.getSimpleValueType();
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags, bool LocalDynamic = false) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDLoc dl(GA);
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
  
-  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
-    return LowerZERO_EXTEND_AVX512(Op, DAG);
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
  
-  if (Subtarget->hasFp256()) {
-    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
-    if (Res.getNode())
-      return Res;
+  if (InFlag) {
+    SDValue Ops[] = { Chain,  TGA, *InFlag };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  } else {
+    SDValue Ops[]  = { Chain, TGA };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
    }
  
-  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
-         VT.getVectorNumElements() != SVT.getVectorNumElements());
-  return SDValue();
-}
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI->setAdjustsStack(true);
  
-SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  SDValue In = Op.getOperand(0);
-  MVT InVT = In.getSimpleValueType();
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
  
-  if (VT == MVT::i1) {
-    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
-           "Invalid scalar TRUNCATE operation");
-    if (InVT == MVT::i32)
-      return SDValue();
-    if (InVT.getSizeInBits() == 64)
-      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
-    else if (InVT.getSizeInBits() < 32)
-      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
-    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
-  }
-  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
-         "Invalid TRUNCATE operation");
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  SDLoc dl(GA);  // ? function entry point might be better
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               SDLoc(), PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
  
-  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
-    if (VT.getVectorElementType().getSizeInBits() >=8)
-      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
  
-    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
-    unsigned NumElts = InVT.getVectorNumElements();
-    assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
-    if (InVT.getSizeInBits() < 512) {
-      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
-      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
-      InVT = ExtVT;
-    }
-    
-    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
-    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
-    SDValue CP = DAG.getConstantPool(C, getPointerTy());
-    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
-    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
-    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
-    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
-  }
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
  
-  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
-    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
-    if (Subtarget->hasInt256()) {
-      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
-      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
-                                ShufMask);
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
-                         DAG.getIntPtrConstant(0));
-    }
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  SDLoc dl(GA);
  
-    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
-                               DAG.getIntPtrConstant(0));
-    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
-                               DAG.getIntPtrConstant(2));
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
-    static const int ShufMask[] = {0, 2, 4, 6};
-    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+      .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
    }
  
-  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
-    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
-    if (Subtarget->hasInt256()) {
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
  
-      SmallVector<SDValue,32> pshufbMask;
-      for (unsigned i = 0; i < 2; ++i) {
-        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
-        for (unsigned j = 0; j < 8; ++j)
-          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-      }
-      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
-      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
  
-      static const int ShufMask[] = {0,  2,  -1,  -1};
-      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
-                                &ShufMask[0]);
-      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
-                       DAG.getIntPtrConstant(0));
-      return DAG.getNode(ISD::BITCAST, DL, VT, In);
-    }
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
  
-    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
-                               DAG.getIntPtrConstant(0));
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit, bool isPIC) {
+  SDLoc dl(GA);
  
-    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
-                               DAG.getIntPtrConstant(4));
+  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+                                                         is64Bit ? 257 : 256));
  
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+  SDValue ThreadPointer =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+                  MachinePointerInfo(Ptr), false, false, false, 0);
  
-    // The PSHUFB mask:
-    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
-                                   -1, -1, -1, -1, -1, -1, -1, -1};
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
+  } else {
+    llvm_unreachable("Unexpected model");
+  }
  
-    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+  SDValue TGA =
+      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                 GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
  
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+                           Offset);
+    }
  
-    // The MOVLHPS Mask:
-    static const int ShufMask2[] = {0, 1, 4, 5};
-    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
-    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
    }
  
-  // Handle truncation of V256 to V128 using shuffles.
-  if (!VT.is128BitVector() || !InVT.is256BitVector())
-    return SDValue();
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
  
-  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  
-  unsigned NumElems = VT.getVectorNumElements();
-  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
  
-  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
-  // Prepare truncation shuffle mask
-  for (unsigned i = 0; i != NumElems; ++i)
-    MaskVec[i] = i * 2;
-  SDValue V = DAG.getVectorShuffle(NVT, DL,
-                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
-                                   DAG.getUNDEF(NVT), &MaskVec[0]);
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
-                     DAG.getIntPtrConstant(0));
-}
+  if (Subtarget->isTargetELF()) {
+    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
  
-SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  assert(!Op.getSimpleValueType().isVector());
+    switch (model) {
+      case TLSModel::GeneralDynamic:
+        if (Subtarget->is64Bit())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+                                           Subtarget->is64Bit());
+      case TLSModel::InitialExec:
+      case TLSModel::LocalExec:
+        return LowerToTLSExecModel(
+            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
+            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+    }
+    llvm_unreachable("Unknown TLS model.");
+  }
  
-  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
-    /*IsSigned=*/ true, /*IsReplace=*/ false);
-  SDValue FIST = Vals.first, StackSlot = Vals.second;
-  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
-  if (!FIST.getNode()) return Op;
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin only has one model of TLS.  Lower to that.
+    unsigned char OpFlag = 0;
+    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+                           X86ISD::WrapperRIP : X86ISD::Wrapper;
  
-  if (StackSlot.getNode())
-    // Load the result.
-    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
-                       FIST, StackSlot, MachinePointerInfo(),
-                       false, false, false, 0);
+    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+    // global base reg.
+    bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+                 !Subtarget->is64Bit();
+    if (PIC32)
+      OpFlag = X86II::MO_TLVP_PIC_BASE;
+    else
+      OpFlag = X86II::MO_TLVP;
+    SDLoc DL(Op);
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                                GA->getValueType(0),
+                                                GA->getOffset(), OpFlag);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
-  // The node is the result.
-  return FIST;
-}
+    // With PIC32, the address is actually $g + Offset.
+    if (PIC32)
+      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                           DAG.getNode(X86ISD::GlobalBaseReg,
+                                       SDLoc(), getPointerTy()),
+                           Offset);
  
-SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
-    /*IsSigned=*/ false, /*IsReplace=*/ false);
-  SDValue FIST = Vals.first, StackSlot = Vals.second;
-  assert(FIST.getNode() && "Unexpected failure");
+    // Lowering the machine isd will make sure everything is in the right
+    // location.
+    SDValue Chain = DAG.getEntryNode();
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Args[] = { Chain, Offset };
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
  
-  if (StackSlot.getNode())
-    // Load the result.
-    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
-                       FIST, StackSlot, MachinePointerInfo(),
-                       false, false, false, 0);
+    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setAdjustsStack(true);
  
-  // The node is the result.
-  return FIST;
-}
+    // And our return value (tls address) is in the standard call return value
+    // location.
+    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+                              Chain.getValue(1));
+  }
  
-static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  MVT VT = Op.getSimpleValueType();
-  SDValue In = Op.getOperand(0);
-  MVT SVT = In.getSimpleValueType();
+  if (Subtarget->isTargetKnownWindowsMSVC() ||
+      Subtarget->isTargetWindowsGNU()) {
+    // Just use the implicit TLS architecture
+    // Need to generate someting similar to:
+    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                                  ; from TEB
+    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
+    //   mov     rcx, qword [rdx+rcx*8]
+    //   mov     eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
  
-  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+    SDLoc dl(GA);
+    SDValue Chain = DAG.getEntryNode();
  
-  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
-                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
-                                 In, DAG.getUNDEF(SVT)));
-}
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
+    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
  
-static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
-  SDLoc dl(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
-  if (VT.isVector()) {
-    EltVT = VT.getVectorElementType();
-    NumElts = VT.getVectorNumElements();
-  }
-  Constant *C;
-  if (EltVT == MVT::f64)
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                          APInt(64, ~(1ULL << 63))));
-  else
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
-                                          APInt(32, ~(1U << 31))));
-  C = ConstantVector::getSplat(NumElts, C);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-  if (VT.isVector()) {
-    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(ISD::AND, dl, ANDVT,
-                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
-                                               Op.getOperand(0)),
-                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
-  }
-  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
-}
+    SDValue TlsArray =
+        Subtarget->is64Bit()
+            ? DAG.getIntPtrConstant(0x58)
+            : (Subtarget->isTargetWindowsGNU()
+                   ? DAG.getIntPtrConstant(0x2C)
+                   : DAG.getExternalSymbol("_tls_array", getPointerTy()));
  
-static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
-  SDLoc dl(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
-  if (VT.isVector()) {
-    EltVT = VT.getVectorElementType();
-    NumElts = VT.getVectorNumElements();
-  }
-  Constant *C;
-  if (EltVT == MVT::f64)
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                          APInt(64, 1ULL << 63)));
-  else
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
-                                          APInt(32, 1U << 31)));
-  C = ConstantVector::getSplat(NumElts, C);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-  if (VT.isVector()) {
-    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(ISD::XOR, dl, XORVT,
-                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
-                                               Op.getOperand(0)),
-                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+    SDValue ThreadPointer =
+        DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+                    MachinePointerInfo(Ptr), false, false, false, 0);
+
+    // Load the _tls_index variable
+    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+    if (Subtarget->is64Bit())
+      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+                           IDX, MachinePointerInfo(), MVT::i32,
+                           false, false, false, 0);
+    else
+      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+                        false, false, false, 0);
+
+    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+                                    getPointerTy());
+    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+                      false, false, false, 0);
+
+    // Get the offset of start of .tls section
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
    }
  
-  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+  llvm_unreachable("TLS not implemented for this target.");
  }
  
-static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  LLVMContext *Context = DAG.getContext();
-  SDValue Op0 = Op.getOperand(0);
-  SDValue Op1 = Op.getOperand(1);
-  SDLoc dl(Op);
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    MVT VT = Op.getSimpleValueType();
-  MVT SrcVT = Op1.getSimpleValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
+  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                                     DAG.getConstant(VTBits - 1, MVT::i8))
+                       : DAG.getConstant(0, VT);
  
-  // If second operand is smaller, extend it first.
-  if (SrcVT.bitsLT(VT)) {
-    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
-    SrcVT = VT;
-  }
-  // And if it is bigger, shrink it first.
-  if (SrcVT.bitsGT(VT)) {
-    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
-    SrcVT = VT;
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
    }
  
-  // At this point the operands and the result should have the same
-  // type, and that won't be f80 since that is not custom lowered.
+  // If the shift amount is larger or equal than the width of a part we can't
+  // rely on the results of shld/shrd. Insert a test and select the appropriate
+  // values for large shift amounts.
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                             AndNode, DAG.getConstant(0, MVT::i8));
  
-  // First get the sign bit of second operand.
-  SmallVector<Constant*,4> CV;
-  if (SrcVT == MVT::f64) {
-    const fltSemantics &Sem = APFloat::IEEEdouble;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
    } else {
-    const fltSemantics &Sem = APFloat::IEEEsingle;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-  }
-  Constant *C = ConstantVector::get(CV);
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
-  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
-                              MachinePointerInfo::getConstantPool(),
-                              false, false, false, 16);
-  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
-
-  // Shift sign bit right or left if the two operands have different types.
-  if (SrcVT.bitsGT(VT)) {
-    // Op0 is MVT::f32, Op1 is MVT::f64.
-    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
-    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
-                          DAG.getConstant(32, MVT::i32));
-    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
-    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
-                          DAG.getIntPtrConstant(0));
-  }
-
-  // Clear first operand sign bit.
-  CV.clear();
-  if (VT == MVT::f64) {
-    const fltSemantics &Sem = APFloat::IEEEdouble;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
-                                                   APInt(64, ~(1ULL << 63)))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
-  } else {
-    const fltSemantics &Sem = APFloat::IEEEsingle;
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
-                                                   APInt(32, ~(1U << 31)))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
-    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
    }
-  C = ConstantVector::get(CV);
-  CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
-  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                              MachinePointerInfo::getConstantPool(),
-                              false, false, false, 16);
-  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
-
-  // Or the value with the sign bit.
-  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
-}
-
-static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
-  SDValue N0 = Op.getOperand(0);
-  SDLoc dl(Op);
-  MVT VT = Op.getSimpleValueType();
  
-  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
-  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
-                                  DAG.getConstant(1, VT));
-  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
  }
  
-// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
-//
-static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
-                                      SelectionDAG &DAG) {
-  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
-
-  if (!Subtarget->hasSSE41())
-    return SDValue();
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  
-  if (!Op->hasOneUse())
+  if (SrcVT.isVector())
      return SDValue();
  
-  SDNode *N = Op.getNode();
-  SDLoc DL(N);
-
-  SmallVector<SDValue, 8> Opnds;
-  DenseMap<SDValue, unsigned> VecInMap;
-  SmallVector<SDValue, 8> VecIns;
-  EVT VT = MVT::Other;
+  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
  
-  // Recognize a special case where a vector is casted into wide integer to
-  // test all 0s.
-  Opnds.push_back(N->getOperand(0));
-  Opnds.push_back(N->getOperand(1));
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget->is64Bit()) {
+    return Op;
+  }
  
-  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
-    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
-    // BFS traverse all OR'd operands.
-    if (I->getOpcode() == ISD::OR) {
-      Opnds.push_back(I->getOperand(0));
-      Opnds.push_back(I->getOperand(1));
-      // Re-evaluate the number of nodes to be traversed.
-      e += 2; // 2 more nodes (LHS and RHS) are pushed.
-      continue;
-    }
+  SDLoc dl(Op);
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot,
+                               MachinePointerInfo::getFixedStack(SSFI),
+                               false, false, 0);
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
  
-    // Quit if a non-EXTRACT_VECTOR_ELT
-    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-      return SDValue();
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) const {
+  // Build the FILD
+  SDLoc DL(Op);
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
  
-    // Quit if without a constant index.
-    SDValue Idx = I->getOperand(1);
-    if (!isa<ConstantSDNode>(Idx))
-      return SDValue();
+  unsigned ByteSize = SrcVT.getSizeInBits()/8;
  
-    SDValue ExtractedFromVec = I->getOperand(0);
-    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
-    if (M == VecInMap.end()) {
-      VT = ExtractedFromVec.getValueType();
-      // Quit if not 128/256-bit vector.
-      if (!VT.is128BitVector() && !VT.is256BitVector())
-        return SDValue();
-      // Quit if not the same type.
-      if (VecInMap.begin() != VecInMap.end() &&
-          VT != VecInMap.begin()->first.getValueType())
-        return SDValue();
-      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
-      VecIns.push_back(ExtractedFromVec);
-    }
-    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
    }
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+                                           X86ISD::FILD, DL,
+                                           Tys, Ops, SrcVT, MMO);
  
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Not extracted from 128-/256-bit vector.");
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
  
-  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    MachineMemOperand *MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, SSFISize, SSFISize);
  
-  for (DenseMap<SDValue, unsigned>::const_iterator
-        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
-    // Quit if not all elements are used.
-    if (I->second != FullMask)
-      return SDValue();
+    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+                                    Ops, Op.getValueType(), MMO);
+    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, false, 0);
    }
  
-  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+  return Result;
+}
  
-  // Cast all vectors into TestVT for PTEST.
-  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
-    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // This algorithm is not obvious. Here it is what we're trying to output:
+  /*
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
+  */
  
-  // If more than one full vectors are evaluated, OR them first before PTEST.
-  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
-    // Each iteration will OR 2 nodes and append the result until there is only
-    // 1 node left, i.e. the final OR'd value of all vectors.
-    SDValue LHS = VecIns[Slot];
-    SDValue RHS = VecIns[Slot + 1];
-    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
-  }
+  SDLoc dl(Op);
+  LLVMContext *Context = DAG.getContext();
  
-  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
-                     VecIns.back(), VecIns.back());
-}
+  // Build some magic constants.
+  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
-/// \brief return true if \c Op has a use that doesn't just read flags.
-static bool hasNonFlagsUse(SDValue Op) {
-  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
-       ++UI) {
-    SDNode *User = *UI;
-    unsigned UOpNo = UI.getOperandNo();
-    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
-      // Look pass truncate.
-      UOpNo = User->use_begin().getOperandNo();
-      User = *User->use_begin();
-    }
+  SmallVector<Constant*,2> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4530000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
  
-    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
-        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+                     DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // FP constant to bias correct the final result.
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Load)),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Bias)));
+  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
+                   DAG.getIntPtrConstant(0));
+
+  // Subtract the bias.
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding.
+  EVT DestVT = Op.getValueType();
+
+  if (DestVT.bitsLT(MVT::f64))
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0));
+  if (DestVT.bitsGT(MVT::f64))
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+  // Handle final rounding.
+  return Sub;
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  MVT SVT = N0.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+
+  if (Op.getValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  MVT SrcVT = N0.getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
+
+  // Make a 64-bit buffer, and use it to build an FILD.
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcVT == MVT::i32) {
+    SDValue WordOff = DAG.getConstant(4, getPointerTy());
+    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+                                     getPointerTy(), StackSlot, WordOff);
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                  StackSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+                                  OffsetSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    return Fild;
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot, MachinePointerInfo(),
+                               false, false, 0);
+  // For i64 source, we need to add the appropriate power of 2 if the input
+  // was negative.  This is the same as the optimization in
+  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
+  // we must be careful to do the computation in x87 extended precision, not
+  // in SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction()
+    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                          MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+                                         MVT::i64, MMO);
+
+  APInt FF(32, 0x5F800000ULL);
+
+  // Check whether the sign bit is set.
+  SDValue SignSet = DAG.getSetCC(dl,
+                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
+                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+                                 ISD::SETLT);
+
+  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                                         getPointerTy());
+
+  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0);
+  SDValue Four = DAG.getIntPtrConstant(4);
+  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+  // Load the value out, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+                                 FudgePtr, MachinePointerInfo::getConstantPool(),
+                                 MVT::f32, false, false, false, 4);
+  // Extend everything to 80 bits to force it to be done on x87.
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue>
+X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                    bool IsSigned, bool IsReplace) const {
+  SDLoc DL(Op);
+
+  EVT DstTy = Op.getValueType();
+
+  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal.
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget->is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+  // stack slot, or into the FTOL runtime function.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  if (!IsSigned && isIntegerTypeFTOL(DstTy))
+    Opc = X86ISD::WIN_FTOL;
+  else
+    switch (DstTy.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+    }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  EVT TheVT = Op.getOperand(0).getValueType();
+  // FIXME This causes a redundant load/store if the SSE-class value is already
+  // in memory, such as if it is on the callstack.
+  if (isScalarFPTypeInSSEReg(TheVT)) {
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(TheVT)
+    };
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                              MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, MemSize, MemSize);
+
+  if (Opc != X86ISD::WIN_FTOL) {
+    // Build the FP_TO_INT*_IN_MEM
+    SDValue Ops[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           Ops, DstTy, MMO);
+    return std::make_pair(FIST, StackSlot);
+  } else {
+    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+      DAG.getVTList(MVT::Other, MVT::Glue),
+      Chain, Value);
+    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+      MVT::i32, ftol.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+      MVT::i32, eax.getValue(2));
+    SDValue Ops[] = { eax, edx };
+    SDValue pair = IsReplace
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+      : DAG.getMergeValues(Ops, DL);
+    return std::make_pair(pair, SDValue());
+  }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  //   Concat upper and lower parts.
+  //
+  //   v4i32 -> v4i64
+  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  //   Concat upper and lower parts.
+  //
+
+  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+                             VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc DL(Op);
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Now we have only mask extension
+  assert(InVT.getVectorElementType() == MVT::i1);
+  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                               SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, DAG);
+
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  if (VT == MVT::i1) {
+    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+           "Invalid scalar TRUNCATE operation");
+    if (InVT == MVT::i32)
+      return SDValue();
+    if (InVT.getSizeInBits() == 64)
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
+    else if (InVT.getSizeInBits() < 32)
+      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+  }
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+    if (VT.getVectorElementType().getSizeInBits() >=8)
+      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+    unsigned NumElts = InVT.getVectorNumElements();
+    assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+    if (InVT.getSizeInBits() < 512) {
+      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+    }
+    
+    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+    SDValue CP = DAG.getConstantPool(C, getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget->hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(0));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(2));
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    static const int ShufMask[] = {0, 2, 4, 6};
+    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+  }
+
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+    if (Subtarget->hasInt256()) {
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
+      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+    // The PSHUFB mask:
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The MOVLHPS Mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
+    return SDValue();
+
+  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare truncation shuffle mask
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL,
+                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+                                   DAG.getUNDEF(NVT), &MaskVec[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  assert(!Op.getSimpleValueType().isVector());
+
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ true, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (!FIST.getNode()) return Op;
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ false, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  assert(FIST.getNode() && "Unexpected failure");
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
+  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, 1ULL << 63)));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, 1U << 31)));
+  C = ConstantVector::getSplat(NumElts, C);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, XORVT,
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+  }
+
+  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  LLVMContext *Context = DAG.getContext();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT SrcVT = Op1.getSimpleValueType();
+
+  // If second operand is smaller, extend it first.
+  if (SrcVT.bitsLT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+    SrcVT = VT;
+  }
+  // And if it is bigger, shrink it first.
+  if (SrcVT.bitsGT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+    SrcVT = VT;
+  }
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+
+  // First get the sign bit of second operand.
+  SmallVector<Constant*,4> CV;
+  if (SrcVT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  if (SrcVT.bitsGT(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+                          DAG.getIntPtrConstant(0));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(64, ~(1ULL << 63)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(32, ~(1U << 31)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  SDLoc DL(N);
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  SmallVector<SDValue, 8> VecIns;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+      VecIns.push_back(ExtractedFromVec);
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// \brief return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    unsigned UOpNo = UI.getOperandNo();
+    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look pass truncate.
+      UOpNo = User->use_begin().getOperandNo();
+      User = *User->use_begin();
+    }
+
+    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
        return true;
    }
    return false;
@@ -11656,6 +13159,210 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
  }
  
+// Lower vector extended loads using a shuffle. If SSSE3 is not available we
+// may emit an illegal shuffle but the expansion is still better than scalar
+// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// we'll emit a shuffle and a arithmetic shift.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT RegVT = Op.getSimpleValueType();
+  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+  assert(RegVT.isInteger() &&
+         "We only custom lower integer vector sext loads.");
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
+         && "Only anyext and sext are currently implemented.");
+  assert(MemVT != RegVT && "Cannot extend to the same type");
+  assert(MemVT.isVector() && "Must load a vector from memory");
+
+  unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned MemSz = MemVT.getSizeInBits();
+  assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
+    // The only way in which we have a legal 256-bit vector result but not the
+    // integer 256-bit operations needed to directly lower a sextload is if we
+    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+    // a 128-bit vector and a normal sign_extend to 256-bits that should get
+    // correctly legalized. We do this late to allow the canonical form of
+    // sextload to persist throughout the rest of the DAG combiner -- it wants
+    // to fold together any extensions it can, and so will fuse a sign_extend
+    // of an sextload into an sextload targeting a wider value.
+    SDValue Load;
+    if (MemSz == 128) {
+      // Just switch this to a normal load.
+      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+                                       "it must be a legal 128-bit vector "
+                                       "type!");
+      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                  Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
+                  Ld->isInvariant(), Ld->getAlignment());
+    } else {
+      assert(MemSz < 128 &&
+             "Can't extend a type wider than 128 bits to a 256 bit vector!");
+      // Do an sext load to a 128-bit vector type. We want to use the same
+      // number of elements, but elements half as wide. This will end up being
+      // recursively lowered by this routine, but will succeed as we definitely
+      // have all the necessary features if we're using AVX1.
+      EVT HalfEltVT =
+          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+      Load =
+          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
+                         Ld->isNonTemporal(), Ld->isInvariant(),
+                         Ld->getAlignment());
+    }
+
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal sign-extend to the desired register.
+    return DAG.getSExtOrTrunc(Load, dl, RegVT);
+  }
+
+  // All sizes must be a power of two.
+  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+         "Non-power-of-two elements are not custom lowered!");
+
+  // Attempt to load the original value using scalar loads.
+  // Find the largest scalar type that divides the total loaded size.
+  MVT SclrLoadTy = MVT::i8;
+  for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+       tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+    MVT Tp = (MVT::SimpleValueType)tp;
+    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+      SclrLoadTy = Tp;
+    }
+  }
+
+  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+      (64 <= MemSz))
+    SclrLoadTy = MVT::f64;
+
+  // Calculate the number of scalar loads that we need to perform
+  // in order to load our vector from memory.
+  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+         "Can only lower sext loads with a single scalar load!");
+
+  unsigned loadRegZize = RegSz;
+  if (Ext == ISD::SEXTLOAD && RegSz == 256)
+    loadRegZize /= 2;
+
+  // Represent our vector as a sequence of elements which are the
+  // largest scalar that we can load.
+  EVT LoadUnitVecVT = EVT::getVectorVT(
+      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
+
+  // Represent the data using the same element type that is stored in
+  // memory. In practice, we ''widen'' MemVT.
+  EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize / MemVT.getScalarType().getSizeInBits());
+
+  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+         "Invalid vector type");
+
+  // We can't shuffle using an illegal type.
+  assert(TLI.isTypeLegal(WideVecVT) &&
+         "We only lower types that form legal widened vector types");
+
+  SmallVector<SDValue, 8> Chains;
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Increment =
+      DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
+  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    // Perform a single load.
+    SDValue ScalarLoad =
+        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
+                    Ld->getAlignment());
+    Chains.push_back(ScalarLoad.getValue(1));
+    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+    // another round of DAGCombining.
+    if (i == 0)
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+    else
+      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                        ScalarLoad, DAG.getIntPtrConstant(i));
+
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+  }
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+  // Bitcast the loaded value to a vector of the original element type, in
+  // the size of the target vector type.
+  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+  unsigned SizeRatio = RegSz / MemSz;
+
+  if (Ext == ISD::SEXTLOAD) {
+    // If we have SSE4.1 we can directly emit a VSEXT node.
+    if (Subtarget->hasSSE41()) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+      return Sext;
+    }
+
+    // Otherwise we'll shuffle the small elements in the high bits of the
+    // larger type and perform an arithmetic shift. If the shift is not legal
+    // it's better to scalarize.
+    assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
+           "We can't implement an sext load without a arithmetic right shift!");
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(
+        WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+    // Build the arithmetic shift.
+    unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                   MemVT.getVectorElementType().getSizeInBits();
+    Shuff =
+        DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Shuff;
+  }
+
+  // Redistribute the loaded elements into the different locations.
+  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                       DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+
+  // Bitcast to the requested type.
+  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+  return Shuff;
+}
+
  // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
  // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
  // from the AND / OR.
@@ -11996,7 +13703,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
      Chain = SP.getValue(1);
      unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-    const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
+    const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
      unsigned StackAlign = TFI.getStackAlignment();
      Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
      if (Align > StackAlign)
@@ -12054,8 +13761,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
      Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
  
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
      unsigned SPReg = RegInfo->getStackRegister();
      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
      Chain = SP.getValue(1);
@@ -13119,6 +14826,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    return SDValue(Res, 0);
  }
  
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+                              SelectionDAG &DAG, const X86Subtarget *Subtarget,
+                              SmallVectorImpl<SDValue> &Results) {
+  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue LO, HI;
+
+  // The ECX register is used to select the index of the performance counter
+  // to read.
+  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+                                   N->getOperand(2));
+  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+  // Reads the content of a 64-bit performance counter and returns it in the
+  // registers EDX:EAX.
+  if (Subtarget->is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  Chain = HI.getValue(1);
+
+  if (Subtarget->is64Bit()) {
+    // The EAX register is loaded with the low-order 32 bits. The EDX register
+    // is loaded with the supported high-order bits of the counter.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
  // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
  // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
  // also used to custom lower READCYCLECOUNTER nodes.
@@ -13183,7 +14935,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
  }
  
  enum IntrinsicType {
-  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
  };
  
  struct IntrinsicData {
@@ -13277,6 +15029,8 @@ static void InitIntinsicsMap() {
                                  IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
    IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
                                  IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
+                                IntrinsicData(RDPMC,  X86ISD::RDPMC_DAG, 0)));
    Initialized = true;
  }
  
@@ -13352,6 +15106,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
      return DAG.getMergeValues(Results, dl);
    }
+  // Read Performance Monitoring Counters.
+  case RDPMC: {
+    SmallVector<SDValue, 2> Results;
+    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
    // XTEST intrinsics.
    case XTEST: {
      SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -13381,8 +15141,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
  
    if (Depth > 0) {
      SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
      SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
      return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                         DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -13403,8 +15163,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
    SDLoc dl(Op);  // FIXME probably not meaningful
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
            (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -13432,8 +15192,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
  
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
  }
  
@@ -13444,8 +15204,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl      (Op);
  
    EVT PtrVT = getPointerTy();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
    assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
            (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -13492,7 +15252,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
    SDLoc dl (Op);
  
    const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
  
    if (Subtarget->is64Bit()) {
      SDValue OutChains[6];
@@ -13656,7 +15416,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
  
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
    unsigned StackAlignment = TFI.getStackAlignment();
    MVT VT = Op.getSimpleValueType();
    SDLoc DL(Op);
@@ -13940,7 +15700,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
    CLI.setDebugLoc(dl).setChain(InChain)
      .setCallee(getLibcallCallingConv(LC),
                 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
-               Callee, &Args, 0)
+               Callee, std::move(Args), 0)
      .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  
    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -13956,10 +15716,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
           (VT == MVT::v8i32 && Subtarget->hasInt256()));
  
-  // Get the high parts.
-  const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
-  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
-  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widen result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other word, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, with need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
  
    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
@@ -13967,16 +15740,28 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
    unsigned Opcode =
        (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
    SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                               DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
    SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
  
    // Shuffle it back into the right order.
-  const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
-  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-  const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
-  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {1, 5, 3, 7};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {1, 4, 2, 6};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
  
    // If we have a signed multiply but no PMULDQ fix up the high parts of a
    // unsigned multiply.
@@ -13992,7 +15777,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
      Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
    }
  
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The first result of MUL_LOHI is actually the low value, followed by the
+  // high value.
+  SDValue Ops[] = {Lows, Highs};
+  return DAG.getMergeValues(Ops, dl);
  }
  
  static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -14003,10 +15791,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
    SDValue Amt = Op.getOperand(1);
  
    // Optimize shl/srl/sra with constant shift amount.
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      uint64_t ShiftAmt = C->getZExtValue();
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
  
        if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
            (Subtarget->hasInt256() &&
@@ -14313,15 +16100,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
  
  static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                            SelectionDAG &DAG) {
-
    MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
    SDValue R = Op.getOperand(0);
    SDValue Amt = Op.getOperand(1);
    SDValue V;
  
-  if (!Subtarget->hasSSE2())
-    return SDValue();
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
  
    V = LowerScalarImmediateShift(Op, DAG, Subtarget);
    if (V.getNode())
@@ -14940,7 +16726,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
  
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
  
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  
@@ -14994,6 +16780,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
    case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
    case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
    case ISD::FABS:               return LowerFABS(Op, DAG);
    case ISD::FNEG:               return LowerFNEG(Op, DAG);
    case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
@@ -15069,29 +16856,6 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
    Results.push_back(Swap.getValue(2));
  }
  
-static void
-ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
-                        SelectionDAG &DAG, unsigned NewOp) {
-  SDLoc dl(Node);
-  assert (Node->getValueType(0) == MVT::i64 &&
-          "Only know how to expand i64 atomics");
-
-  SDValue Chain = Node->getOperand(0);
-  SDValue In1 = Node->getOperand(1);
-  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                             Node->getOperand(2), DAG.getIntPtrConstant(0));
-  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                             Node->getOperand(2), DAG.getIntPtrConstant(1));
-  SDValue Ops[] = { Chain, In1, In2L, In2H };
-  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
-  SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
-                            cast<MemSDNode>(Node)->getMemOperand());
-  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
-  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
-  Results.push_back(Result.getValue(2));
-}
-
  /// ReplaceNodeResults - Replace a node with an illegal result type
  /// with a new node built out of custom code.
  void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -15176,6 +16940,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      case Intrinsic::x86_rdtscp:
        return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                       Results);
+    case Intrinsic::x86_rdpmc:
+      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
      }
    }
    case ISD::READCYCLECOUNTER: {
@@ -15237,57 +17003,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      Results.push_back(EFLAGS.getValue(1));
      return;
    }
+  case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_OR:
-  case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_XOR:
-  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
-  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
-  case ISD::ATOMIC_SWAP: {
-    unsigned Opc;
-    switch (N->getOpcode()) {
-    default: llvm_unreachable("Unexpected opcode");
-    case ISD::ATOMIC_LOAD_ADD:
-      Opc = X86ISD::ATOMADD64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_AND:
-      Opc = X86ISD::ATOMAND64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_NAND:
-      Opc = X86ISD::ATOMNAND64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_OR:
-      Opc = X86ISD::ATOMOR64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_SUB:
-      Opc = X86ISD::ATOMSUB64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_XOR:
-      Opc = X86ISD::ATOMXOR64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_MAX:
-      Opc = X86ISD::ATOMMAX64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_MIN:
-      Opc = X86ISD::ATOMMIN64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_UMAX:
-      Opc = X86ISD::ATOMUMAX64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_UMIN:
-      Opc = X86ISD::ATOMUMIN64_DAG;
-      break;
-    case ISD::ATOMIC_SWAP:
-      Opc = X86ISD::ATOMSWAP64_DAG;
-      break;
-    }
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
-    return;
-  }
+  case ISD::ATOMIC_LOAD_UMAX:
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by X86AtomicExpandPass.cpp.
+    break;
    case ISD::ATOMIC_LOAD: {
      ReplaceATOMIC_LOAD(N, Results, DAG);
      return;
@@ -15308,6 +17037,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                     MVT::v2f64, N->getOperand(0));
      SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
  
+    if (ExperimentalVectorWideningLegalization) {
+      // If we are legalizing vectors by widening, we already have the desired
+      // legal vector type, just return it.
+      Results.push_back(ToVecInt);
+      return;
+    }
+
      SmallVector<SDValue, 8> Elts;
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
@@ -15340,6 +17076,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::CALL:               return "X86ISD::CALL";
    case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
    case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
+  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
    case X86ISD::BT:                 return "X86ISD::BT";
    case X86ISD::CMP:                return "X86ISD::CMP";
    case X86ISD::COMI:               return "X86ISD::COMI";
@@ -15394,12 +17131,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
    case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
-  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
-  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
-  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
-  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
-  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
-  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
    case X86ISD::VZEXT:              return "X86ISD::VZEXT";
@@ -15443,6 +17174,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PACKSS:             return "X86ISD::PACKSS";
    case X86ISD::PACKUS:             return "X86ISD::PACKUS";
    case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
+  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
    case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
    case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
    case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
@@ -15698,6 +17430,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isPSHUFDMask(M, SVT) ||
            isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -15758,712 +17491,33 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
    MachineFunction *MF = MBB->getParent();
    MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
    MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // thisMBB:
-  //  xbegin sinkMBB
-  //  # fallthrough to mainMBB
-  //  # abortion to sinkMBB
-  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
-  thisMBB->addSuccessor(mainMBB);
-  thisMBB->addSuccessor(sinkMBB);
-
-  // mainMBB:
-  //  EAX = -1
-  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
-  mainMBB->addSuccessor(sinkMBB);
-
-  // sinkMBB:
-  // EAX is live into the sinkMBB
-  sinkMBB->addLiveIn(X86::EAX);
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
-    .addReg(X86::EAX);
-
-  MI->eraseFromParent();
-  return sinkMBB;
-}
-
-// Get CMPXCHG opcode for the specified data type.
-static unsigned getCmpXChgOpcode(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::LCMPXCHG8;
-  case MVT::i16: return X86::LCMPXCHG16;
-  case MVT::i32: return X86::LCMPXCHG32;
-  case MVT::i64: return X86::LCMPXCHG64;
-  default:
-    break;
-  }
-  llvm_unreachable("Invalid operand size!");
-}
-
-// Get LOAD opcode for the specified data type.
-static unsigned getLoadOpcode(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::MOV8rm;
-  case MVT::i16: return X86::MOV16rm;
-  case MVT::i32: return X86::MOV32rm;
-  case MVT::i64: return X86::MOV64rm;
-  default:
-    break;
-  }
-  llvm_unreachable("Invalid operand size!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction.
-static unsigned getNonAtomicOpcode(unsigned Opc) {
-  switch (Opc) {
-  case X86::ATOMAND8:  return X86::AND8rr;
-  case X86::ATOMAND16: return X86::AND16rr;
-  case X86::ATOMAND32: return X86::AND32rr;
-  case X86::ATOMAND64: return X86::AND64rr;
-  case X86::ATOMOR8:   return X86::OR8rr;
-  case X86::ATOMOR16:  return X86::OR16rr;
-  case X86::ATOMOR32:  return X86::OR32rr;
-  case X86::ATOMOR64:  return X86::OR64rr;
-  case X86::ATOMXOR8:  return X86::XOR8rr;
-  case X86::ATOMXOR16: return X86::XOR16rr;
-  case X86::ATOMXOR32: return X86::XOR32rr;
-  case X86::ATOMXOR64: return X86::XOR64rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction with
-// extra opcode.
-static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
-                                               unsigned &ExtraOpc) {
-  switch (Opc) {
-  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
-  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
-  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
-  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
-  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
-  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
-  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
-  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
-  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
-  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
-  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
-  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
-  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
-  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
-  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
-  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
-  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
-  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
-  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
-  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target.
-static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
-  switch (Opc) {
-  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
-  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
-  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
-  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
-  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
-  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
-  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
-  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
-  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
-  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target with extra opcode.
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
-                                                   unsigned &HiOpc,
-                                                   unsigned &ExtraOpc) {
-  switch (Opc) {
-  case X86::ATOMNAND6432:
-    ExtraOpc = X86::NOT32r;
-    HiOpc = X86::AND32rr;
-    return X86::AND32rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get pseudo CMOV opcode from the specified data type.
-static unsigned getPseudoCMOVOpc(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::CMOV_GR8;
-  case MVT::i16: return X86::CMOV_GR16;
-  case MVT::i32: return X86::CMOV_GR32;
-  default:
-    break;
-  }
-  llvm_unreachable("Unknown CMOV opcode!");
-}
-
-// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
-// They will be translated into a spin-loop or compare-exchange loop from
-//
-//    ...
-//    dst = atomic-fetch-op MI.addr, MI.val
-//    ...
-//
-// to
-//
-//    ...
-//    t1 = LOAD MI.addr
-// loop:
-//    t4 = phi(t1, t3 / loop)
-//    t2 = OP MI.val, t4
-//    EAX = t4
-//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
-//    t3 = EAX
-//    JNE loop
-// sink:
-//    dst = t3
-//    ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
-                                       MachineBasicBlock *MBB) const {
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  const BasicBlock *BB = MBB->getBasicBlock();
-  MachineFunction::iterator I = MBB;
-  ++I;
-
-  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
-         "Unexpected number of operands");
-
-  assert(MI->hasOneMemOperand() &&
-         "Expected atomic-load-op to have one memoperand");
-
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
-  unsigned DstReg, SrcReg;
-  unsigned MemOpndSlot;
-
-  unsigned CurOp = 0;
-
-  DstReg = MI->getOperand(CurOp++).getReg();
-  MemOpndSlot = CurOp;
-  CurOp += X86::AddrNumOperands;
-  SrcReg = MI->getOperand(CurOp++).getReg();
-
-  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
-  MVT::SimpleValueType VT = *RC->vt_begin();
-  unsigned t1 = MRI.createVirtualRegister(RC);
-  unsigned t2 = MRI.createVirtualRegister(RC);
-  unsigned t3 = MRI.createVirtualRegister(RC);
-  unsigned t4 = MRI.createVirtualRegister(RC);
-  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
-
-  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
-  unsigned LOADOpc = getLoadOpcode(VT);
-
-  // For the atomic load-arith operator, we generate
-  //
-  //  thisMBB:
-  //    t1 = LOAD [MI.addr]
-  //  mainMBB:
-  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
-  //    t1 = OP MI.val, EAX
-  //    EAX = t4
-  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
-  //    t3 = EAX
-  //    JNE mainMBB
-  //  sinkMBB:
-  //    dst = t3
-
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-
-  MachineInstrBuilder MIB;
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // thisMBB:
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
-    unsigned flags = (*MMOI)->getFlags();
-    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
-                               (*MMOI)->getSize(),
-                               (*MMOI)->getBaseAlignment(),
-                               (*MMOI)->getTBAAInfo(),
-                               (*MMOI)->getRanges());
-    MIB.addMemOperand(MMO);
-  }
-
-  thisMBB->addSuccessor(mainMBB);
-
-  // mainMBB:
-  MachineBasicBlock *origMainMBB = mainMBB;
-
-  // Add a PHI.
-  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
-                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-
-  unsigned Opc = MI->getOpcode();
-  switch (Opc) {
-  default:
-    llvm_unreachable("Unhandled atomic-load-op opcode!");
-  case X86::ATOMAND8:
-  case X86::ATOMAND16:
-  case X86::ATOMAND32:
-  case X86::ATOMAND64:
-  case X86::ATOMOR8:
-  case X86::ATOMOR16:
-  case X86::ATOMOR32:
-  case X86::ATOMOR64:
-  case X86::ATOMXOR8:
-  case X86::ATOMXOR16:
-  case X86::ATOMXOR32:
-  case X86::ATOMXOR64: {
-    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
-    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
-      .addReg(t4);
-    break;
-  }
-  case X86::ATOMNAND8:
-  case X86::ATOMNAND16:
-  case X86::ATOMNAND32:
-  case X86::ATOMNAND64: {
-    unsigned Tmp = MRI.createVirtualRegister(RC);
-    unsigned NOTOpc;
-    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
-    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
-      .addReg(t4);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
-    break;
-  }
-  case X86::ATOMMAX8:
-  case X86::ATOMMAX16:
-  case X86::ATOMMAX32:
-  case X86::ATOMMAX64:
-  case X86::ATOMMIN8:
-  case X86::ATOMMIN16:
-  case X86::ATOMMIN32:
-  case X86::ATOMMIN64:
-  case X86::ATOMUMAX8:
-  case X86::ATOMUMAX16:
-  case X86::ATOMUMAX32:
-  case X86::ATOMUMAX64:
-  case X86::ATOMUMIN8:
-  case X86::ATOMUMIN16:
-  case X86::ATOMUMIN32:
-  case X86::ATOMUMIN64: {
-    unsigned CMPOpc;
-    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
-
-    BuildMI(mainMBB, DL, TII->get(CMPOpc))
-      .addReg(SrcReg)
-      .addReg(t4);
-
-    if (Subtarget->hasCMov()) {
-      if (VT != MVT::i8) {
-        // Native support
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
-          .addReg(SrcReg)
-          .addReg(t4);
-      } else {
-        // Promote i8 to i32 to use CMOV32
-        const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
-        const TargetRegisterClass *RC32 =
-          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
-        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
-        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
-        unsigned Tmp = MRI.createVirtualRegister(RC32);
-
-        unsigned Undef = MRI.createVirtualRegister(RC32);
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
-
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
-          .addReg(Undef)
-          .addReg(SrcReg)
-          .addImm(X86::sub_8bit);
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
-          .addReg(Undef)
-          .addReg(t4)
-          .addImm(X86::sub_8bit);
-
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
-          .addReg(SrcReg32)
-          .addReg(AccReg32);
-
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
-          .addReg(Tmp, 0, X86::sub_8bit);
-      }
-    } else {
-      // Use pseudo select and lower them.
-      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
-             "Invalid atomic-load-op transformation!");
-      unsigned SelOpc = getPseudoCMOVOpc(VT);
-      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
-      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
-      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
-              .addReg(SrcReg).addReg(t4)
-              .addImm(CC);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // Replace the original PHI node as mainMBB is changed after CMOV
-      // lowering.
-      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
-        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-      Phi->eraseFromParent();
-    }
-    break;
-  }
-  }
-
-  // Copy PhyReg back from virtual register.
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
-    .addReg(t4);
-
-  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  MIB.addReg(t2);
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-
-  // Copy PhyReg back to virtual register.
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
-    .addReg(PhyReg);
-
-  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
-  mainMBB->addSuccessor(origMainMBB);
-  mainMBB->addSuccessor(sinkMBB);
-
-  // sinkMBB:
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstReg)
-    .addReg(t3);
-
-  MI->eraseFromParent();
-  return sinkMBB;
-}
-
-// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
-// instructions. They will be translated into a spin-loop or compare-exchange
-// loop from
-//
-//    ...
-//    dst = atomic-fetch-op MI.addr, MI.val
-//    ...
-//
-// to
-//
-//    ...
-//    t1L = LOAD [MI.addr + 0]
-//    t1H = LOAD [MI.addr + 4]
-// loop:
-//    t4L = phi(t1L, t3L / loop)
-//    t4H = phi(t1H, t3H / loop)
-//    t2L = OP MI.val.lo, t4L
-//    t2H = OP MI.val.hi, t4H
-//    EAX = t4L
-//    EDX = t4H
-//    EBX = t2L
-//    ECX = t2H
-//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-//    t3L = EAX
-//    t3H = EDX
-//    JNE loop
-// sink:
-//    dstL = t3L
-//    dstH = t3H
-//    ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
-                                           MachineBasicBlock *MBB) const {
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  const BasicBlock *BB = MBB->getBasicBlock();
-  MachineFunction::iterator I = MBB;
-  ++I;
-
-  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
-         "Unexpected number of operands");
-
-  assert(MI->hasOneMemOperand() &&
-         "Expected atomic-load-op32 to have one memoperand");
-
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
-  unsigned DstLoReg, DstHiReg;
-  unsigned SrcLoReg, SrcHiReg;
-  unsigned MemOpndSlot;
-
-  unsigned CurOp = 0;
-
-  DstLoReg = MI->getOperand(CurOp++).getReg();
-  DstHiReg = MI->getOperand(CurOp++).getReg();
-  MemOpndSlot = CurOp;
-  CurOp += X86::AddrNumOperands;
-  SrcLoReg = MI->getOperand(CurOp++).getReg();
-  SrcHiReg = MI->getOperand(CurOp++).getReg();
-
-  const TargetRegisterClass *RC = &X86::GR32RegClass;
-  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
-
-  unsigned t1L = MRI.createVirtualRegister(RC);
-  unsigned t1H = MRI.createVirtualRegister(RC);
-  unsigned t2L = MRI.createVirtualRegister(RC);
-  unsigned t2H = MRI.createVirtualRegister(RC);
-  unsigned t3L = MRI.createVirtualRegister(RC);
-  unsigned t3H = MRI.createVirtualRegister(RC);
-  unsigned t4L = MRI.createVirtualRegister(RC);
-  unsigned t4H = MRI.createVirtualRegister(RC);
-
-  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
-  unsigned LOADOpc = X86::MOV32rm;
-
-  // For the atomic load-arith operator, we generate
-  //
-  //  thisMBB:
-  //    t1L = LOAD [MI.addr + 0]
-  //    t1H = LOAD [MI.addr + 4]
-  //  mainMBB:
-  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
-  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
-  //    t2L = OP MI.val.lo, t4L
-  //    t2H = OP MI.val.hi, t4H
-  //    EBX = t2L
-  //    ECX = t2H
-  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-  //    t3L = EAX
-  //    t3H = EDX
-  //    JNE loop
-  //  sinkMBB:
-  //    dstL = t3L
-  //    dstH = t3H
-
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-
-  MachineInstrBuilder MIB;
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // thisMBB:
-  // Lo
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
-    unsigned flags = (*MMOI)->getFlags();
-    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
-                               (*MMOI)->getSize(),
-                               (*MMOI)->getBaseAlignment(),
-                               (*MMOI)->getTBAAInfo(),
-                               (*MMOI)->getRanges());
-    MIB.addMemOperand(MMO);
-  };
-  MachineInstr *LowMI = MIB;
-
-  // Hi
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp) {
-      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
-    } else {
-      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-      if (NewMO.isReg())
-        NewMO.setIsKill(false);
-      MIB.addOperand(NewMO);
-    }
-  }
-  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
-
-  thisMBB->addSuccessor(mainMBB);
-
-  // mainMBB:
-  MachineBasicBlock *origMainMBB = mainMBB;
-
-  // Add PHIs.
-  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
-                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
-  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
-                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-
-  unsigned Opc = MI->getOpcode();
-  switch (Opc) {
-  default:
-    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
-  case X86::ATOMAND6432:
-  case X86::ATOMOR6432:
-  case X86::ATOMXOR6432:
-  case X86::ATOMADD6432:
-  case X86::ATOMSUB6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
-      .addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
-      .addReg(SrcHiReg);
-    break;
-  }
-  case X86::ATOMNAND6432: {
-    unsigned HiOpc, NOTOpc;
-    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
-    unsigned TmpL = MRI.createVirtualRegister(RC);
-    unsigned TmpH = MRI.createVirtualRegister(RC);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
-      .addReg(t4L);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
-      .addReg(t4H);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
-    break;
-  }
-  case X86::ATOMMAX6432:
-  case X86::ATOMMIN6432:
-  case X86::ATOMUMAX6432:
-  case X86::ATOMUMIN6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    unsigned cL = MRI.createVirtualRegister(RC8);
-    unsigned cH = MRI.createVirtualRegister(RC8);
-    unsigned cL32 = MRI.createVirtualRegister(RC);
-    unsigned cH32 = MRI.createVirtualRegister(RC);
-    unsigned cc = MRI.createVirtualRegister(RC);
-    // cl := cmp src_lo, lo
-    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcLoReg).addReg(t4L);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
-    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
-    // ch := cmp src_hi, hi
-    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcHiReg).addReg(t4H);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
-    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
-    // cc := if (src_hi == hi) ? cl : ch;
-    if (Subtarget->hasCMov()) {
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
-        .addReg(cH32).addReg(cL32);
-    } else {
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
-              .addReg(cH32).addReg(cL32)
-              .addImm(X86::COND_E);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-    }
-    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
-    if (Subtarget->hasCMov()) {
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
-        .addReg(SrcLoReg).addReg(t4L);
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
-        .addReg(SrcHiReg).addReg(t4H);
-    } else {
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
-              .addReg(SrcLoReg).addReg(t4L)
-              .addImm(X86::COND_NE);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
-      // 2nd CMOV lowering.
-      mainMBB->addLiveIn(X86::EFLAGS);
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
-              .addReg(SrcHiReg).addReg(t4H)
-              .addImm(X86::COND_NE);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // Replace the original PHI node as mainMBB is changed after CMOV
-      // lowering.
-      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
-        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
-      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
-        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-      PhiL->eraseFromParent();
-      PhiH->eraseFromParent();
-    }
-    break;
-  }
-  case X86::ATOMSWAP6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
-    break;
-  }
-  }
-
-  // Copy EDX:EAX back from HiReg:LoReg
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
-  // Copy ECX:EBX from t1H:t1L
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
-
-  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  MIB.setMemRefs(MMOBegin, MMOEnd);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
  
-  // Copy EDX:EAX back to t3H:t3L
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
  
-  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+  // thisMBB:
+  //  xbegin sinkMBB
+  //  # fallthrough to mainMBB
+  //  # abortion to sinkMBB
+  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(sinkMBB);
  
-  mainMBB->addSuccessor(origMainMBB);
+  // mainMBB:
+  //  EAX = -1
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
    mainMBB->addSuccessor(sinkMBB);
  
    // sinkMBB:
+  // EAX is live into the sinkMBB
+  sinkMBB->addLiveIn(X86::EAX);
    BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstLoReg)
-    .addReg(t3L);
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstHiReg)
-    .addReg(t3H);
+          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::EAX);
  
    MI->eraseFromParent();
    return sinkMBB;
@@ -16583,1423 +17637,1947 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
    // 8  ) Align         : Alignment of type
    // 9  ) EFLAGS (implicit-def)
  
-  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
-  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+
+  unsigned DestReg = MI->getOperand(0).getReg();
+  MachineOperand &Base = MI->getOperand(1);
+  MachineOperand &Scale = MI->getOperand(2);
+  MachineOperand &Index = MI->getOperand(3);
+  MachineOperand &Disp = MI->getOperand(4);
+  MachineOperand &Segment = MI->getOperand(5);
+  unsigned ArgSize = MI->getOperand(6).getImm();
+  unsigned ArgMode = MI->getOperand(7).getImm();
+  unsigned Align = MI->getOperand(8).getImm();
+
+  // Memory Reference
+  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  // Machine Information
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // struct va_list {
+  //   i32   gp_offset
+  //   i32   fp_offset
+  //   i64   overflow_area (address)
+  //   i64   reg_save_area (address)
+  // }
+  // sizeof(va_list) = 24
+  // alignment(va_list) = 8
+
+  unsigned TotalNumIntRegs = 6;
+  unsigned TotalNumXMMRegs = 8;
+  bool UseGPOffset = (ArgMode == 1);
+  bool UseFPOffset = (ArgMode == 2);
+  unsigned MaxOffset = TotalNumIntRegs * 8 +
+                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+  /* Align ArgSize to a multiple of 8 */
+  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+  bool NeedsAlign = (Align > 8);
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *overflowMBB;
+  MachineBasicBlock *offsetMBB;
+  MachineBasicBlock *endMBB;
+
+  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
+  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
+  unsigned OffsetReg = 0;
+
+  if (!UseGPOffset && !UseFPOffset) {
+    // If we only pull from the overflow region, we don't create a branch.
+    // We don't need to alter control flow.
+    OffsetDestReg = 0; // unused
+    OverflowDestReg = DestReg;
+
+    offsetMBB = nullptr;
+    overflowMBB = thisMBB;
+    endMBB = thisMBB;
+  } else {
+    // First emit code to check if gp_offset (or fp_offset) is below the bound.
+    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+    // If not, pull from overflow_area. (branch to overflowMBB)
+    //
+    //       thisMBB
+    //         |     .
+    //         |        .
+    //     offsetMBB   overflowMBB
+    //         |        .
+    //         |     .
+    //        endMBB
+
+    // Registers for the PHI in endMBB
+    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+    MachineFunction *MF = MBB->getParent();
+    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+    MachineFunction::iterator MBBIter = MBB;
+    ++MBBIter;
+
+    // Insert the new basic blocks
+    MF->insert(MBBIter, offsetMBB);
+    MF->insert(MBBIter, overflowMBB);
+    MF->insert(MBBIter, endMBB);
+
+    // Transfer the remainder of MBB and its successor edges to endMBB.
+    endMBB->splice(endMBB->begin(), thisMBB,
+                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+    // Make offsetMBB and overflowMBB successors of thisMBB
+    thisMBB->addSuccessor(offsetMBB);
+    thisMBB->addSuccessor(overflowMBB);
+
+    // endMBB is a successor of both offsetMBB and overflowMBB
+    offsetMBB->addSuccessor(endMBB);
+    overflowMBB->addSuccessor(endMBB);
+
+    // Load the offset value into a register
+    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Check if there is enough room left to pull this argument.
+    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+      .addReg(OffsetReg)
+      .addImm(MaxOffset + 8 - ArgSizeA8);
+
+    // Branch to "overflowMBB" if offset >= max
+    // Fall through to "offsetMBB" otherwise
+    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+      .addMBB(overflowMBB);
+  }
+
+  // In offsetMBB, emit code to use the reg_save_area.
+  if (offsetMBB) {
+    assert(OffsetReg != 0);
+
+    // Read the reg_save_area address.
+    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, 16)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Zero-extend the offset
+    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+        .addImm(0)
+        .addReg(OffsetReg)
+        .addImm(X86::sub_32bit);
+
+    // Add the offset to the reg_save_area to get the final address.
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+      .addReg(OffsetReg64)
+      .addReg(RegSaveReg);
+
+    // Compute the offset for the next argument
+    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+      .addReg(OffsetReg)
+      .addImm(UseFPOffset ? 16 : 8);
+
+    // Store it back into the va_list.
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .addReg(NextOffsetReg)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Jump to endMBB
+    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+      .addMBB(endMBB);
+  }
+
+  //
+  // Emit code to use overflow area
+  //
+
+  // Load the overflow_area address into a register.
+  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we need to align it, do so. Otherwise, just copy the address
+  // to OverflowDestReg.
+  if (NeedsAlign) {
+    // Align the overflow address
+    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+    // aligned_addr = (addr + (align-1)) & ~(align-1)
+    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+      .addReg(OverflowAddrReg)
+      .addImm(Align-1);
+
+    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+      .addReg(TmpReg)
+      .addImm(~(uint64_t)(Align-1));
+  } else {
+    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+      .addReg(OverflowAddrReg);
+  }
+
+  // Compute the next overflow address after this argument.
+  // (the overflow address should be kept 8-byte aligned)
+  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+    .addReg(OverflowDestReg)
+    .addImm(ArgSizeA8);
+
+  // Store the new overflow address.
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .addReg(NextAddrReg)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we branched, emit the PHI to the front of endMBB.
+  if (offsetMBB) {
+    BuildMI(*endMBB, endMBB->begin(), DL,
+            TII->get(X86::PHI), DestReg)
+      .addReg(OffsetDestReg).addMBB(offsetMBB)
+      .addReg(OverflowDestReg).addMBB(overflowMBB);
+  }
+
+  // Erase the pseudo instruction
+  MI->eraseFromParent();
+
+  return endMBB;
+}
  
-  unsigned DestReg = MI->getOperand(0).getReg();
-  MachineOperand &Base = MI->getOperand(1);
-  MachineOperand &Scale = MI->getOperand(2);
-  MachineOperand &Index = MI->getOperand(3);
-  MachineOperand &Disp = MI->getOperand(4);
-  MachineOperand &Segment = MI->getOperand(5);
-  unsigned ArgSize = MI->getOperand(6).getImm();
-  unsigned ArgMode = MI->getOperand(7).getImm();
-  unsigned Align = MI->getOperand(8).getImm();
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                 MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
  
-  // Memory Reference
-  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
  
-  // Machine Information
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
-  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
-  // struct va_list {
-  //   i32   gp_offset
-  //   i32   fp_offset
-  //   i64   overflow_area (address)
-  //   i64   reg_save_area (address)
-  // }
-  // sizeof(va_list) = 24
-  // alignment(va_list) = 8
+  unsigned CountReg = MI->getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
  
-  unsigned TotalNumIntRegs = 6;
-  unsigned TotalNumXMMRegs = 8;
-  bool UseGPOffset = (ArgMode == 1);
-  bool UseFPOffset = (ArgMode == 2);
-  unsigned MaxOffset = TotalNumIntRegs * 8 +
-                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+  if (!Subtarget->isTargetWin64()) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
  
-  /* Align ArgSize to a multiple of 8 */
-  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
-  bool NeedsAlign = (Align > 8);
+  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+  // that was just emitted, but clearly shouldn't be "saved".
+  assert((MI->getNumOperands() <= 3 ||
+          !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
+          MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
+         && "Expected last argument to be EFLAGS");
+  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  // In the XMM save block, save all the XMM argument registers.
+  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    MachineMemOperand *MMO =
+      F->getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+        MachineMemOperand::MOStore,
+        /*Size=*/16, /*Align=*/16);
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+      .addFrameIndex(RegSaveFrameIndex)
+      .addImm(/*Scale=*/1)
+      .addReg(/*IndexReg=*/0)
+      .addImm(/*Disp=*/Offset)
+      .addReg(/*Segment=*/0)
+      .addReg(MI->getOperand(i).getReg())
+      .addMemOperand(MMO);
+  }
  
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *overflowMBB;
-  MachineBasicBlock *offsetMBB;
-  MachineBasicBlock *endMBB;
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
  
-  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
-  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
-  unsigned OffsetReg = 0;
+  return EndMBB;
+}
  
-  if (!UseGPOffset && !UseFPOffset) {
-    // If we only pull from the overflow region, we don't create a branch.
-    // We don't need to alter control flow.
-    OffsetDestReg = 0; // unused
-    OverflowDestReg = DestReg;
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(std::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
  
-    offsetMBB = nullptr;
-    overflowMBB = thisMBB;
-    endMBB = thisMBB;
-  } else {
-    // First emit code to check if gp_offset (or fp_offset) is below the bound.
-    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
-    // If not, pull from overflow_area. (branch to overflowMBB)
-    //
-    //       thisMBB
-    //         |     .
-    //         |        .
-    //     offsetMBB   overflowMBB
-    //         |        .
-    //         |     .
-    //        endMBB
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
+    }
+  }
  
-    // Registers for the PHI in endMBB
-    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
-    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
  
-    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-    MachineFunction *MF = MBB->getParent();
-    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
  
-    MachineFunction::iterator MBBIter = MBB;
-    ++MBBIter;
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
  
-    // Insert the new basic blocks
-    MF->insert(MBBIter, offsetMBB);
-    MF->insert(MBBIter, overflowMBB);
-    MF->insert(MBBIter, endMBB);
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
  
-    // Transfer the remainder of MBB and its successor edges to endMBB.
-    endMBB->splice(endMBB->begin(), thisMBB,
-                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
-    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+  // If the EFLAGS register isn't dead in the terminator, then claim that it's
+  // live into the sink and copy blocks.
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
  
-    // Make offsetMBB and overflowMBB successors of thisMBB
-    thisMBB->addSuccessor(offsetMBB);
-    thisMBB->addSuccessor(overflowMBB);
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
  
-    // endMBB is a successor of both offsetMBB and overflowMBB
-    offsetMBB->addSuccessor(endMBB);
-    overflowMBB->addSuccessor(endMBB);
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
  
-    // Load the offset value into a register
-    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
-    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
-      .addOperand(Base)
-      .addOperand(Scale)
-      .addOperand(Index)
-      .addDisp(Disp, UseFPOffset ? 4 : 0)
-      .addOperand(Segment)
-      .setMemRefs(MMOBegin, MMOEnd);
+  // Create the conditional branch instruction.
+  unsigned Opc =
+    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
  
-    // Check if there is enough room left to pull this argument.
-    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
-      .addReg(OffsetReg)
-      .addImm(MaxOffset + 8 - ArgSizeA8);
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  copy0MBB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
  
-    // Branch to "overflowMBB" if offset >= max
-    // Fall through to "offsetMBB" otherwise
-    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
-      .addMBB(overflowMBB);
-  }
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return sinkMBB;
+}
  
-  // In offsetMBB, emit code to use the reg_save_area.
-  if (offsetMBB) {
-    assert(OffsetReg != 0);
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+                                        bool Is64Bit) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  
-    // Read the reg_save_area address.
-    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
-    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
-      .addOperand(Base)
-      .addOperand(Scale)
-      .addOperand(Index)
-      .addDisp(Disp, 16)
-      .addOperand(Segment)
-      .setMemRefs(MMOBegin, MMOEnd);
+  assert(MF->shouldSplitStack());
  
-    // Zero-extend the offset
-    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
-      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
-        .addImm(0)
-        .addReg(OffsetReg)
-        .addImm(X86::sub_32bit);
+  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
  
-    // Add the offset to the reg_save_area to get the final address.
-    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
-      .addReg(OffsetReg64)
-      .addReg(RegSaveReg);
+  // BB:
+  //  ... [Till the alloca]
+  // If stacklet is not large enough, jump to mallocMBB
+  //
+  // bumpMBB:
+  //  Allocate by subtracting from RSP
+  //  Jump to continueMBB
+  //
+  // mallocMBB:
+  //  Allocate by call to runtime
+  //
+  // continueMBB:
+  //  ...
+  //  [rest of original BB]
+  //
  
-    // Compute the offset for the next argument
-    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
-    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
-      .addReg(OffsetReg)
-      .addImm(UseFPOffset ? 16 : 8);
+  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  
-    // Store it back into the va_list.
-    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
-      .addOperand(Base)
-      .addOperand(Scale)
-      .addOperand(Index)
-      .addDisp(Disp, UseFPOffset ? 4 : 0)
-      .addOperand(Segment)
-      .addReg(NextOffsetReg)
-      .setMemRefs(MMOBegin, MMOEnd);
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *AddrRegClass =
+    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
  
-    // Jump to endMBB
-    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
-      .addMBB(endMBB);
-  }
+  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+    sizeVReg = MI->getOperand(1).getReg(),
+    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
  
-  //
-  // Emit code to use overflow area
-  //
+  MachineFunction::iterator MBBIter = BB;
+  ++MBBIter;
  
-  // Load the overflow_area address into a register.
-  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
-  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
-    .addOperand(Base)
-    .addOperand(Scale)
-    .addOperand(Index)
-    .addDisp(Disp, 8)
-    .addOperand(Segment)
-    .setMemRefs(MMOBegin, MMOEnd);
+  MF->insert(MBBIter, bumpMBB);
+  MF->insert(MBBIter, mallocMBB);
+  MF->insert(MBBIter, continueMBB);
  
-  // If we need to align it, do so. Otherwise, just copy the address
-  // to OverflowDestReg.
-  if (NeedsAlign) {
-    // Align the overflow address
-    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
-    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+  continueMBB->splice(continueMBB->begin(), BB,
+                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
  
-    // aligned_addr = (addr + (align-1)) & ~(align-1)
-    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
-      .addReg(OverflowAddrReg)
-      .addImm(Align-1);
+  // Add code to the main basic block to check if the stack limit has been hit,
+  // and if so, jump to mallocMBB otherwise to bumpMBB.
+  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+    .addReg(tmpSPVReg).addReg(sizeVReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(SPLimitVReg);
+  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
  
-    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
-      .addReg(TmpReg)
-      .addImm(~(uint64_t)(Align-1));
+  // bumpMBB simply decreases the stack pointer, since we know the current
+  // stacklet has enough space.
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Calls into a routine in libgcc to allocate more space from the heap.
+  const uint32_t *RegMask = MF->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
+  if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::RDI, RegState::Implicit)
+      .addReg(X86::RAX, RegState::ImplicitDefine);
    } else {
-    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
-      .addReg(OverflowAddrReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+      .addImm(12);
+    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
    }
  
-  // Compute the next overflow address after this argument.
-  // (the overflow address should be kept 8-byte aligned)
-  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
-  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
-    .addReg(OverflowDestReg)
-    .addImm(ArgSizeA8);
+  if (!Is64Bit)
+    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+      .addImm(16);
  
-  // Store the new overflow address.
-  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
-    .addOperand(Base)
-    .addOperand(Scale)
-    .addOperand(Index)
-    .addDisp(Disp, 8)
-    .addOperand(Segment)
-    .addReg(NextAddrReg)
-    .setMemRefs(MMOBegin, MMOEnd);
+  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+    .addReg(Is64Bit ? X86::RAX : X86::EAX);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
  
-  // If we branched, emit the PHI to the front of endMBB.
-  if (offsetMBB) {
-    BuildMI(*endMBB, endMBB->begin(), DL,
-            TII->get(X86::PHI), DestReg)
-      .addReg(OffsetDestReg).addMBB(offsetMBB)
-      .addReg(OverflowDestReg).addMBB(overflowMBB);
-  }
+  // Set up the CFG correctly.
+  BB->addSuccessor(bumpMBB);
+  BB->addSuccessor(mallocMBB);
+  mallocMBB->addSuccessor(continueMBB);
+  bumpMBB->addSuccessor(continueMBB);
  
-  // Erase the pseudo instruction
+  // Take care of the PHI nodes.
+  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(mallocPtrVReg).addMBB(mallocMBB)
+    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+  // Delete the original pseudo instruction.
    MI->eraseFromParent();
  
-  return endMBB;
+  // And we're done.
+  return continueMBB;
  }
  
  MachineBasicBlock *
-X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
-                                                 MachineInstr *MI,
-                                                 MachineBasicBlock *MBB) const {
-  // Emit code to save XMM registers to the stack. The ABI says that the
-  // number of registers to save is given in %al, so it's theoretically
-  // possible to do an indirect jump trick to avoid saving all of them,
-  // however this code takes a simpler approach and just executes all
-  // of the stores if %al is non-zero. It's less code, and it's probably
-  // easier on the hardware branch predictor, and stores aren't all that
-  // expensive anyway.
-
-  // Create the new basic blocks. One block contains all the XMM stores,
-  // and one block is the final destination regardless of whether any
-  // stores were performed.
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction *F = MBB->getParent();
-  MachineFunction::iterator MBBIter = MBB;
-  ++MBBIter;
-  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(MBBIter, XMMSaveMBB);
-  F->insert(MBBIter, EndMBB);
-
-  // Transfer the remainder of MBB and its successor edges to EndMBB.
-  EndMBB->splice(EndMBB->begin(), MBB,
-                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // The original block will now fall through to the XMM save block.
-  MBB->addSuccessor(XMMSaveMBB);
-  // The XMMSaveMBB will fall through to the end block.
-  XMMSaveMBB->addSuccessor(EndMBB);
-
-  // Now add the instructions.
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
-  unsigned CountReg = MI->getOperand(0).getReg();
-  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
-  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+  assert(!Subtarget->isTargetMacho());
  
-  if (!Subtarget->isTargetWin64()) {
-    // If %al is 0, branch around the XMM save block.
-    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
-    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
-    MBB->addSuccessor(EndMBB);
-  }
+  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
+  // non-trivial part is impdef of ESP.
+
+  if (Subtarget->isTargetWin64()) {
+    if (Subtarget->isTargetCygMing()) {
+      // ___chkstk(Mingw64):
+      // Clobbers R10, R11, RAX and EFLAGS.
+      // Updates RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("___chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::RSP, RegState::Implicit)
+        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
+        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+    } else {
+      // __chkstk(MSVCRT): does not update stack pointer.
+      // Clobbers R10, R11 and EFLAGS.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("__chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+      // RAX has the offset to be subtracted from RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+    }
+  } else {
+    const char *StackProbeSymbol =
+      Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
  
-  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
-  // that was just emitted, but clearly shouldn't be "saved".
-  assert((MI->getNumOperands() <= 3 ||
-          !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
-          MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
-         && "Expected last argument to be EFLAGS");
-  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
-  // In the XMM save block, save all the XMM argument registers.
-  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
-    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
-    MachineMemOperand *MMO =
-      F->getMachineMemOperand(
-          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
-        MachineMemOperand::MOStore,
-        /*Size=*/16, /*Align=*/16);
-    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
-      .addFrameIndex(RegSaveFrameIndex)
-      .addImm(/*Scale=*/1)
-      .addReg(/*IndexReg=*/0)
-      .addImm(/*Disp=*/Offset)
-      .addReg(/*Segment=*/0)
-      .addReg(MI->getOperand(i).getReg())
-      .addMemOperand(MMO);
+    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol(StackProbeSymbol)
+      .addReg(X86::EAX, RegState::Implicit)
+      .addReg(X86::ESP, RegState::Implicit)
+      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    }
  
    MI->eraseFromParent();   // The pseudo instruction is gone now.
-
-  return EndMBB;
+  return BB;
  }
  
-// The EFLAGS operand of SelectItr might be missing a kill marker
-// because there were multiple uses of EFLAGS, and ISel didn't know
-// which to mark. Figure out whether SelectItr should have had a
-// kill marker, and set it if it should. Returns the correct kill
-// marker value.
-static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
-                                     MachineBasicBlock* BB,
-                                     const TargetRegisterInfo* TRI) {
-  // Scan forward through BB for a use/def of EFLAGS.
-  MachineBasicBlock::iterator miI(std::next(SelectItr));
-  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
-    const MachineInstr& mi = *miI;
-    if (mi.readsRegister(X86::EFLAGS))
-      return false;
-    if (mi.definesRegister(X86::EFLAGS))
-      break; // Should have kill-flag - update below.
-  }
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const {
+  // This is pretty easy.  We're taking the value that we received from
+  // our load from the relocation, sticking it in either RDI (x86-64)
+  // or EAX and doing an indirect call.  The return value will then
+  // be in the normal return register.
+  MachineFunction *F = BB->getParent();
+  const X86InstrInfo *TII =
+      static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+  DebugLoc DL = MI->getDebugLoc();
  
-  // If we hit the end of the block, check whether EFLAGS is live into a
-  // successor.
-  if (miI == BB->end()) {
-    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
-                                          sEnd = BB->succ_end();
-         sItr != sEnd; ++sItr) {
-      MachineBasicBlock* succ = *sItr;
-      if (succ->isLiveIn(X86::EFLAGS))
-        return false;
-    }
+  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+  assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *RegMask = F->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
+  if (Subtarget->is64Bit()) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV64rm), X86::RDI)
+    .addReg(X86::RIP)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI);
+    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(0)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(TII->getGlobalBaseReg(F))
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
    }
  
-  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
-  // out. SelectMI should have a kill flag on EFLAGS.
-  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
-  return true;
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
  }
  
  MachineBasicBlock *
-X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
-                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
  
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It;
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
  
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   cmpTY ccX, r1, r2
-  //   bCC copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineBasicBlock *thisMBB = BB;
-  MachineFunction *F = BB->getParent();
-  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, copy0MBB);
-  F->insert(It, sinkMBB);
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
  
-  // If the EFLAGS register isn't dead in the terminator, then claim that it's
-  // live into the sink and copy blocks.
-  const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
-  if (!MI->killsRegister(X86::EFLAGS) &&
-      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
-    copy0MBB->addLiveIn(X86::EFLAGS);
-    sinkMBB->addLiveIn(X86::EFLAGS);
-  }
+  unsigned DstReg;
+  unsigned MemOpndSlot = 0;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MemOpndSlot = CurOp;
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[LabelOffset] = restoreMBB
+  //  SjLjSetup restoreMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+  // restoreMBB:
+  //  v_restore = 1
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+  MF->push_back(restoreMBB);
+
+  MachineInstrBuilder MIB;
  
    // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
  
-  // Add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
+  // thisMBB:
+  unsigned PtrStoreOpc = 0;
+  unsigned LabelReg = 0;
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  Reloc::Model RM = MF->getTarget().getRelocationModel();
+  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+  // Prepare IP either in reg or imm.
+  if (!UseImmLabel) {
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+    LabelReg = MRI.createVirtualRegister(PtrRC);
+    if (Subtarget->is64Bit()) {
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB)
+              .addReg(0);
+    } else {
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+              .addReg(XII->getGlobalBaseReg(MF))
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+              .addReg(0);
+    }
+  } else
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  // Store IP
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  }
+  if (!UseImmLabel)
+    MIB.addReg(LabelReg);
+  else
+    MIB.addMBB(restoreMBB);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+          .addMBB(restoreMBB);
  
-  // Create the conditional branch instruction.
-  unsigned Opc =
-    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
-  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
+  MIB.addRegMask(RegInfo->getNoPreservedMask());
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
  
-  //  copy0MBB:
-  //   %FalseValue = ...
-  //   # fallthrough to sinkMBB
-  copy0MBB->addSuccessor(sinkMBB);
+  // mainMBB:
+  //  EAX = 0
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+  mainMBB->addSuccessor(sinkMBB);
  
-  //  sinkMBB:
-  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
-  //  ...
+  // sinkMBB:
    BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(X86::PHI), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
-    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+          TII->get(X86::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(restoreMBB);
  
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  // restoreMBB:
+  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  MI->eraseFromParent();
    return sinkMBB;
  }
  
  MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
-                                        bool Is64Bit) const {
-  MachineFunction *MF = BB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-
-  assert(MF->shouldSplitStack());
-
-  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
-  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
-
-  // BB:
-  //  ... [Till the alloca]
-  // If stacklet is not large enough, jump to mallocMBB
-  //
-  // bumpMBB:
-  //  Allocate by subtracting from RSP
-  //  Jump to continueMBB
-  //
-  // mallocMBB:
-  //  Allocate by call to runtime
-  //
-  // continueMBB:
-  //  ...
-  //  [rest of original BB]
-  //
-
-  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
-  const TargetRegisterClass *AddrRegClass =
-    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
  
-  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
-    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
-    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
-    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
-    sizeVReg = MI->getOperand(1).getReg(),
-    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
  
-  MachineFunction::iterator MBBIter = BB;
-  ++MBBIter;
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
  
-  MF->insert(MBBIter, bumpMBB);
-  MF->insert(MBBIter, mallocMBB);
-  MF->insert(MBBIter, continueMBB);
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
+  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+  unsigned SP = RegInfo->getStackRegister();
  
-  continueMBB->splice(continueMBB->begin(), BB,
-                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+  MachineInstrBuilder MIB;
  
-  // Add code to the main basic block to check if the stack limit has been hit,
-  // and if so, jump to mallocMBB otherwise to bumpMBB.
-  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
-  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
-    .addReg(tmpSPVReg).addReg(sizeVReg);
-  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
-    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
-    .addReg(SPLimitVReg);
-  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset = 2 * PVT.getStoreSize();
  
-  // bumpMBB simply decreases the stack pointer, since we know the current
-  // stacklet has enough space.
-  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
-    .addReg(SPLimitVReg);
-  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
-    .addReg(SPLimitVReg);
-  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
  
-  // Calls into a routine in libgcc to allocate more space from the heap.
-  const uint32_t *RegMask =
-    MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
-  if (Is64Bit) {
-    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
-      .addReg(sizeVReg);
-    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::RDI, RegState::Implicit)
-      .addReg(X86::RAX, RegState::ImplicitDefine);
-  } else {
-    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
-      .addImm(12);
-    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
-    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::EAX, RegState::ImplicitDefine);
+  // Reload FP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload IP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
    }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload SP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), SPOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Jump
+  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
  
-  if (!Is64Bit)
-    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
-      .addImm(16);
-
-  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
-    .addReg(Is64Bit ? X86::RAX : X86::EAX);
-  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+  MI->eraseFromParent();
+  return MBB;
+}
  
-  // Set up the CFG correctly.
-  BB->addSuccessor(bumpMBB);
-  BB->addSuccessor(mallocMBB);
-  mallocMBB->addSuccessor(continueMBB);
-  bumpMBB->addSuccessor(continueMBB);
+// Replace 213-type (isel default) FMA3 instructions with 231-type for
+// accumulator loops. Writing back to the accumulator allows the coalescer
+// to remove extra copies in the loop.   
+MachineBasicBlock *
+X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) const {
+  MachineOperand &AddendOp = MI->getOperand(3);
  
-  // Take care of the PHI nodes.
-  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
-          MI->getOperand(0).getReg())
-    .addReg(mallocPtrVReg).addMBB(mallocMBB)
-    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+  // Bail out early if the addend isn't a register - we can't switch these.
+  if (!AddendOp.isReg())
+    return MBB;
  
-  // Delete the original pseudo instruction.
-  MI->eraseFromParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
  
-  // And we're done.
-  return continueMBB;
-}
+  // Check whether the addend is defined by a PHI:
+  assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
+  MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
+  if (!AddendDef.isPHI())
+    return MBB;
  
-MachineBasicBlock *
-X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
-                                        MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
+  // Look for the following pattern:
+  // loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
  
-  assert(!Subtarget->isTargetMacho());
+  // Replace with:
+  //   loop:
+  //   %addend = phi [%entry, 0], [%loop, %result]
+  //   ...
+  //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
  
-  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
-  // non-trivial part is impdef of ESP.
+  for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
+    assert(AddendDef.getOperand(i).isReg());
+    MachineOperand PHISrcOp = AddendDef.getOperand(i);
+    MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
+    if (&PHISrcInst == MI) {
+      // Found a matching instruction.
+      unsigned NewFMAOpc = 0;
+      switch (MI->getOpcode()) {
+        case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
+        case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
+        case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
+        case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
+        case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
+        case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
+        case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
+        case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
+        case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
+        case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
+        case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
+        case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
+        case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
+        case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
+        case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
+        case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+        case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
+        case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
+        case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
+        case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
+        case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
+        case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
+        case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
+        case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+        default: llvm_unreachable("Unrecognized FMA variant.");
+      }
  
-  if (Subtarget->isTargetWin64()) {
-    if (Subtarget->isTargetCygMing()) {
-      // ___chkstk(Mingw64):
-      // Clobbers R10, R11, RAX and EFLAGS.
-      // Updates RSP.
-      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
-        .addExternalSymbol("___chkstk")
-        .addReg(X86::RAX, RegState::Implicit)
-        .addReg(X86::RSP, RegState::Implicit)
-        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
-        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
-        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-    } else {
-      // __chkstk(MSVCRT): does not update stack pointer.
-      // Clobbers R10, R11 and EFLAGS.
-      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
-        .addExternalSymbol("__chkstk")
-        .addReg(X86::RAX, RegState::Implicit)
-        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-      // RAX has the offset to be subtracted from RSP.
-      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
-        .addReg(X86::RSP)
-        .addReg(X86::RAX);
+      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+      MachineInstrBuilder MIB =
+        BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
+        .addOperand(MI->getOperand(0))
+        .addOperand(MI->getOperand(3))
+        .addOperand(MI->getOperand(2))
+        .addOperand(MI->getOperand(1));
+      MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+      MI->eraseFromParent();
      }
-  } else {
-    const char *StackProbeSymbol =
-      Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
-
-    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol(StackProbeSymbol)
-      .addReg(X86::EAX, RegState::Implicit)
-      .addReg(X86::ESP, RegState::Implicit)
-      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
-      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
-      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    }
  
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
+  return MBB;
  }
  
  MachineBasicBlock *
-X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
-                                      MachineBasicBlock *BB) const {
-  // This is pretty easy.  We're taking the value that we received from
-  // our load from the relocation, sticking it in either RDI (x86-64)
-  // or EAX and doing an indirect call.  The return value will then
-  // be in the normal return register.
-  MachineFunction *F = BB->getParent();
-  const X86InstrInfo *TII
-    = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
-  DebugLoc DL = MI->getDebugLoc();
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unexpected instr type to insert");
+  case X86::TAILJMPd64:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+    llvm_unreachable("TAILJMP64 would not be touched here.");
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    return BB;
+  case X86::WIN_ALLOCA:
+    return EmitLoweredWinAlloca(MI, BB);
+  case X86::SEG_ALLOCA_32:
+    return EmitLoweredSegAlloca(MI, BB, false);
+  case X86::SEG_ALLOCA_64:
+    return EmitLoweredSegAlloca(MI, BB, true);
+  case X86::TLSCall_32:
+  case X86::TLSCall_64:
+    return EmitLoweredTLSCall(MI, BB);
+  case X86::CMOV_GR8:
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+    return EmitLoweredSelect(MI, BB);
  
-  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
-  assert(MI->getOperand(3).isGlobal() && "This should be a global");
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM:
+  case X86::FP80_TO_INT16_IN_MEM:
+  case X86::FP80_TO_INT32_IN_MEM:
+  case X86::FP80_TO_INT64_IN_MEM: {
+    MachineFunction *F = BB->getParent();
+    const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+    DebugLoc DL = MI->getDebugLoc();
  
-  // Get a register mask for the lowered call.
-  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
-  // proper register mask.
-  const uint32_t *RegMask =
-    F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
-  if (Subtarget->is64Bit()) {
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
-                                      TII->get(X86::MOV64rm), X86::RDI)
-    .addReg(X86::RIP)
-    .addImm(0).addReg(0)
-    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
-                      MI->getOperand(3).getTargetFlags())
-    .addReg(0);
-    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
-    addDirectMem(MIB, X86::RDI);
-    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
-  } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
-                                      TII->get(X86::MOV32rm), X86::EAX)
-    .addReg(0)
-    .addImm(0).addReg(0)
-    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
-                      MI->getOperand(3).getTargetFlags())
-    .addReg(0);
-    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
-    addDirectMem(MIB, X86::EAX);
-    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
-  } else {
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
-                                      TII->get(X86::MOV32rm), X86::EAX)
-    .addReg(TII->getGlobalBaseReg(F))
-    .addImm(0).addReg(0)
-    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
-                      MI->getOperand(3).getTargetFlags())
-    .addReg(0);
-    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
-    addDirectMem(MIB, X86::EAX);
-    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
-  }
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
  
-  MI->eraseFromParent(); // The pseudo instruction is gone now.
-  return BB;
-}
+    // Load the old value of the high byte of the control word...
+    unsigned OldCW =
+      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+                      CWFrameIdx);
  
-MachineBasicBlock *
-X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
-                                    MachineBasicBlock *MBB) const {
-  DebugLoc DL = MI->getDebugLoc();
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
+    // Set the high part to be round to zero...
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
  
-  const BasicBlock *BB = MBB->getBasicBlock();
-  MachineFunction::iterator I = MBB;
-  ++I;
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
  
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
  
-  unsigned DstReg;
-  unsigned MemOpndSlot = 0;
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+    }
  
-  unsigned CurOp = 0;
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isReg()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImm())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImm())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobal()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
  
-  DstReg = MI->getOperand(CurOp++).getReg();
-  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
-  assert(RC->hasType(MVT::i32) && "Invalid destination!");
-  unsigned mainDstReg = MRI.createVirtualRegister(RC);
-  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+    // Reload the original control word now.
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
  
-  MemOpndSlot = CurOp;
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+    // String/text processing lowering.
+  case X86::PCMPISTRM128REG:
+  case X86::VPCMPISTRM128REG:
+  case X86::PCMPISTRM128MEM:
+  case X86::VPCMPISTRM128MEM:
+  case X86::PCMPESTRM128REG:
+  case X86::VPCMPESTRM128REG:
+  case X86::PCMPESTRM128MEM:
+  case X86::VPCMPESTRM128MEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
-  MVT PVT = getPointerTy();
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+  // String/text processing lowering.
+  case X86::PCMPISTRIREG:
+  case X86::VPCMPISTRIREG:
+  case X86::PCMPISTRIMEM:
+  case X86::VPCMPISTRIMEM:
+  case X86::PCMPESTRIREG:
+  case X86::VPCMPESTRIREG:
+  case X86::PCMPESTRIMEM:
+  case X86::VPCMPESTRIMEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
-  // For v = setjmp(buf), we generate
-  //
-  // thisMBB:
-  //  buf[LabelOffset] = restoreMBB
-  //  SjLjSetup restoreMBB
-  //
-  // mainMBB:
-  //  v_main = 0
-  //
-  // sinkMBB:
-  //  v = phi(main, restore)
-  //
-  // restoreMBB:
-  //  v_restore = 1
+  // Thread synchronization.
+  case X86::MONITOR:
+    return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
+                       Subtarget);
  
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-  MF->push_back(restoreMBB);
+  // xbegin
+  case X86::XBEGIN:
+    return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
-  MachineInstrBuilder MIB;
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  case X86::VAARG_64:
+    return EmitVAARG64WithCustomInserter(MI, BB);
  
-  // thisMBB:
-  unsigned PtrStoreOpc = 0;
-  unsigned LabelReg = 0;
-  const int64_t LabelOffset = 1 * PVT.getStoreSize();
-  Reloc::Model RM = MF->getTarget().getRelocationModel();
-  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
-                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+  case X86::EH_SjLj_SetJmp32:
+  case X86::EH_SjLj_SetJmp64:
+    return emitEHSjLjSetJmp(MI, BB);
  
-  // Prepare IP either in reg or imm.
-  if (!UseImmLabel) {
-    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
-    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
-    LabelReg = MRI.createVirtualRegister(PtrRC);
-    if (Subtarget->is64Bit()) {
-      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
-              .addReg(X86::RIP)
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB)
-              .addReg(0);
-    } else {
-      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
-      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
-              .addReg(XII->getGlobalBaseReg(MF))
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
-              .addReg(0);
-    }
-  } else
-    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
-  // Store IP
-  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp)
-      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
-    else
-      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
-  }
-  if (!UseImmLabel)
-    MIB.addReg(LabelReg);
-  else
-    MIB.addMBB(restoreMBB);
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-  // Setup
-  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
-          .addMBB(restoreMBB);
+  case X86::EH_SjLj_LongJmp32:
+  case X86::EH_SjLj_LongJmp64:
+    return emitEHSjLjLongJmp(MI, BB);
  
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
-  MIB.addRegMask(RegInfo->getNoPreservedMask());
-  thisMBB->addSuccessor(mainMBB);
-  thisMBB->addSuccessor(restoreMBB);
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
  
-  // mainMBB:
-  //  EAX = 0
-  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
-  mainMBB->addSuccessor(sinkMBB);
+  case X86::VFMADDPDr213r:
+  case X86::VFMADDPSr213r:
+  case X86::VFMADDSDr213r:
+  case X86::VFMADDSSr213r:
+  case X86::VFMSUBPDr213r:
+  case X86::VFMSUBPSr213r:
+  case X86::VFMSUBSDr213r:
+  case X86::VFMSUBSSr213r:
+  case X86::VFNMADDPDr213r:
+  case X86::VFNMADDPSr213r:
+  case X86::VFNMADDSDr213r:
+  case X86::VFNMADDSSr213r:
+  case X86::VFNMSUBPDr213r:
+  case X86::VFNMSUBPSr213r:
+  case X86::VFNMSUBSDr213r:
+  case X86::VFNMSUBSSr213r:
+  case X86::VFMADDPDr213rY:
+  case X86::VFMADDPSr213rY:
+  case X86::VFMSUBPDr213rY:
+  case X86::VFMSUBPSr213rY:
+  case X86::VFNMADDPDr213rY:
+  case X86::VFNMADDPSr213rY:
+  case X86::VFNMSUBPDr213rY:
+  case X86::VFNMSUBPSr213rY:
+    return emitFMA3Instr(MI, BB);
+  }
+}
  
-  // sinkMBB:
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(X86::PHI), DstReg)
-    .addReg(mainDstReg).addMBB(mainMBB)
-    .addReg(restoreDstReg).addMBB(restoreMBB);
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
  
-  // restoreMBB:
-  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
-  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
-  restoreMBB->addSuccessor(sinkMBB);
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
  
-  MI->eraseFromParent();
-  return sinkMBB;
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean.
+    if (Op.getResNo() == 0)
+      break;
+    // Fallthrough
+  case X86ISD::SETCC:
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    unsigned NumLoBits = 0;
+    switch (IntId) {
+    default: break;
+    case Intrinsic::x86_sse_movmsk_ps:
+    case Intrinsic::x86_avx_movmsk_ps_256:
+    case Intrinsic::x86_sse2_movmsk_pd:
+    case Intrinsic::x86_avx_movmsk_pd_256:
+    case Intrinsic::x86_mmx_pmovmskb:
+    case Intrinsic::x86_sse2_pmovmskb_128:
+    case Intrinsic::x86_avx2_pmovmskb: {
+      // High bits of movmskp{s|d}, pmovmskb are known zero.
+      switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
+        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
+        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
+        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
+        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
+      }
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+      break;
+    }
+    }
+    break;
+  }
+  }
  }
  
-MachineBasicBlock *
-X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
-                                     MachineBasicBlock *MBB) const {
-  DebugLoc DL = MI->getDebugLoc();
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &,
+  unsigned Depth) const {
+  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+    return Op.getValueType().getScalarType().getSizeInBits();
  
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+  // Fallback case.
+  return 1;
+}
  
-  MVT PVT = getPointerTy();
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       const GlobalValue* &GA,
+                                       int64_t &Offset) const {
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+      return true;
+    }
+  }
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
  
-  const TargetRegisterClass *RC =
-    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
-  unsigned Tmp = MRI.createVirtualRegister(RC);
-  // Since FP is only updated here but NOT referenced, it's treated as GPR.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
-  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
-  unsigned SP = RegInfo->getStackRegister();
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
  
-  MachineInstrBuilder MIB;
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
  
-  const int64_t LabelOffset = 1 * PVT.getStoreSize();
-  const int64_t SPOffset = 2 * PVT.getStoreSize();
+  return true;
+}
  
-  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
-  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
  
-  // Reload FP
-  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
-    MIB.addOperand(MI->getOperand(i));
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-  // Reload IP
-  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp)
-      MIB.addDisp(MI->getOperand(i), LabelOffset);
-    else
-      MIB.addOperand(MI->getOperand(i));
-  }
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-  // Reload SP
-  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp)
-      MIB.addDisp(MI->getOperand(i), SPOffset);
-    else
-      MIB.addOperand(MI->getOperand(i));
-  }
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-  // Jump
-  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
  
-  MI->eraseFromParent();
-  return MBB;
+  return true;
  }
  
-// Replace 213-type (isel default) FMA3 instructions with 231-type for
-// accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.   
-MachineBasicBlock *
-X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
-                                 MachineBasicBlock *MBB) const {
-  MachineOperand &AddendOp = MI->getOperand(3);
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget* Subtarget) {
+  SDLoc dl(N);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
  
-  // Bail out early if the addend isn't a register - we can't switch these.
-  if (!AddendOp.isReg())
-    return MBB;
+  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+      V2.getOpcode() == ISD::CONCAT_VECTORS) {
+    //
+    //                   0,0,0,...
+    //                      |
+    //    V      UNDEF    BUILD_VECTOR    UNDEF
+    //     \      /           \           /
+    //  CONCAT_VECTOR         CONCAT_VECTOR
+    //         \                  /
+    //          \                /
+    //          RESULT: V + zero extended
+    //
+    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+        V1.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
  
-  MachineFunction &MF = *MBB->getParent();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+      return SDValue();
  
-  // Check whether the addend is defined by a PHI:
-  assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
-  MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
-  if (!AddendDef.isPHI())
-    return MBB;
+    // To match the shuffle mask, the first half of the mask should
+    // be exactly the first vector, and all the rest a splat with the
+    // first element of the second one.
+    for (unsigned i = 0; i != NumElems/2; ++i)
+      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+        return SDValue();
  
-  // Look for the following pattern:
-  // loop:
-  //   %addend = phi [%entry, 0], [%loop, %result]
-  //   ...
-  //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
+    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
  
-  // Replace with:
-  //   loop:
-  //   %addend = phi [%entry, 0], [%loop, %result]
-  //   ...
-  //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
+        // Make sure the newly-created LOAD is in the same position as Ld in
+        // terms of dependency. We create a TokenFactor for Ld and ResNode,
+        // and update uses of Ld's output chain to use the TokenFactor.
+        if (Ld->hasAnyUseOfValue(1)) {
+          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
+          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+                                 SDValue(ResNode.getNode(), 1));
+        }
  
-  for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
-    assert(AddendDef.getOperand(i).isReg());
-    MachineOperand PHISrcOp = AddendDef.getOperand(i);
-    MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
-    if (&PHISrcInst == MI) {
-      // Found a matching instruction.
-      unsigned NewFMAOpc = 0;
-      switch (MI->getOpcode()) {
-        case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
-        case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
-        case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
-        case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
-        case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
-        case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
-        case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
-        case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
-        case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
-        case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
-        case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
-        case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
-        case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
-        case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
-        case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
-        case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
-        case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
-        case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
-        case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
-        case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
-        case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
-        case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
-        case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
-        case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
-        default: llvm_unreachable("Unrecognized FMA variant.");
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
        }
-
-      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-      MachineInstrBuilder MIB =
-        BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
-        .addOperand(MI->getOperand(0))
-        .addOperand(MI->getOperand(3))
-        .addOperand(MI->getOperand(2))
-        .addOperand(MI->getOperand(1));
-      MBB->insert(MachineBasicBlock::iterator(MI), MIB);
-      MI->eraseFromParent();
      }
-  }
-
-  return MBB;
-}
  
-MachineBasicBlock *
-X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                               MachineBasicBlock *BB) const {
-  switch (MI->getOpcode()) {
-  default: llvm_unreachable("Unexpected instr type to insert");
-  case X86::TAILJMPd64:
-  case X86::TAILJMPr64:
-  case X86::TAILJMPm64:
-    llvm_unreachable("TAILJMP64 would not be touched here.");
-  case X86::TCRETURNdi64:
-  case X86::TCRETURNri64:
-  case X86::TCRETURNmi64:
-    return BB;
-  case X86::WIN_ALLOCA:
-    return EmitLoweredWinAlloca(MI, BB);
-  case X86::SEG_ALLOCA_32:
-    return EmitLoweredSegAlloca(MI, BB, false);
-  case X86::SEG_ALLOCA_64:
-    return EmitLoweredSegAlloca(MI, BB, true);
-  case X86::TLSCall_32:
-  case X86::TLSCall_64:
-    return EmitLoweredTLSCall(MI, BB);
-  case X86::CMOV_GR8:
-  case X86::CMOV_FR32:
-  case X86::CMOV_FR64:
-  case X86::CMOV_V4F32:
-  case X86::CMOV_V2F64:
-  case X86::CMOV_V2I64:
-  case X86::CMOV_V8F32:
-  case X86::CMOV_V4F64:
-  case X86::CMOV_V4I64:
-  case X86::CMOV_V16F32:
-  case X86::CMOV_V8F64:
-  case X86::CMOV_V8I64:
-  case X86::CMOV_GR16:
-  case X86::CMOV_GR32:
-  case X86::CMOV_RFP32:
-  case X86::CMOV_RFP64:
-  case X86::CMOV_RFP80:
-    return EmitLoweredSelect(MI, BB);
+    // Emit a zeroed vector and insert the desired subvector on its
+    // first half.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
  
-  case X86::FP32_TO_INT16_IN_MEM:
-  case X86::FP32_TO_INT32_IN_MEM:
-  case X86::FP32_TO_INT64_IN_MEM:
-  case X86::FP64_TO_INT16_IN_MEM:
-  case X86::FP64_TO_INT32_IN_MEM:
-  case X86::FP64_TO_INT64_IN_MEM:
-  case X86::FP80_TO_INT16_IN_MEM:
-  case X86::FP80_TO_INT32_IN_MEM:
-  case X86::FP80_TO_INT64_IN_MEM: {
-    MachineFunction *F = BB->getParent();
-    const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
-    DebugLoc DL = MI->getDebugLoc();
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
  
-    // Change the floating point control register to use "round towards zero"
-    // mode when truncating to an integer value.
-    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
-    addFrameReference(BuildMI(*BB, MI, DL,
-                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
  
-    // Load the old value of the high byte of the control word...
-    unsigned OldCW =
-      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
-    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
-                      CWFrameIdx);
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
  
-    // Set the high part to be round to zero...
-    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
-      .addImm(0xC7F);
+  return SDValue();
+}
  
-    // Reload the modified control word now...
-    addFrameReference(BuildMI(*BB, MI, DL,
-                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combinine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
+                                   int Depth, bool HasPSHUFB, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
+
+  // Find the operand that enters the chain. Note that multiple uses are OK
+  // here, we're not going to remove the operand we find.
+  SDValue Input = Op.getOperand(0);
+  while (Input.getOpcode() == ISD::BITCAST)
+    Input = Input.getOperand(0);
+
+  MVT VT = Input.getSimpleValueType();
+  MVT RootVT = Root.getSimpleValueType();
+  SDLoc DL(Root);
+
+  // Just remove no-op shuffle masks.
+  if (Mask.size() == 1) {
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
+                  /*AddTo*/ true);
+    return true;
+  }
  
-    // Restore the memory image of control word to original value
-    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
-      .addReg(OldCW);
+  // Use the float domain if the operand type is a floating point type.
+  bool FloatDomain = VT.isFloatingPoint();
  
-    // Get the X86 opcode to use.
-    unsigned Opc;
-    switch (MI->getOpcode()) {
-    default: llvm_unreachable("illegal opcode!");
-    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
-    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
-    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
-    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
-    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
-    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
-    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
-    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
-    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+  // If we don't have access to VEX encodings, the generic PSHUF instructions
+  // are preferable to some of the specialized forms despite requiring one more
+  // byte to encode because they can implicitly copy.
+  //
+  // IF we *do* have VEX encodings, than we can use shorter, more specific
+  // shuffle instructions freely as they can copy due to the extra register
+  // operand.
+  if (Subtarget->hasAVX()) {
+    // We have both floating point and integer variants of shuffles that dup
+    // either the low or high half of the vector.
+    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
+      bool Lo = Mask.equals(0, 0);
+      unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
+                                     : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
      }
  
-    X86AddressMode AM;
-    MachineOperand &Op = MI->getOperand(0);
-    if (Op.isReg()) {
-      AM.BaseType = X86AddressMode::RegBase;
-      AM.Base.Reg = Op.getReg();
-    } else {
-      AM.BaseType = X86AddressMode::FrameIndexBase;
-      AM.Base.FrameIndex = Op.getIndex();
-    }
-    Op = MI->getOperand(1);
-    if (Op.isImm())
-      AM.Scale = Op.getImm();
-    Op = MI->getOperand(2);
-    if (Op.isImm())
-      AM.IndexReg = Op.getImm();
-    Op = MI->getOperand(3);
-    if (Op.isGlobal()) {
-      AM.GV = Op.getGlobal();
-    } else {
-      AM.Disp = Op.getImm();
+    // FIXME: We should match UNPCKLPS and UNPCKHPS here.
+
+    // For the integer domain we have specialized instructions for duplicating
+    // any element size from the low or high half.
+    if (!FloatDomain &&
+        (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) ||
+         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+         Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+         Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                     15))) {
+      bool Lo = Mask[0] == 0;
+      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      MVT ShuffleVT;
+      switch (Mask.size()) {
+      case 4: ShuffleVT = MVT::v4i32; break;
+      case 8: ShuffleVT = MVT::v8i16; break;
+      case 16: ShuffleVT = MVT::v16i8; break;
+      };
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
      }
-    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
-                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+  }
  
-    // Reload the original control word now.
-    addFrameReference(BuildMI(*BB, MI, DL,
-                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+  // Don't try to re-form single instruction chains under any circumstances now
+  // that we've done encoding canonicalization for them.
+  if (Depth < 2)
+    return false;
  
-    MI->eraseFromParent();   // The pseudo instruction is gone now.
-    return BB;
+  // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+  // can replace them with a single PSHUFB instruction profitably. Intel's
+  // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
+  // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+  if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
+    SmallVector<SDValue, 16> PSHUFBMask;
+    assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
+    int Ratio = 16 / Mask.size();
+    for (unsigned i = 0; i < 16; ++i) {
+      int M = Ratio * Mask[i / Ratio] + i % Ratio;
+      PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
+    }
+    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+    DCI.AddToWorklist(Op.getNode());
+    SDValue PSHUFBMaskOp =
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
+    Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
    }
-    // String/text processing lowering.
-  case X86::PCMPISTRM128REG:
-  case X86::VPCMPISTRM128REG:
-  case X86::PCMPISTRM128MEM:
-  case X86::VPCMPISTRM128MEM:
-  case X86::PCMPESTRM128REG:
-  case X86::VPCMPESTRM128REG:
-  case X86::PCMPESTRM128MEM:
-  case X86::VPCMPESTRM128MEM:
-    assert(Subtarget->hasSSE42() &&
-           "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
  
-  // String/text processing lowering.
-  case X86::PCMPISTRIREG:
-  case X86::VPCMPISTRIREG:
-  case X86::PCMPISTRIMEM:
-  case X86::VPCMPISTRIMEM:
-  case X86::PCMPESTRIREG:
-  case X86::VPCMPESTRIREG:
-  case X86::PCMPESTRIMEM:
-  case X86::VPCMPESTRIMEM:
-    assert(Subtarget->hasSSE42() &&
-           "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+  // Failed to find any combines.
+  return false;
+}
  
-  // Thread synchronization.
-  case X86::MONITOR:
-    return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
+/// \brief Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consdier all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+///    equivalent. In most cases, this is just an encoding size win, but
+///    sometimes we will collapse multiple generic shuffles into a single
+///    special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+///    instructions, and replace them with the slightly more expensive SSSE3
+///    PSHUFB instruction if available. We do this as the last combining step
+///    to ensure we avoid using PSHUFB if we can implement the shuffle with
+///    a suitable short sequence of other instructions. The PHUFB will either
+///    use a register or have to read from memory and so is slightly (but only
+///    slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
+static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+                                          ArrayRef<int> IncomingMask, int Depth,
+                                          bool HasPSHUFB, SelectionDAG &DAG,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const X86Subtarget *Subtarget) {
+  // Bound the depth of our recursive combine because this is ultimately
+  // quadratic in nature.
+  if (Depth > 8)
+    return false;
  
-  // xbegin
-  case X86::XBEGIN:
-    return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
-
-  // Atomic Lowering.
-  case X86::ATOMAND8:
-  case X86::ATOMAND16:
-  case X86::ATOMAND32:
-  case X86::ATOMAND64:
-    // Fall through
-  case X86::ATOMOR8:
-  case X86::ATOMOR16:
-  case X86::ATOMOR32:
-  case X86::ATOMOR64:
-    // Fall through
-  case X86::ATOMXOR16:
-  case X86::ATOMXOR8:
-  case X86::ATOMXOR32:
-  case X86::ATOMXOR64:
-    // Fall through
-  case X86::ATOMNAND8:
-  case X86::ATOMNAND16:
-  case X86::ATOMNAND32:
-  case X86::ATOMNAND64:
-    // Fall through
-  case X86::ATOMMAX8:
-  case X86::ATOMMAX16:
-  case X86::ATOMMAX32:
-  case X86::ATOMMAX64:
-    // Fall through
-  case X86::ATOMMIN8:
-  case X86::ATOMMIN16:
-  case X86::ATOMMIN32:
-  case X86::ATOMMIN64:
-    // Fall through
-  case X86::ATOMUMAX8:
-  case X86::ATOMUMAX16:
-  case X86::ATOMUMAX32:
-  case X86::ATOMUMAX64:
-    // Fall through
-  case X86::ATOMUMIN8:
-  case X86::ATOMUMIN16:
-  case X86::ATOMUMIN32:
-  case X86::ATOMUMIN64:
-    return EmitAtomicLoadArith(MI, BB);
-
-  // This group does 64-bit operations on a 32-bit host.
-  case X86::ATOMAND6432:
-  case X86::ATOMOR6432:
-  case X86::ATOMXOR6432:
-  case X86::ATOMNAND6432:
-  case X86::ATOMADD6432:
-  case X86::ATOMSUB6432:
-  case X86::ATOMMAX6432:
-  case X86::ATOMMIN6432:
-  case X86::ATOMUMAX6432:
-  case X86::ATOMUMIN6432:
-  case X86::ATOMSWAP6432:
-    return EmitAtomicLoadArith6432(MI, BB);
+  // Directly rip through bitcasts to find the underlying operand.
+  while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
+    Op = Op.getOperand(0);
  
-  case X86::VASTART_SAVE_XMM_REGS:
-    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+  MVT VT = Op.getSimpleValueType();
+  if (!VT.isVector())
+    return false; // Bail if we hit a non-vector.
+  // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
+  // version should be added.
+  if (VT.getSizeInBits() != 128)
+    return false;
  
-  case X86::VAARG_64:
-    return EmitVAARG64WithCustomInserter(MI, BB);
+  assert(Root.getSimpleValueType().isVector() &&
+         "Shuffles operate on vector types!");
+  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+         "Can only combine shuffles of the same vector register size.");
  
-  case X86::EH_SjLj_SetJmp32:
-  case X86::EH_SjLj_SetJmp64:
-    return emitEHSjLjSetJmp(MI, BB);
+  if (!isTargetShuffle(Op.getOpcode()))
+    return false;
+  SmallVector<int, 16> OpMask;
+  bool IsUnary;
+  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+  // We only can combine unary shuffles which we can decode the mask for.
+  if (!HaveMask || !IsUnary)
+    return false;
  
-  case X86::EH_SjLj_LongJmp32:
-  case X86::EH_SjLj_LongJmp64:
-    return emitEHSjLjLongJmp(MI, BB);
+  assert(VT.getVectorNumElements() == OpMask.size() &&
+         "Different mask size from vector size!");
  
-  case TargetOpcode::STACKMAP:
-  case TargetOpcode::PATCHPOINT:
-    return emitPatchPoint(MI, BB);
+  SmallVector<int, 16> Mask;
+  Mask.reserve(std::max(OpMask.size(), IncomingMask.size()));
+
+  // Merge this shuffle operation's mask into our accumulated mask. This is
+  // a bit tricky as the shuffle may have a different size from the root.
+  if (OpMask.size() == IncomingMask.size()) {
+    for (int M : IncomingMask)
+      Mask.push_back(OpMask[M]);
+  } else if (OpMask.size() < IncomingMask.size()) {
+    assert(IncomingMask.size() % OpMask.size() == 0 &&
+           "The smaller number of elements must divide the larger.");
+    int Ratio = IncomingMask.size() / OpMask.size();
+    for (int M : IncomingMask)
+      Mask.push_back(Ratio * OpMask[M / Ratio] + M % Ratio);
+  } else {
+    assert(OpMask.size() > IncomingMask.size() && "All other cases handled!");
+    assert(OpMask.size() % IncomingMask.size() == 0 &&
+           "The smaller number of elements must divide the larger.");
+    int Ratio = OpMask.size() / IncomingMask.size();
+    for (int i = 0, e = OpMask.size(); i < e; ++i)
+      Mask.push_back(OpMask[Ratio * IncomingMask[i / Ratio] + i % Ratio]);
+  }
  
-  case X86::VFMADDPDr213r:
-  case X86::VFMADDPSr213r:
-  case X86::VFMADDSDr213r:
-  case X86::VFMADDSSr213r:
-  case X86::VFMSUBPDr213r:
-  case X86::VFMSUBPSr213r:
-  case X86::VFMSUBSDr213r:
-  case X86::VFMSUBSSr213r:
-  case X86::VFNMADDPDr213r:
-  case X86::VFNMADDPSr213r:
-  case X86::VFNMADDSDr213r:
-  case X86::VFNMADDSSr213r:
-  case X86::VFNMSUBPDr213r:
-  case X86::VFNMSUBPSr213r:
-  case X86::VFNMSUBSDr213r:
-  case X86::VFNMSUBSSr213r:
-  case X86::VFMADDPDr213rY:
-  case X86::VFMADDPSr213rY:
-  case X86::VFMSUBPDr213rY:
-  case X86::VFMSUBPSr213rY:
-  case X86::VFNMADDPDr213rY:
-  case X86::VFNMADDPSr213rY:
-  case X86::VFNMSUBPDr213rY:
-  case X86::VFNMSUBPSr213rY:
-    return emitFMA3Instr(MI, BB);
+  // See if we can recurse into the operand to combine more things.
+  switch (Op.getOpcode()) {
+    case X86ISD::PSHUFB:
+      HasPSHUFB = true;
+    case X86ISD::PSHUFD:
+    case X86ISD::PSHUFHW:
+    case X86ISD::PSHUFLW:
+      if (Op.getOperand(0).hasOneUse() &&
+          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                        HasPSHUFB, DAG, DCI, Subtarget))
+        return true;
+      break;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
+      // We can't check for single use, we have to check that this shuffle is the only user.
+      if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                        HasPSHUFB, DAG, DCI, Subtarget))
+          return true;
+      break;
    }
+
+  // Minor canonicalization of the accumulated shuffle mask to make it easier
+  // to match below. All this does is detect masks with squential pairs of
+  // elements, and shrink them to the half-width mask. It does this in a loop
+  // so it will reduce the size of the mask to the minimal width mask which
+  // performs an equivalent shuffle.
+  while (Mask.size() > 1) {
+    SmallVector<int, 16> NewMask;
+    for (int i = 0, e = Mask.size()/2; i < e; ++i) {
+      if (Mask[2*i] % 2 != 0 || Mask[2*i] != Mask[2*i + 1] + 1) {
+        NewMask.clear();
+        break;
+      }
+      NewMask.push_back(Mask[2*i] / 2);
+    }
+    if (NewMask.empty())
+      break;
+    Mask.swap(NewMask);
+  }
+
+  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+                                Subtarget);
  }
  
-//===----------------------------------------------------------------------===//
-//                           X86 Optimization Hooks
-//===----------------------------------------------------------------------===//
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  SmallVector<int, 4> Mask;
+  bool IsUnary;
+  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+  (void)HaveMask;
+  assert(HaveMask);
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+    return Mask;
+  case X86ISD::PSHUFLW:
+    Mask.resize(4);
+    return Mask;
+  case X86ISD::PSHUFHW:
+    Mask.erase(Mask.begin(), Mask.begin() + 4);
+    for (int &M : Mask)
+      M -= 4;
+    return Mask;
+  default:
+    llvm_unreachable("No valid shuffle instruction found!");
+  }
+}
  
-void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
-                                                      APInt &KnownZero,
-                                                      APInt &KnownOne,
-                                                      const SelectionDAG &DAG,
-                                                      unsigned Depth) const {
-  unsigned BitWidth = KnownZero.getBitWidth();
-  unsigned Opc = Op.getOpcode();
-  assert((Opc >= ISD::BUILTIN_OP_END ||
-          Opc == ISD::INTRINSIC_WO_CHAIN ||
-          Opc == ISD::INTRINSIC_W_CHAIN ||
-          Opc == ISD::INTRINSIC_VOID) &&
-         "Should use MaskedValueIsZero if you don't know whether Op"
-         " is a target node!");
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
  
-  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
-  switch (Opc) {
-  default: break;
-  case X86ISD::ADD:
-  case X86ISD::SUB:
-  case X86ISD::ADC:
-  case X86ISD::SBB:
-  case X86ISD::SMUL:
-  case X86ISD::UMUL:
-  case X86ISD::INC:
-  case X86ISD::DEC:
-  case X86ISD::OR:
-  case X86ISD::XOR:
-  case X86ISD::AND:
-    // These nodes' second result is a boolean.
-    if (Op.getResNo() == 0)
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
        break;
-    // Fallthrough
-  case X86ISD::SETCC:
-    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
-    break;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    unsigned NumLoBits = 0;
-    switch (IntId) {
-    default: break;
-    case Intrinsic::x86_sse_movmsk_ps:
-    case Intrinsic::x86_avx_movmsk_ps_256:
-    case Intrinsic::x86_sse2_movmsk_pd:
-    case Intrinsic::x86_avx_movmsk_pd_256:
-    case Intrinsic::x86_mmx_pmovmskb:
-    case Intrinsic::x86_sse2_pmovmskb_128:
-    case Intrinsic::x86_avx2_pmovmskb: {
-      // High bits of movmskp{s|d}, pmovmskb are known zero.
-      switch (IntId) {
-        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
-        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
-        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
-        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
-        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
-        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
-        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
-      }
-      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
        break;
      }
-    }
+    // Break out of the loop if we break out of the switch.
      break;
    }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // It is possible that one of the combinable shuffles was completely absorbed
+  // by the other, just replace it and revisit all users in that case.
+  if (Old.getNode() == V.getNode()) {
+    DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+    return true;
    }
-}
  
-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
-  SDValue Op,
-  const SelectionDAG &,
-  unsigned Depth) const {
-  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
-  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
-    return Op.getValueType().getScalarType().getSizeInBits();
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
  
-  // Fallback case.
-  return 1;
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
  }
  
-/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
-/// node is a GlobalAddress + offset.
-bool X86TargetLowering::isGAPlusOffset(SDNode *N,
-                                       const GlobalValue* &GA,
-                                       int64_t &Offset) const {
-  if (N->getOpcode() == X86ISD::Wrapper) {
-    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
-      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
-      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
-      return true;
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                        SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  assert(
+      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+      "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+  unsigned CombineOpcode = N.getOpcode();
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      if (V.getOpcode() == CombineOpcode)
+        break;
+
+      // Other-half shuffles are no-ops.
+      continue;
      }
+    // Break out of the loop if we break out of the switch.
+    break;
    }
-  return TargetLowering::isGAPlusOffset(N, GA, Offset);
-}
  
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
  
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
+  // Combine away the bottom node as its shuffle will be accumulated into
+  // a preceding shuffle.
+  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
  
-  return true;
-}
+  // Record the old value.
+  SDValue Old = V;
  
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
+  // Merge this node's mask and our incoming mask (adjusted to account for all
+  // the pshufd instructions encountered).
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
  
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
+  // Check that the shuffles didn't cancel each other out. If not, we need to
+  // combine to the new one.
+  if (Old != V)
+    // Replace the combinable shuffle with the combined one, updating all users
+    // so that we re-evaluate the chain here.
+    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
  
    return true;
  }
  
-/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
-static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        const X86Subtarget* Subtarget) {
-  SDLoc dl(N);
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
-      V2.getOpcode() == ISD::CONCAT_VECTORS) {
-    //
-    //                   0,0,0,...
-    //                      |
-    //    V      UNDEF    BUILD_VECTOR    UNDEF
-    //     \      /           \           /
-    //  CONCAT_VECTOR         CONCAT_VECTOR
-    //         \                  /
-    //          \                /
-    //          RESULT: V + zero extended
-    //
-    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
-        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
-        V1.getOperand(1).getOpcode() != ISD::UNDEF)
-      return SDValue();
-
-    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
-      return SDValue();
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> Mask;
  
-    // To match the shuffle mask, the first half of the mask should
-    // be exactly the first vector, and all the rest a splat with the
-    // first element of the second one.
-    for (unsigned i = 0; i != NumElems/2; ++i)
-      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
-          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
-        return SDValue();
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    Mask = getPSHUFShuffleMask(N);
+    assert(Mask.size() == 4);
+    break;
+  default:
+    return SDValue();
+  }
  
-    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
-    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
-      if (Ld->hasNUsesOfValue(1, 0)) {
-        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
-        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
-        SDValue ResNode =
-          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
-                                  Ld->getMemoryVT(),
-                                  Ld->getPointerInfo(),
-                                  Ld->getAlignment(),
-                                  false/*isVolatile*/, true/*ReadMem*/,
-                                  false/*WriteMem*/);
+  // Nuke no-op shuffles that show up after combining.
+  if (isNoopShuffleMask(Mask))
+    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
  
-        // Make sure the newly-created LOAD is in the same position as Ld in
-        // terms of dependency. We create a TokenFactor for Ld and ResNode,
-        // and update uses of Ld's output chain to use the TokenFactor.
-        if (Ld->hasAnyUseOfValue(1)) {
-          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
-          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
-          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
-                                 SDValue(ResNode.getNode(), 1));
+  // Look for simplifications involving one or two shuffle instructions.
+  SDValue V = N.getOperand(0);
+  switch (N.getOpcode()) {
+  default:
+    break;
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    assert(VT == MVT::v8i16);
+    (void)VT;
+
+    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle, so we're done.
+
+    // See if this reduces to a PSHUFD which is no more expensive and can
+    // combine with more operations.
+    if (canWidenShuffleElements(Mask)) {
+      int DMask[] = {-1, -1, -1, -1};
+      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+      DCI.AddToWorklist(V.getNode());
+      V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+                      getV4X86ShuffleImm8ForMask(DMask, DAG));
+      DCI.AddToWorklist(V.getNode());
+      return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    }
+
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
          }
-
-        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
        }
      }
  
-    // Emit a zeroed vector and insert the desired subvector on its
-    // first half.
-    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Combine some shuffles into subvector extracts and inserts:
-  //
+    break;
  
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
+  case X86ISD::PSHUFD:
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
  
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, InsV);
+    break;
    }
  
    return SDValue();
@@ -18014,49 +19592,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
  
-  // Canonicalize shuffles that perform 'addsub' on packed float vectors
-  // according to the rule:
-  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
-  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
-  //
-  // Where 'Mask' is:
-  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
-  //  <0,3>                 -- for v2f64 shuffles;
-  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
-  //
-  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
-  // during ISel stage.
-  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
-      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
-      // Operands to the FADD and FSUB must be the same.
-      ((N0->getOperand(0) == N1->getOperand(0) &&
-        N0->getOperand(1) == N1->getOperand(1)) ||
-       // FADD is commutable. See if by commuting the operands of the FADD
-       // we would still be able to match the operands of the FSUB dag node.
-       (N0->getOperand(1) == N1->getOperand(0) &&
-        N0->getOperand(0) == N1->getOperand(1))) &&
-      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
-      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
-    
-    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
-    unsigned NumElts = VT.getVectorNumElements();
-    ArrayRef<int> Mask = SV->getMask();
-    bool CanFold = true;
-
-    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
-      CanFold = Mask[i] == (i & 1) ? i + NumElts : i;
-
-    if (CanFold) {
-      SDValue Op0 = N1->getOperand(0);
-      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
-      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
-      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
-      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
-    }
-  }
-
    // Don't create instructions with illegal types after legalize types has run.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -18129,7 +19664,30 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
      Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
  
-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+  SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+  if (LD.getNode())
+    return LD;
+
+  if (isTargetShuffle(N->getOpcode())) {
+    SDValue Shuffle =
+        PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+
+    // Try recursively combining arbitrary sequences of x86 shuffle
+    // instructions into higher-order shuffles. We do this after combining
+    // specific PSHUF instruction sequences into their minimal form so that we
+    // can evaluate how many specialized shuffle instructions are involved in
+    // a particular chain.
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
+                                      /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
+  return SDValue();
  }
  
  /// PerformTruncateCombine - Converts truncate operation to
@@ -18783,28 +20341,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
            Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
  
-      // If the RHS is a constant we have to reverse the const canonicalization.
-      // x > C-1 ? x+-C : 0 --> subus x, C
-      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
-          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (CondRHS.getConstantOperandVal(0) == -A-1)
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
-                             DAG.getConstant(-A, VT));
-      }
-
-      // Another special case: If C was a sign bit, the sub has been
-      // canonicalized into a xor.
-      // FIXME: Would it be better to use computeKnownBits to determine whether
-      //        it's safe to decanonicalize the xor?
-      // x s< 0 ? x^C : 0 --> subus x, C
-      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
-          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
-          isSplatVector(OpRHS.getNode())) {
-        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
-        if (A.isSignBit())
-          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
-      }
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          //        whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+        }
      }
    }
  
@@ -19512,16 +21076,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
    // vector operations in many cases. Also, on sandybridge ADD is faster than
    // shl.
    // (shl V, 1) -> add V,V
-  if (isSplatVector(N1.getNode())) {
-    assert(N0.getValueType().isVector() && "Invalid vector shift type");
-    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
-    // We shift all of the values by one. In many cases we do not have
-    // hardware support for this operation. This is better expressed as an ADD
-    // of two values.
-    if (N1C && (1 == N1C->getZExtValue())) {
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+      assert(N0.getValueType().isVector() && "Invalid vector shift type");
+      // We shift all of the values by one. In many cases we do not have
+      // hardware support for this operation. This is better expressed as an ADD
+      // of two values.
+      if (N1SplatC->getZExtValue() == 1)
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
      }
-  }
  
    return SDValue();
  }
@@ -19540,10 +21103,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
  
    SDValue Amt = N->getOperand(1);
    SDLoc DL(N);
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      APInt ShiftAmt = C->getAPIntValue();
+  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+      APInt ShiftAmt = AmtSplat->getAPIntValue();
        unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
  
        // SSE2/AVX2 logical shifts always return a vector of 0s
@@ -19553,7 +21115,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
        if (ShiftAmt.trunc(8).uge(MaxAmount))
          return getZeroVector(VT, Subtarget, DAG, DL);
      }
-  }
  
    return SDValue();
  }
@@ -19747,9 +21308,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
  
    // The right side has to be a 'trunc' or a constant vector.
    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
-  bool RHSConst = (isSplatVector(N1.getNode()) &&
-                   isa<ConstantSDNode>(N1->getOperand(0)));
-  if (!RHSTrunc && !RHSConst)
+  ConstantSDNode *RHSConstSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+    RHSConstSplat = RHSBV->getConstantSplatNode();
+  if (!RHSTrunc && !RHSConstSplat)
      return SDValue();
  
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -19759,9 +21321,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
  
    // Set N0 and N1 to hold the inputs to the new wide operation.
    N0 = N0->getOperand(0);
-  if (RHSConst) {
+  if (RHSConstSplat) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
-                     N1->getOperand(0));
+                     SDValue(RHSConstSplat, 0));
      SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
      N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
    } else if (RHSTrunc) {
@@ -19907,12 +21469,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
        unsigned SraAmt = ~0;
        if (Mask.getOpcode() == ISD::SRA) {
-        SDValue Amt = Mask.getOperand(1);
-        if (isSplatVector(Amt.getNode())) {
-          SDValue SclrAmt = Amt->getOperand(0);
-          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
-            SraAmt = C->getZExtValue();
-        }
+        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+          if (auto *AmtConst = AmtBV->getConstantSplatNode())
+            SraAmt = AmtConst->getZExtValue();
        } else if (Mask.getOpcode() == X86ISD::VSRAI) {
          SDValue SraC = Mask.getOperand(1);
          SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
@@ -20076,7 +21635,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    EVT MemVT = Ld->getMemoryVT();
    SDLoc dl(Ld);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned RegSz = RegVT.getSizeInBits();
  
    // On Sandybridge unaligned 256bit loads are inefficient.
    ISD::LoadExtType Ext = Ld->getExtensionType();
@@ -20112,153 +21670,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      return DCI.CombineTo(N, NewVec, TF, true);
    }
  
-  // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
-  // expansion is still better than scalar code.
-  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
-  // emit a shuffle and a arithmetic shift.
-  // TODO: It is possible to support ZExt by zeroing the undef values
-  // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
-      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
-    assert(MemVT != RegVT && "Cannot extend to the same type");
-    assert(MemVT.isVector() && "Must load a vector from memory");
-
-    unsigned NumElems = RegVT.getVectorNumElements();
-    unsigned MemSz = MemVT.getSizeInBits();
-    assert(RegSz > MemSz && "Register size must be greater than the mem size");
-
-    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
-      return SDValue();
-
-    // All sizes must be a power of two.
-    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
-      return SDValue();
-
-    // Attempt to load the original value using scalar loads.
-    // Find the largest scalar type that divides the total loaded size.
-    MVT SclrLoadTy = MVT::i8;
-    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-      MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
-        SclrLoadTy = Tp;
-      }
-    }
-
-    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
-    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
-        (64 <= MemSz))
-      SclrLoadTy = MVT::f64;
-
-    // Calculate the number of scalar loads that we need to perform
-    // in order to load our vector from memory.
-    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
-    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
-      return SDValue();
-
-    unsigned loadRegZize = RegSz;
-    if (Ext == ISD::SEXTLOAD && RegSz == 256)
-      loadRegZize /= 2;
-
-    // Represent our vector as a sequence of elements which are the
-    // largest scalar that we can load.
-    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-      loadRegZize/SclrLoadTy.getSizeInBits());
-
-    // Represent the data using the same element type that is stored in
-    // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT =
-          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                       loadRegZize/MemVT.getScalarType().getSizeInBits());
-
-    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
-      "Invalid vector type");
-
-    // We can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT))
-      return SDValue();
-
-    SmallVector<SDValue, 8> Chains;
-    SDValue Ptr = Ld->getBasePtr();
-    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
-                                        TLI.getPointerTy());
-    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
-
-    for (unsigned i = 0; i < NumLoads; ++i) {
-      // Perform a single load.
-      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
-                                       Ptr, Ld->getPointerInfo(),
-                                       Ld->isVolatile(), Ld->isNonTemporal(),
-                                       Ld->isInvariant(), Ld->getAlignment());
-      Chains.push_back(ScalarLoad.getValue(1));
-      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
-      // another round of DAGCombining.
-      if (i == 0)
-        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
-      else
-        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
-                          ScalarLoad, DAG.getIntPtrConstant(i));
-
-      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-    }
-
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
-
-    // Bitcast the loaded value to a vector of the original element type, in
-    // the size of the target vector type.
-    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
-    unsigned SizeRatio = RegSz/MemSz;
-
-    if (Ext == ISD::SEXTLOAD) {
-      // If we have SSE4.1 we can directly emit a VSEXT node.
-      if (Subtarget->hasSSE41()) {
-        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-        return DCI.CombineTo(N, Sext, TF, true);
-      }
-
-      // Otherwise we'll shuffle the small elements in the high bits of the
-      // larger type and perform an arithmetic shift. If the shift is not legal
-      // it's better to scalarize.
-      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
-        return SDValue();
-
-      // Redistribute the loaded elements into the different locations.
-      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-      for (unsigned i = 0; i != NumElems; ++i)
-        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
-
-      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
-                                           DAG.getUNDEF(WideVecVT),
-                                           &ShuffleVec[0]);
-
-      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
-
-      // Build the arithmetic shift.
-      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
-                     MemVT.getVectorElementType().getSizeInBits();
-      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
-                          DAG.getConstant(Amt, RegVT));
-
-      return DCI.CombineTo(N, Shuff, TF, true);
-    }
-
-    // Redistribute the loaded elements into the different locations.
-    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i != NumElems; ++i)
-      ShuffleVec[i*SizeRatio] = i;
-
-    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
-                                         DAG.getUNDEF(WideVecVT),
-                                         &ShuffleVec[0]);
-
-    // Bitcast to the requested type.
-    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
-    // Replace the original load with the new sequence
-    // and return the new chain.
-    return DCI.CombineTo(N, Shuff, TF, true);
-  }
-
    return SDValue();
  }
  
@@ -21049,8 +22460,61 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
    SDValue Op0 = N->getOperand(0);
    EVT InVT = Op0->getValueType(0);
  
@@ -21259,6 +22723,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::UNPCKL:
    case X86ISD::MOVHLPS:
    case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
@@ -21914,14 +23379,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          Constraint[5] == ')' &&
          Constraint[6] == '}') {
  
-      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.first = X86::FP0+Constraint[4]-'0';
        Res.second = &X86::RFP80RegClass;
        return Res;
      }
  
      // GCC allows "st(0)" to be called just plain "st".
      if (StringRef("{st}").equals_lower(Constraint)) {
-      Res.first = X86::ST0;
+      Res.first = X86::FP0;
        Res.second = &X86::RFP80RegClass;
        return Res;
      }