#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ if (Subtarget->hasBMI()) {
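+ // TZCNT (BMI) is defined even for a zero input and has no 8-bit form,
+ // so only i8 CTTZ needs to be promoted to a wider TZCNT-capable type.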
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasLZCNT()) {
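+ // Likewise LZCNT has no 8-bit form, so only i8 CTLZ needs promotion.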
+ setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
}
if (Subtarget->hasPOPCNT()) {
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
- setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- // Can turn SHL into an integer multiply.
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
- setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ // FIXME: these should be Legal but that's only for the case where
+ // the index is constant. For now custom expand to deal with that.
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
}
}
if (Subtarget->hasXMMInt()) {
- setOperationAction(ISD::SRL, MVT::v2i64, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Custom);
- setOperationAction(ISD::SRL, MVT::v16i8, Custom);
setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
- setOperationAction(ISD::SHL, MVT::v2i64, Custom);
- setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i8, Custom);
- setOperationAction(ISD::SRA, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+ if (Subtarget->hasAVX2()) {
+ setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+ setOperationAction(ISD::SRL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+ setOperationAction(ISD::SHL, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Legal);
+ } else {
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ }
}
if (Subtarget->hasSSE42() || Subtarget->hasAVX())
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
- setOperationAction(ISD::SRL, MVT::v4i64, Custom);
- setOperationAction(ISD::SRL, MVT::v8i32, Custom);
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
- setOperationAction(ISD::SHL, MVT::v4i64, Custom);
- setOperationAction(ISD::SHL, MVT::v8i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i16, Custom);
setOperationAction(ISD::SHL, MVT::v32i8, Custom);
- setOperationAction(ISD::SRA, MVT::v8i32, Custom);
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
- setOperationAction(ISD::ADD, MVT::v4i64, Custom);
- setOperationAction(ISD::ADD, MVT::v8i32, Custom);
- setOperationAction(ISD::ADD, MVT::v16i16, Custom);
- setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+ if (Subtarget->hasAVX2()) {
+ setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v32i8, Legal);
- setOperationAction(ISD::SUB, MVT::v4i64, Custom);
- setOperationAction(ISD::SUB, MVT::v8i32, Custom);
- setOperationAction(ISD::SUB, MVT::v16i16, Custom);
- setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+ // Don't lower v32i8 because there is no 256-bit byte mul
+
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Legal);
+ setOperationAction(ISD::SRL, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Legal);
+ setOperationAction(ISD::SHL, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+ } else {
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, Custom);
- setOperationAction(ISD::MUL, MVT::v16i16, Custom);
- // Don't lower v32i8 because there is no 128-bit byte mul
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ }
// Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SINT_TO_FP);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
+ if (Subtarget->hasBMI())
+ setTargetDAGCombine(ISD::XOR);
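+ // (The XOR combine matches BMI's BLSMSK, i.e. X ^ (X - 1).)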
computeRegisterProperties();
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If
-/// 'NonScalarIntSafe' is true, that means it's safe to return a
+/// 'IsZeroVal' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
- bool NonScalarIntSafe,
+ bool IsZeroVal,
bool MemcpyStrSrc,
MachineFunction &MF) const {
// FIXME: This turns off use of xmm stores for memset/memcpy on targets like
// linux. This is because the stack realignment code can't handle certain
// cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction();
- if (NonScalarIntSafe &&
+ if (IsZeroVal &&
!F->hasFnAttr(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
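+ // getLoad's trailing flags are isVolatile, isNonTemporal, isInvariant
+ // and alignment; a stack reload is none of these, hence the extra
+ // 'false' for the new isInvariant operand.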
return DAG.getLoad(ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
- false, false, 0);
+ false, false, false, 0);
}
}
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
+ (void)LastVal;
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect)
ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
InVals.push_back(ArgValue);
}
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
return SDValue(OutRetAddr.getNode(), 1);
}
if (ExtraLoad)
Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(),
- false, false, 0);
+ false, false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
if (!Subtarget->hasAVX())
return false;
- // Match any permutation of 128-bit vector with 64-bit types
- if (NumLanes == 1 && NumElts != 2)
- return false;
-
- // Only match 256-bit with 32 types
- if (VT.getSizeInBits() == 256 && NumElts != 4)
+ // Only match 256-bit with 64-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 4)
return false;
// The mask on the high lane is independent of the low. Both can match
if (!Subtarget->hasAVX())
return false;
- // Match any permutation of 128-bit vector with 32-bit types
- if (NumLanes == 1 && NumElts != 4)
- return false;
-
- // Only match 256-bit with 32 types
- if (VT.getSizeInBits() == 256 && NumElts != 8)
+ // Only match 256-bit with 32-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 8)
return false;
// The mask on the high lane should be the same as the low. Actually,
return true;
}
+// Test whether the given value is a constant BUILD_VECTOR which will be
+// legalized into a load from the constant pool.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // Check for any non-constant elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ switch (N->getOperand(i).getNode()->getOpcode()) {
+ case ISD::UNDEF:
+ case ISD::ConstantFP:
+ case ISD::Constant:
+ break;
+ default:
+ return false;
+ }
+
+ // Vectors of all-zeros and all-ones are materialized with special
+ // instructions rather than being loaded.
+ return !ISD::isBuildVectorAllZeros(N) &&
+ !ISD::isBuildVectorAllOnes(N);
+}
+
/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
return false;
// If V2 is a vector load, don't do this transformation; we would rather
// fold the load into a shufps op.
- if (ISD::isNON_EXTLoad(V2))
+ if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
return false;
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
- false, false, 0);
+ false, false, false, 0);
// Canonicalize it to a v4i32 or v8i32 shuffle.
SmallVector<int, 8> Mask;
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->getAlignment());
+ LDBase->isInvariant(), LDBase->getAlignment());
} else if (NumElems == 4 && LastLoadedElt == 1 &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
return SDValue();
}
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. We support two patterns:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load.
+/// The scalar load node is returned when a pattern is found,
+/// or SDValue() otherwise.
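+/// For example (a sketch in DAG terms):
+///   t = load f32
+///   v = scalar_to_vector t
+///   s = vector_shuffle v, undef, <0,0,0,0>
+/// matches the second pattern and can become a single vbroadcastss.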
+static SDValue isVectorBroadcast(SDValue &Op) {
+ EVT VT = Op.getValueType();
+ SDValue V = Op;
+
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ // The candidate load to be broadcast.
+ SDValue Ld;
+
+ switch (V.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ // The BUILD_VECTOR node must be a splat.
+ if (!isSplatVector(V.getNode()))
+ return SDValue();
+
+ Ld = V.getOperand(0);
+
+ // The suspected load node has several users. Make sure that all
+ // of its users are from the BUILD_VECTOR node.
+ if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Shuffles must have a splat mask where the first element is
+ // broadcasted.
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return SDValue();
+
+ Ld = Sc.getOperand(0);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ if (!Sc.hasOneUse() || !Ld.hasOneUse())
+ return SDValue();
+ break;
+ }
+ }
+
+ // The scalar source must be a normal load.
+ if (!ISD::isNormalLoad(Ld.getNode()))
+ return SDValue();
+
+ bool Is256 = VT.getSizeInBits() == 256;
+ bool Is128 = VT.getSizeInBits() == 128;
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+ // VBroadcast to YMM
+ if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
+ return Ld;
+
+ // VBroadcast to XMM
+ if (Is128 && (ScalarSize == 32))
+ return Ld;
+
+ // Unsupported broadcast.
+ return SDValue();
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
return getOnesVector(Op.getValueType(), DAG, dl);
}
+ SDValue LD = isVectorBroadcast(Op);
+ if (Subtarget->hasAVX() && LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
- SmallVector<unsigned, 4> LoQuad(4);
- SmallVector<unsigned, 4> HiQuad(4);
+ unsigned LoQuad[] = { 0, 0, 0, 0 };
+ unsigned HiQuad[] = { 0, 0, 0, 0 };
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
- SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+ unsigned *Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
+ if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+ V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+ // BUILD_VECTOR (load), undef
+ V = V.getOperand(0);
if (MayFoldLoad(V))
return true;
return false;
// turns into:
// (MOVLPSmr addr:$src1, VR128:$src2)
// So, recognize this potential and also use MOVLPS or MOVLPD
- if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+ else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
CanFoldLoad = true;
- // Both of them can't be memory operations though.
- if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
- CanFoldLoad = false;
-
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
if (CanFoldLoad) {
if (HasXMMInt && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
- return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+ // If we don't care about the second element, proceed to use movss.
+ if (SVOp->getMaskElt(1) != -1)
+ return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
// movl and movlp will both match v2i64, but v2i64 is never matched by
// movl earlier because we make it strict to avoid messing with the movlp load
// folding logic (see the code above getMOVLP call). Match it here then,
return 0;
}
-/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
-/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
-/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
-static bool isVectorBroadcast(SDValue &Op) {
- EVT VT = Op.getValueType();
- bool Is256 = VT.getSizeInBits() == 256;
-
- assert((VT.getSizeInBits() == 128 || Is256) &&
- "Unsupported type for vbroadcast node");
-
- SDValue V = Op;
- if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
- if (Is256 && !(V.hasOneUse() &&
- V.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V.getOperand(0).getOpcode() == ISD::UNDEF))
- return false;
-
- if (Is256)
- V = V.getOperand(1);
-
- if (!V.hasOneUse())
- return false;
-
- // Check the source scalar_to_vector type. 256-bit broadcasts are
- // supported for 32/64-bit sizes, while 128-bit ones are only supported
- // for 32-bit scalars.
- if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
- return false;
-
- unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
- if (ScalarSize != 32 && ScalarSize != 64)
- return false;
- if (!Is256 && ScalarSize == 64)
- return false;
-
- V = V.getOperand(0);
- if (!MayFoldLoad(V))
- return false;
-
- // Return the load node
- Op = V;
- return true;
-}
-
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
return Op;
// Use vbroadcast whenever the splat comes from a foldable load
- if (Subtarget->hasAVX() && isVectorBroadcast(V1))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+ SDValue LD = isVectorBroadcast(Op);
+ if (Subtarget->hasAVX() && LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
// Handle splats by matching through known shuffle masks
if ((Size == 128 && NumElem <= 4) ||
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
- bool isMMX = VT.getSizeInBits() == 64;
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
- // Shuffle operations on MMX not supported.
- if (isMMX)
- return Op;
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
// Vector shuffle lowering takes 3 steps:
//
// so the shuffle can be broken into other shuffles and the legalizer can
// try the lowering again.
//
- // The general ideia is that no vector_shuffle operation should be left to
+ // The general idea is that no vector_shuffle operation should be left to
// be matched during isel, all of them must be converted to a target specific
// node here.
Op.getOperand(0)),
Op.getOperand(1));
return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
- } else if (VT == MVT::i32) {
- // ExtractPS works with constant index.
+ } else if (VT == MVT::i32 || VT == MVT::i64) {
+ // ExtractPS/PEXTRQ work with a constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
+ } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
+ isa<ConstantSDNode>(N2)) {
// PINSR* works with constant index.
return Op;
}
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
}
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getIntPtrConstant(0),
- MachinePointerInfo(Ptr), false, false, 0);
+ MachinePointerInfo(Ptr),
+ false, false, false, 0);
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
if (model == TLSModel::InitialExec)
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+ Chain.getValue(1));
}
assert(false &&
Op.getValueType(), MMO);
Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
- false, false, 0);
+ false, false, false, 0);
}
return Result;
SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
// Add the halves; easiest way is to swap them into another reg first.
// Load the result.
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(), false, false, 0);
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
// Load the result.
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
- FIST, StackSlot, MachinePointerInfo(), false, false, 0);
+ FIST, StackSlot, MachinePointerInfo(),
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
if (VT.isVector()) {
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, MVT::v2i64,
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
// Shift sign bit right or left if the two operands have different types.
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
// Or the value with the sign bit.
// climbing the DAG back to the root, and it doesn't seem to be worth the
// effort.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE)
goto default_case;
if (ConstantSDNode *C =
UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
- }
- else if (SetCCOpcode == ISD::SETONE) {
+ } else if (SetCCOpcode == ISD::SETONE) {
SDValue ORD, NEQ;
ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
}
// Break 256-bit integer vector compare into smaller ones.
- if (!isFP && VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
return Lower256IntVSETCC(Op, DAG);
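+ // (With AVX2 the 256-bit integer compares map directly onto the same
+ // PCMPEQ/PCMPGT nodes selected below.)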
// We are handling one of the integer comparisons here. Since SSE only has
unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
bool Swap = false, Invert = false, FlipSigns = false;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
default: break;
- case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
- case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
- case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
- case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+ case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+ case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
}
switch (SetCCOpcode) {
if (Swap)
std::swap(Op0, Op1);
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX())
+ return SDValue();
+ if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX())
+ return SDValue();
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
- if (Cond.getOpcode() == X86ISD::SETCC ||
- Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
Cond = Cmp;
addTest = false;
}
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
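+ // Map the overflowing arithmetic node onto the flag-producing X86 node
+ // and the condition bit that signals overflow: carry (COND_B) for the
+ // unsigned add/sub forms, overflow (COND_O) for the rest.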
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
}
if (addTest) {
SDValue Dest = Op.getOperand(2);
DebugLoc dl = Op.getDebugLoc();
SDValue CC;
+ bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
- SDValue NewCond = LowerSETCC(Cond, DAG);
- if (NewCond.getNode())
- Cond = NewCond;
+ // Check for setcc([su]{add,sub,mul}o == 0).
+ if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ isa<ConstantSDNode>(Cond.getOperand(1)) &&
+ cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+ Cond.getOperand(0).getResNo() == 1 &&
+ (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+ Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+ Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+ Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+ Inverted = true;
+ Cond = Cond.getOperand(0);
+ } else {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
- if (Cond.getOpcode() == X86ISD::SETCC ||
- Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC ||
+ CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
break;
}
}
+ }
+ CondOpcode = Cond.getOpcode();
+ if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+ Cond.getOperand(0).getValueType() != MVT::i8)) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ unsigned X86Opcode;
+ unsigned X86Cond;
+ SDVTList VTs;
+ switch (CondOpcode) {
+ case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+ case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+ case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+ case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+ case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+ case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+ default: llvm_unreachable("unexpected overflowing operator");
+ }
+ if (Inverted)
+ X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+ if (CondOpcode == ISD::UMULO)
+ VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+ MVT::i32);
+ else
+ VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+ SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+ if (CondOpcode == ISD::UMULO)
+ Cond = X86Op.getValue(2);
+ else
+ Cond = X86Op.getValue(1);
+
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
CC = DAG.getConstant(CCode, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ // For FCMP_OEQ, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+ Dest = FalseBB;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_P, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ } else if (Cond.getOpcode() == ISD::SETCC &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ // For FCMP_UNE, we can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Op.getNode()->hasOneUse()) {
+ SDNode *User = *Op.getNode()->use_begin();
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_UNE.
+ if (User->getOpcode() == ISD::BR) {
+ SDValue FalseBB = User->getOperand(1);
+ SDNode *NewBR =
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ assert(NewBR == User);
+ (void)NewBR;
+
+ SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ Cond.getOperand(0), Cond.getOperand(1));
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ Dest = FalseBB;
+ }
+ }
}
}
Chain,
VAARG,
MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
}
SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ // Arithmetic intrinsics.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
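+ // The horizontal ops combine adjacent pairs; e.g. 128-bit hadd_ps
+ // yields { a0+a1, a2+a3, b0+b1, b2+b3 }.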
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_psrai_d:
NewIntNo = Intrinsic::x86_sse2_psra_d;
break;
+ case Intrinsic::x86_avx2_pslli_w:
+ NewIntNo = Intrinsic::x86_avx2_psll_w;
+ break;
+ case Intrinsic::x86_avx2_pslli_d:
+ NewIntNo = Intrinsic::x86_avx2_psll_d;
+ break;
+ case Intrinsic::x86_avx2_pslli_q:
+ NewIntNo = Intrinsic::x86_avx2_psll_q;
+ break;
+ case Intrinsic::x86_avx2_psrli_w:
+ NewIntNo = Intrinsic::x86_avx2_psrl_w;
+ break;
+ case Intrinsic::x86_avx2_psrli_d:
+ NewIntNo = Intrinsic::x86_avx2_psrl_d;
+ break;
+ case Intrinsic::x86_avx2_psrli_q:
+ NewIntNo = Intrinsic::x86_avx2_psrl_q;
+ break;
+ case Intrinsic::x86_avx2_psrai_w:
+ NewIntNo = Intrinsic::x86_avx2_psra_w;
+ break;
+ case Intrinsic::x86_avx2_psrai_d:
+ NewIntNo = Intrinsic::x86_avx2_psra_d;
+ break;
default: {
ShAmtVT = MVT::v2i32;
switch (IntNo) {
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, getPointerTy(),
FrameAddr, Offset),
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- RetAddrFI, MachinePointerInfo(), false, false, 0);
+ RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
return FrameAddr;
}
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false, false, 0);
// Transform as necessary
SDValue CWD1 =
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
return Lower256IntArith(Op, DAG);
- assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
DebugLoc dl = Op.getDebugLoc();
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ if (VT == MVT::v4i64) {
+ assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
+
+ // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
+ // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
+ // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
+ // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
+ // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
+ //
+ // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
+ // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
+ // return AloBlo + AloBhi + AhiBlo;
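+ // This follows from
+ //   a*b mod 2^64 = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32),
+ // since the aHi*bHi term is shifted out entirely.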
+
+ SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ A, DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ B, DAG.getConstant(32, MVT::i32));
+ SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ A, B);
+ SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ A, Bhi);
+ SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
+ Ahi, B);
+ AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ AloBhi, DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ AhiBlo, DAG.getConstant(32, MVT::i32));
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ return Res;
+ }
+
+ assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+
// ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
// ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
// ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
- SDValue A = Op.getOperand(0);
- SDValue B = Op.getOperand(1);
-
SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
A, DAG.getConstant(32, MVT::i32));
if (!Subtarget->hasXMMInt())
return SDValue();
- // Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.getSizeInBits() == 256) {
- int NumElems = VT.getVectorNumElements();
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- // Extract the two vectors
- SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
-
- // Recreate the shift amount vectors
- SDValue Amt1, Amt2;
- if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
- // Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (int i = 0; i < NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (int i = NumElems/2; i < NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
-
- Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt1Csts[0], NumElems/2);
- Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
- &Amt2Csts[0], NumElems/2);
- } else {
- // Variable shift amount
- Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
- Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
- }
-
- // Issue new vector shifts for the smaller types
- V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
- V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
- }
-
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
SDValue SclrAmt = Amt->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
uint64_t ShiftAmt = C->getZExtValue();
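+ // SSE has no byte-granular shifts, so v16i8 shifts by a splat constant
+ // are emulated with a 16-bit shift plus a mask of the bits that leak
+ // across byte boundaries.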
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+
if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ }
+
+ // R s>> a === ((R u>> a) ^ m) - m
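+ // where m = 0x80 >> a marks the bit the sign lands on after the
+ // logical shift.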
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
+
+ if (Subtarget->hasAVX2()) {
+ if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ }
}
}
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
// r = pblendv(r, psllw(r & (char16)15, 4), a);
M = DAG.getNode(ISD::AND, dl, VT, R, M);
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, 16);
+ false, false, false, 16);
// r = pblendv(r, psllw(r & (char16)63, 2), a);
M = DAG.getNode(ISD::AND, dl, VT, R, M);
R, DAG.getNode(ISD::ADD, dl, VT, R, R));
return R;
}
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.getSizeInBits() == 256) {
+ int NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (int i = 0; i < NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (int i = NumElems/2; i < NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
+
return SDValue();
}
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
- case MVT::v2i64: {
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
- SRAIntrinsicsID = 0;
- break;
- }
case MVT::v4i32: {
SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
DAG.getConstant(SHLIntrinsicsID, MVT::i32),
Node->getOperand(0), ShAmt);
- // In case of 1 bit sext, no need to shr
- if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
-
- if (SRAIntrinsicsID) {
- Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SRAIntrinsicsID, MVT::i32),
- Tmp1, ShAmt);
- }
- return Tmp1;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+ Tmp1, ShAmt);
}
return SDValue();
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) const {
- EVT T = Node->getValueType(0);
DebugLoc dl = Node->getDebugLoc();
- assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
SDValue Chain = Node->getOperand(0);
SDValue In1 = Node->getOperand(1);
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
- MachinePointerInfo(), false, false, 0));
+ MachinePointerInfo(),
+ false, false, false, 0));
}
return;
}
case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
case X86ISD::PSIGND: return "X86ISD::PSIGND";
+ case X86ISD::BLENDV: return "X86ISD::BLENDV";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::ANDN: return "X86ISD::ANDN";
+ case X86ISD::BLSI: return "X86ISD::BLSI";
+ case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
+ case X86ISD::BLSR: return "X86ISD::BLSR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI->getOperand(1).getReg(),
physSPReg = Is64Bit ? X86::RSP : X86::ESP;
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
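+ // Give the SUB result its own virtual register so tmpSPVReg (the saved
+ // copy of the stack pointer) keeps a single definition; the compare and
+ // bumpMBB below use the subtracted value.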
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
- .addReg(tmpSPVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
- default: assert(false && "Unexpected instr type to insert");
+ default: assert(0 && "Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
- assert(!"TAILJMP64 would not be touched here.");
+ assert(0 && "TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
Mask.getBitWidth() - 1);
break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned NumLoBits = 0;
+ switch (IntId) {
+ default: break;
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse2_pmovmskb_128: {
+ // High bits of movmskp{s|d}, pmovmskb are known zero.
+ switch (IntId) {
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
+ case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
+ case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
+ case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ }
+ KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - NumLoBits);
+ break;
+ }
+ }
+ break;
+ }
}
}
// Load the scalar.
SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
ScalarAddr, MachinePointerInfo(),
- false, false, 0);
+ false, false, false, 0);
// Replace the exact with the load.
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
- if (Subtarget->hasXMMInt() && Cond.getOpcode() == ISD::SETCC &&
- (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64 ||
- LHS.getValueType() == MVT::v4f32 || LHS.getValueType() == MVT::v2f64)) {
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ (Subtarget->hasXMMInt() ||
+ (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
- if (N1C && N0.getOpcode() == ISD::AND &&
+ if (VT.isInteger() && !VT.isVector() &&
+ N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
}
}
+
+ // Hardware support for vector shifts is sparse, which often forces us to
+ // scalarize the vector operations. Also, on Sandy Bridge ADD is faster
+ // than SHL.
+ // (shl V, 1) -> add V,V
+ if (isSplatVector(N1.getNode())) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1C && (1 == N1C->getZExtValue())) {
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+ }
+ }
+
return SDValue();
}
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
- if (!VT.isVector() && VT.isInteger() &&
- N->getOpcode() == ISD::SHL)
- return PerformSHLCombine(N, DAG);
+ if (N->getOpcode() == ISD::SHL) {
+ SDValue V = PerformSHLCombine(N, DAG);
+ if (V.getNode()) return V;
+ }
// On X86 with SSE2 support, we can transform this to a vector shift if
// all elements are shifted by the same amount. We can't do this in legalize.
if (!Subtarget->hasXMMInt())
return SDValue();
- if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ (!Subtarget->hasAVX2() ||
+ (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
SDValue ShAmtOp = N->getOperand(1);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v4i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
case ISD::SRA:
if (VT == MVT::v4i32)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
case ISD::SRL:
if (VT == MVT::v2i64)
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
ValOp, BaseShAmt);
+ if (VT == MVT::v4i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v16i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+ ValOp, BaseShAmt);
break;
}
return SDValue();
if (R.getNode())
return R;
+ EVT VT = N->getValueType(0);
+
+ // Create ANDN, BLSI, and BLSR instructions
+ // BLSI is X & (-X)
+ // BLSR is X & (X-1)
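+ // ANDN computes ~X & Y in a single instruction. As a worked example,
+ // X = 0b10110100 gives BLSI(X) = 0b00000100 (isolate the lowest set bit)
+ // and BLSR(X) = 0b10110000 (clear the lowest set bit).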
+ if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check LHS for not
+ if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
+ // Check RHS for not
+ if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
+
+ // Check LHS for neg
+ if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+ isZero(N0.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+ // Check RHS for neg
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+ isZero(N1.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+ // Check LHS for X-1
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+ // Check RHS for X-1
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+
+ return SDValue();
+ }
+
// Want to form ANDNP nodes:
// 1) In the hopes of then easily combining them with OR and AND nodes
// to form PBLEND/PSIGN.
// 2) To match ANDN packed intrinsics
- EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
return SDValue();
}
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // Create BLSMSK instructions by finding X ^ (X-1)
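+ // For example, X = 0b10100 gives X-1 = 0b10011 and X ^ (X-1) = 0b00111,
+ // a mask covering all bits up to and including the lowest set bit.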
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
+
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
+
+ return SDValue();
+}
+
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+ // If this is a vector extending load (EXTLOAD) then attempt to optimize it
+ // using a shuffle. The shuffles require SSE4.1.
+ // TODO: It is possible to support ZExt by zeroing the undef values
+ // during the shuffle phase or after the shuffle.
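+ // As an illustration (taking v4i8 as the memory type), an EXTLOAD from
+ // v4i8 to v4i32 becomes: load a single i32, SCALAR_TO_VECTOR it into a
+ // v4i32, bitcast to v16i8, shuffle bytes 0..3 into byte positions 0, 4, 8
+ // and 12, and bitcast back to v4i32. The upper bytes of each lane remain
+ // undef, which is acceptable for an any-extending load.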
+ if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned RegSz = RegVT.getSizeInBits();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ // All sizes must be a power of two
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+ // Attempt to load the original value using a single load op.
+ // Find a legal scalar integer type whose size matches the loaded memory size.
+ MVT SclrLoadTy = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ SclrLoadTy = Tp;
+ break;
+ }
+ }
+
+ // Proceed only if a matching scalar load type was found.
+ if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+ EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+ RegSz/SclrLoadTy.getSizeInBits());
+
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ RegSz/MemVT.getScalarType().getSizeInBits());
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
+
+ // Insert the word loaded into a vector.
+ SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ LoadUnitVecVT, ScalarLoad);
+
+ // Bitcast the loaded value to a vector of the memory element type, as wide
+ // as the target register type.
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+ unsigned SizeRatio = RegSz/MemSz;
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(SlicedVec.getValueType()),
+ ShuffleVec.data());
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ // Replace the original load with the new sequence
+ // and return the new chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+ return SDValue(ScalarLoad.getNode(), 1);
+ }
+
+ return SDValue();
+}
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// The From and To sizes and the element count must all be powers of two.
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
// We are going to use the original vector elt for storing.
- // accumulated smaller vector elements must be a multiple of bigger size.
- if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromSz) % ToSz) return SDValue();
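+ // For example, truncating a v8i16 store (FromSz = 16) to v8i8 (ToSz = 8)
+ // gives NumElems * FromSz = 128, which is divisible by ToSz.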
+
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->getAlignment());
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Ld->getAlignment());
SDValue NewChain = NewLd.getValue(1);
if (TokenFactorIndex != -1) {
Ops.push_back(NewChain);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
Ld->getPointerInfo(),
Ld->isVolatile(), Ld->isNonTemporal(),
- Ld->getAlignment());
+ Ld->isInvariant(), Ld->getAlignment());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(),
MinAlign(Ld->getAlignment(), 4));
SDValue NewChain = LoLd.getValue(1);
return SDValue();
}
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ EVT VT = LHS.getValueType();
+ unsigned N = VT.getVectorNumElements();
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 8> LMask(N);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ A = LHS.getOperand(0);
+ if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ B = LHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+ } else {
+ if (LHS.getOpcode() != ISD::UNDEF)
+ A = LHS;
+ for (unsigned i = 0; i != N; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 8> RMask(N);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ C = RHS.getOperand(0);
+ if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ D = RHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+ } else {
+ if (RHS.getOpcode() != ISD::UNDEF)
+ C = RHS;
+ for (unsigned i = 0; i != N; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned Idx = RMask[i];
+ if (Idx < N)
+ RMask[i] += N;
+ else if (Idx < 2*N)
+ RMask[i] -= N;
+ }
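+ // For example, with N = 4, RHS = VECTOR_SHUFFLE B, A, <1,3,5,7> is
+ // rewritten to use the mask <5,7,1,3>, which selects the same elements
+ // once the operands are viewed in A, B order.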
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+ // Ignore any UNDEF components.
+ if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+ || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+ !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+ return false;
+ }
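+ // For example, with N = 4 the masks LMask = <0,2,4,6> and RMask = <1,3,5,7>
+ // pass this check, yielding < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.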
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
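+ // For example, (fadd (shuffle A,B,<0,2,4,6>), (shuffle A,B,<1,3,5,7>))
+ // becomes (fhadd A, B), which maps to HADDPS/HADDPD.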
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, true))
+ return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, false))
+ return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+ case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);