Remove some patterns for matching vector_shuffle instructions since vector_shuffles...

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 4d0662a7fd02dbdac81e63241d67f5bca33d76fe..1343b571ece4934b0bf7185be30418dc6d1b9286 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39,7 +39,6 @@
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
@@ -179,8 +178,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // For 64-bit since we have so many registers use the ILP scheduler, for
    // 32-bit code use the register pressure specific scheduling.
+  // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
    if (Subtarget->is64Bit())
      setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->isAtom()) 
+    setSchedulingPreference(Sched::Hybrid);
    else
      setSchedulingPreference(Sched::RegPressure);
    setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -1218,6 +1220,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::LOAD);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SINT_TO_FP);
    if (Subtarget->is64Bit())
      setTargetDAGCombine(ISD::MUL);
@@ -2510,8 +2514,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    // registers.
    if (UseRegMask) {
      const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-    const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
-    Ops.push_back(DAG.getRegisterMask(Mask));
+    if (const uint32_t *Mask = TRI->getCallPreservedMask(CallConv))
+      Ops.push_back(DAG.getRegisterMask(Mask));
    }
  
    if (InFlag.getNode())
@@ -3709,7 +3713,7 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
  /// to the same elements of the low, but to the higher half of the source.
  /// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
  static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
    if (!HasAVX)
      return false;
@@ -3738,35 +3742,6 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
    return true;
  }
  
-/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
-static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned LaneSize = NumElts/NumLanes;
-
-  // Although the mask is equal for both lanes do it twice to get the cases
-  // where a mask will match because the same mask element is undef on the
-  // first half but valid on the second. This would get pathological cases
-  // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
-  unsigned Shift = (LaneSize == 4) ? 2 : 1;
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int MaskElt = SVOp->getMaskElt(i);
-    if (MaskElt < 0)
-      continue;
-    MaskElt %= LaneSize;
-    unsigned Shamt = i;
-    // VPERMILPSY, the mask of the first half must be equal to the second one
-    if (NumElts == 8) Shamt %= LaneSize;
-    Mask |= MaskElt << (Shamt*Shift);
-  }
-
-  return Mask;
-}
-
  /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
  /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
  /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4207,7 +4182,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  
  /// getZeroVector - Returns a vector of specified type with all zero elements.
  ///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, bool HasAVX2,
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG, DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
  
@@ -4215,7 +4190,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, bool HasAVX2,
    // to their dest type. This ensures they get CSE'd.
    SDValue Vec;
    if (VT.getSizeInBits() == 128) {  // SSE
-    if (HasSSE2) {  // SSE2
+    if (Subtarget->hasSSE2()) {  // SSE2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
      } else { // SSE1
@@ -4223,7 +4198,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, bool HasAVX2,
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
      }
    } else if (VT.getSizeInBits() == 256) { // AVX
-    if (HasAVX2) { // AVX2
+    if (Subtarget->hasAVX2()) { // AVX2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
@@ -4270,23 +4245,12 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
  
  /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
  /// that point to V2 points to its first element.
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  bool Changed = false;
-  SmallVector<int, 8> MaskVec(SVOp->getMask().begin(), SVOp->getMask().end());
-
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
    for (unsigned i = 0; i != NumElems; ++i) {
-    if (MaskVec[i] > (int)NumElems) {
-      MaskVec[i] = NumElems;
-      Changed = true;
+    if (Mask[i] > (int)NumElems) {
+      Mask[i] = NumElems;
      }
    }
-  if (Changed)
-    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
-                                SVOp->getOperand(1), &MaskVec[0]);
-  return SDValue(SVOp, 0);
  }
  
  /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4427,8 +4391,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                             SelectionDAG &DAG) {
    EVT VT = V2.getValueType();
    SDValue V1 = IsZero
-    ? getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(), DAG,
-                    V2.getDebugLoc()) : DAG.getUNDEF(VT);
+    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
    unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 16> MaskVec;
    for (unsigned i = 0; i != NumElems; ++i)
@@ -4455,14 +4418,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
      if (Index < 0)
        return DAG.getUNDEF(VT.getVectorElementType());
  
-    int NumElems = VT.getVectorNumElements();
-    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0)
+                                           : SV->getOperand(1);
      return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
    }
  
    // Recurse into target specific vector shuffles to find scalars.
    if (isTargetShuffle(Opcode)) {
-    int NumElems = VT.getVectorNumElements();
+    unsigned NumElems = VT.getVectorNumElements();
      SmallVector<unsigned, 16> ShuffleMask;
      SDValue ImmN;
  
@@ -4485,9 +4449,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        DecodeMOVLHPSMask(NumElems, ShuffleMask);
        break;
      case X86ISD::PSHUFD:
+    case X86ISD::VPERMILP:
        ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodePSHUFMask(NumElems,
-                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
+      DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
        break;
      case X86ISD::PSHUFHW:
@@ -4509,14 +4473,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                   Depth+1);
      }
-    case X86ISD::VPERMILP:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                        ShuffleMask);
-      break;
      case X86ISD::VPERM2X128:
        ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                             ShuffleMask);
        break;
      case X86ISD::MOVDDUP:
@@ -4527,16 +4486,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
      case X86ISD::MOVSLDUP:
      case X86ISD::PALIGN:
        return SDValue(); // Not yet implemented.
-    default:
-      assert(0 && "unknown target shuffle node");
-      return SDValue();
+    default: llvm_unreachable("unknown target shuffle node");
      }
  
      Index = ShuffleMask[Index];
      if (Index < 0)
        return DAG.getUNDEF(VT.getVectorElementType());
  
-    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+    SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0)
+                                           : N->getOperand(1);
      return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                                 Depth+1);
    }
@@ -4697,6 +4655,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
  static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                         unsigned NumNonZero, unsigned NumZero,
                                         SelectionDAG &DAG,
+                                       const X86Subtarget* Subtarget,
                                         const TargetLowering &TLI) {
    if (NumNonZero > 8)
      return SDValue();
@@ -4708,8 +4667,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
      bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
      if (ThisIsNonZero && First) {
        if (NumZero)
-        V = getZeroVector(MVT::v8i16, /*HasSSE2*/ true, /*HasAVX2*/ false,
-                          DAG, dl);
+        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
@@ -4745,6 +4703,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
  static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG,
+                                     const X86Subtarget* Subtarget,
                                       const TargetLowering &TLI) {
    if (NumNonZero > 4)
      return SDValue();
@@ -4757,8 +4716,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
      if (isNonZero) {
        if (First) {
          if (NumZero)
-          V = getZeroVector(MVT::v8i16, /*HasSSE2*/ true, /*HasAVX2*/ false,
-                            DAG, dl);
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
          else
            V = DAG.getUNDEF(MVT::v8i16);
          First = false;
@@ -4997,6 +4955,10 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
    if (!ISD::isNormalLoad(Ld.getNode()))
      return SDValue();
  
+  // Reject loads that have uses of the chain result
+  if (Ld->hasAnyUseOfValue(1))
+    return SDValue();
+
    bool Is256 = VT.getSizeInBits() == 256;
    bool Is128 = VT.getSizeInBits() == 128;
    unsigned ScalarSize = Ld.getValueType().getSizeInBits();
@@ -5040,8 +5002,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      if (VT == MVT::v4i32 || VT == MVT::v8i32)
        return Op;
  
-    return getZeroVector(VT, Subtarget->hasSSE2(),
-                         Subtarget->hasAVX2(), DAG, dl);
+    return getZeroVector(VT, Subtarget, DAG, dl);
    }
  
    // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
@@ -5135,8 +5096,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
            (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
          if (VT.getSizeInBits() == 256) {
-          SDValue ZeroVec = getZeroVector(VT, Subtarget->hasSSE2(),
-                                          Subtarget->hasAVX2(), DAG, dl);
+          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
            return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                               Item, DAG.getIntPtrConstant(0));
          }
@@ -5150,8 +5110,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          if (VT.getSizeInBits() == 256) {
-          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget->hasSSE2(),
-                                          Subtarget->hasAVX2(), DAG, dl);
+          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
            Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
                                      DAG, dl);
          } else {
@@ -5215,9 +5174,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
    // For AVX-length vectors, build the individual 128-bit pieces and use
    // shuffles to put them in place.
-  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+  if (VT.getSizeInBits() == 256) {
      SmallVector<SDValue, 32> V;
-    for (unsigned i = 0; i < NumElems; ++i)
+    for (unsigned i = 0; i != NumElems; ++i)
        V.push_back(Op.getOperand(i));
  
      EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
@@ -5249,13 +5208,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    // If element VT is < 32 bits, convert it to inserts into a zero vector.
    if (EVTBits == 8 && NumElems == 16) {
      SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
-                                        *this);
+                                        Subtarget, *this);
      if (V.getNode()) return V;
    }
  
    if (EVTBits == 16 && NumElems == 8) {
      SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
-                                      *this);
+                                      Subtarget, *this);
      if (V.getNode()) return V;
    }
  
@@ -5265,8 +5224,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      for (unsigned i = 0; i < 4; ++i) {
        bool isZero = !(NonZeros & (1 << i));
        if (isZero)
-        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(),
-                             DAG, dl);
+        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
        else
          V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      }
@@ -5294,8 +5252,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      int MaskVec[] = {
        Reverse1 ? 1 : 0,
        Reverse1 ? 0 : 1,
-      static_cast<int>(Reverse2 ? 1-NumElems :   NumElems),
-      static_cast<int>(Reverse2 ?   NumElems : 1+NumElems)
+      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
      };
      return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
    }
@@ -5442,7 +5400,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // mask values count as coming from any quadword, for better codegen.
    unsigned LoQuad[] = { 0, 0, 0, 0 };
    unsigned HiQuad[] = { 0, 0, 0, 0 };
-  BitVector InputQuads(4);
+  std::bitset<4> InputQuads;
    for (unsigned i = 0; i < 8; ++i) {
      unsigned *Quad = i < 4 ? LoQuad : HiQuad;
      int EltIdx = SVOp->getMaskElt(i);
@@ -5484,8 +5442,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    bool V2Used = InputQuads[2] || InputQuads[3];
    if (Subtarget->hasSSSE3()) {
      if (InputQuads.count() == 2 && V1Used && V2Used) {
-      BestLoQuad = InputQuads.find_first();
-      BestHiQuad = InputQuads.find_next(BestLoQuad);
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
      }
      if (InputQuads.count() > 2) {
        BestLoQuad = -1;
@@ -5838,7 +5796,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
    unsigned NewWidth = (NumElems == 4) ? 2 : 4;
    EVT NewVT;
    switch (VT.getSimpleVT().SimpleTy) {
-  default: assert(false && "Unexpected!");
+  default: llvm_unreachable("Unexpected!");
    case MVT::v4f32: NewVT = MVT::v2f64; break;
    case MVT::v4i32: NewVT = MVT::v2i64; break;
    case MVT::v8i16: NewVT = MVT::v4i32; break;
@@ -6370,8 +6328,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
    SDValue V2 = Op.getOperand(1);
  
    if (isZeroShuffle(SVOp))
-    return getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(),
-                         DAG, dl);
+    return getZeroVector(VT, Subtarget, DAG, dl);
  
    // Handle splat operations
    if (SVOp->isSplat()) {
@@ -6499,6 +6456,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
      unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
  
+    if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
+
      if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
        return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
  
@@ -6564,18 +6524,16 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    V1IsSplat = isSplatVector(V1.getNode());
    V2IsSplat = isSplatVector(V2.getNode());
  
+  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
    // Canonicalize the splat or undef, if present, to be on the RHS.
-  if (V1IsSplat && !V2IsSplat) {
-    Op = CommuteVectorShuffle(SVOp, DAG);
-    SVOp = cast<ShuffleVectorSDNode>(Op);
-    V1 = SVOp->getOperand(0);
-    V2 = SVOp->getOperand(1);
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
      std::swap(V1IsSplat, V2IsSplat);
      Commuted = true;
    }
  
-  ArrayRef<int> M = SVOp->getMask();
-
    if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
      // Shuffling low element of v1 into undef, just return v1.
      if (V2IsUndef)
@@ -6595,29 +6553,29 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (V2IsSplat) {
      // Normalize mask so all entries that point to V2 points to its first
      // element then try to match unpck{h|l} again. If match, return a
-    // new vector_shuffle with the corrected mask.
-    SDValue NewMask = NormalizeMask(SVOp, DAG);
-    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
-    if (NSVOp != SVOp) {
-      if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      }
+    // new vector_shuffle with the corrected mask.
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
      }
    }
  
    if (Commuted) {
      // Commute is back and try unpck* again.
      // FIXME: this seems wrong.
-    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
-    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = false;
  
-    if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
+    if (isUNPCKLMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-    if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
+    if (isUNPCKHMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
    }
  
    // Normalize the node to match x86 shuffle ops if needed
@@ -6668,9 +6626,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
  
    // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasAVX))
+  if (isVPERMILPMask(M, VT, HasAVX)) {
+    if (HasAVX2 && VT == MVT::v8i32)
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+                                  X86::getShuffleSHUFImmediate(SVOp), DAG);
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
-                                getShuffleVPERMILPImmediate(SVOp), DAG);
+                                X86::getShuffleSHUFImmediate(SVOp), DAG);
+  }
  
    // Handle VPERM2F128/VPERM2I128 permutations
    if (isVPERM2X128Mask(M, VT, HasAVX))
@@ -7559,12 +7521,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    LLVMContext *Context = DAG.getContext();
  
    // Build some magic constants.
-  SmallVector<Constant*,4> CV0;
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
-  Constant *C0 = ConstantVector::get(CV0);
+  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
    SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
    SmallVector<Constant*,2> CV1;
@@ -8316,7 +8274,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    if (isFP) {
      unsigned SSECC = 8;
      EVT EltVT = Op0.getValueType().getVectorElementType();
-    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
  
      bool Swap = false;
  
@@ -9452,6 +9410,10 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    }
  
    // Arithmetic intrinsics.
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_avx2_pmulu_dq:
+    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_sse3_hadd_ps:
    case Intrinsic::x86_sse3_hadd_pd:
    case Intrinsic::x86_avx_hadd_ps_256:
@@ -9492,26 +9454,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx2_psrav_d_256:
      return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
-  case Intrinsic::x86_sse2_pcmpeq_b:
-  case Intrinsic::x86_sse2_pcmpeq_w:
-  case Intrinsic::x86_sse2_pcmpeq_d:
-  case Intrinsic::x86_sse41_pcmpeqq:
-  case Intrinsic::x86_avx2_pcmpeq_b:
-  case Intrinsic::x86_avx2_pcmpeq_w:
-  case Intrinsic::x86_avx2_pcmpeq_d:
-  case Intrinsic::x86_avx2_pcmpeq_q:
-    return DAG.getNode(X86ISD::PCMPEQ, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  case Intrinsic::x86_sse2_pcmpgt_b:
-  case Intrinsic::x86_sse2_pcmpgt_w:
-  case Intrinsic::x86_sse2_pcmpgt_d:
-  case Intrinsic::x86_sse42_pcmpgtq:
-  case Intrinsic::x86_avx2_pcmpgt_b:
-  case Intrinsic::x86_avx2_pcmpgt_w:
-  case Intrinsic::x86_avx2_pcmpgt_d:
-  case Intrinsic::x86_avx2_pcmpgt_q:
-    return DAG.getNode(X86ISD::PCMPGT, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_ssse3_pshuf_b_128:
    case Intrinsic::x86_avx2_pshuf_b:
      return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
@@ -9533,6 +9475,12 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx2_vperm2i128:
      return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::x86_avx_vpermil_ps:
+  case Intrinsic::x86_avx_vpermil_pd:
+  case Intrinsic::x86_avx_vpermil_ps_256:
+  case Intrinsic::x86_avx_vpermil_pd_256:
+    return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
  
    // ptest and testp intrinsics. The intrinsic these come from are designed to
    // return an integer value, not just an instruction so lower it to the ptest
@@ -10131,78 +10079,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
    if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
      return Lower256IntArith(Op, DAG);
  
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+         "Only know how to lower V2I64/V4I64 multiply");
+
    DebugLoc dl = Op.getDebugLoc();
  
+  //  Ahi = psrlqi(a, 32);
+  //  Bhi = psrlqi(b, 32);
+  //
+  //  AloBlo = pmuludq(a, b);
+  //  AloBhi = pmuludq(a, Bhi);
+  //  AhiBlo = pmuludq(Ahi, b);
+
+  //  AloBhi = psllqi(AloBhi, 32);
+  //  AhiBlo = psllqi(AhiBlo, 32);
+  //  return AloBlo + AloBhi + AhiBlo;
+
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);
  
-  if (VT == MVT::v4i64) {
-    assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
+  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
  
-    //  ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
-    //  ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
-    //  ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
-    //  ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
-    //  ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
-    //
-    //  AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
-    //  AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
-    //  return AloBlo + AloBhi + AhiBlo;
-
-    SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
-                              DAG.getConstant(32, MVT::i32));
-    SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
-                              DAG.getConstant(32, MVT::i32));
-    SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         A, B);
-    SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         A, Bhi);
-    SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         Ahi, B);
-    AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
-                         DAG.getConstant(32, MVT::i32));
-    AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
-                         DAG.getConstant(32, MVT::i32));
-    SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
-    Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
-    return Res;
-  }
+  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
+  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
  
-  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+  // Bit cast to 32-bit vectors for PMULUDQ
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
  
-  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
-  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
-  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
-  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
-  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
-  //
-  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
-  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
-  //  return AloBlo + AloBhi + AhiBlo;
+  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
+  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
  
-  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
-                            DAG.getConstant(32, MVT::i32));
-  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
-                            DAG.getConstant(32, MVT::i32));
-  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       A, B);
-  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       A, Bhi);
-  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       Ahi, B);
-  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
-                       DAG.getConstant(32, MVT::i32));
-  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
-                       DAG.getConstant(32, MVT::i32));
    SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
-  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
-  return Res;
+  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  }
  
  SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
@@ -10264,8 +10180,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
          if (Op.getOpcode() == ISD::SRA) {
            if (ShiftAmt == 7) {
              // R s>> 7  ===  R s< 0
-            SDValue Zeros = getZeroVector(VT, /* HasSSE2 */true,
-                                          /* HasAVX2 */false, DAG, dl);
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
              return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
            }
  
@@ -10308,8 +10223,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
          if (Op.getOpcode() == ISD::SRA) {
            if (ShiftAmt == 7) {
              // R s>> 7  ===  R s< 0
-            SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */,
-                                          true /* HasAVX2 */, DAG, dl);
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
              return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
            }
  
@@ -10331,8 +10245,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
      Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
                       DAG.getConstant(23, MVT::i32));
  
-    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
-    Constant *C = ConstantVector::getSplat(4, CI);
+    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
+    Constant *C = ConstantDataVector::get(*Context, CV);
      SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
      SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                   MachinePointerInfo::getConstantPool(),
@@ -10653,8 +10567,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    unsigned Reg = 0;
    unsigned size = 0;
    switch(T.getSimpleVT().SimpleTy) {
-  default:
-    assert(false && "Invalid value type!");
+  default: llvm_unreachable("Invalid value type!");
    case MVT::i8:  Reg = X86::AL;  size = 1; break;
    case MVT::i16: Reg = X86::AX;  size = 2; break;
    case MVT::i32: Reg = X86::EAX; size = 4; break;
@@ -10772,7 +10685,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
    unsigned Opc;
    bool ExtraOp = false;
    switch (Op.getOpcode()) {
-  default: assert(0 && "Invalid code");
+  default: llvm_unreachable("Invalid code");
    case ISD::ADDC: Opc = X86ISD::ADD; break;
    case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
    case ISD::SUBC: Opc = X86ISD::SUB; break;
@@ -10914,8 +10827,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    DebugLoc dl = N->getDebugLoc();
    switch (N->getOpcode()) {
    default:
-    assert(false && "Do not know how to custom type legalize this operation!");
-    return;
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
    case ISD::SIGN_EXTEND_INREG:
    case ISD::ADDC:
    case ISD::ADDE:
@@ -11140,6 +11052,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
    case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -12071,6 +11984,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    return EndMBB;
  }
  
+// The EFLAGS operand of SelectItr may lack a kill marker because EFLAGS had
+// multiple uses and ISel didn't know which one to mark. Scan the rest of BB
+// (and the live-ins of its successors) to decide whether SelectItr is the
+// last reader. If so, add the kill marker and return true; return false,
+// leaving SelectItr untouched, when a later read or live-in use exists.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward from the instruction after SelectItr for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false; // A later instruction still reads EFLAGS.
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Redefined before any read: needs kill flag - update below.
+  }
+
+  // If we hit the end of the block without finding a def, check whether
+  // EFLAGS is live into a successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false; // A successor lists EFLAGS as live-in.
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectItr should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
  MachineBasicBlock *
  X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                       MachineBasicBlock *BB) const {
@@ -12100,7 +12049,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  
    // If the EFLAGS register isn't dead in the terminator, then claim that it's
    // live into the sink and copy blocks.
-  if (!MI->killsRegister(X86::EFLAGS)) {
+  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
      copy0MBB->addLiveIn(X86::EFLAGS);
      sinkMBB->addLiveIn(X86::EFLAGS);
    }
@@ -12359,11 +12310,11 @@ MachineBasicBlock *
  X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
    switch (MI->getOpcode()) {
-  default: assert(0 && "Unexpected instr type to insert");
+  default: llvm_unreachable("Unexpected instr type to insert");
    case X86::TAILJMPd64:
    case X86::TAILJMPr64:
    case X86::TAILJMPm64:
-    assert(0 && "TAILJMP64 would not be touched here.");
+    llvm_unreachable("TAILJMP64 would not be touched here.");
    case X86::TCRETURNdi64:
    case X86::TCRETURNri64:
    case X86::TCRETURNmi64:
@@ -12736,6 +12687,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
      case Intrinsic::x86_avx2_pmovmskb: {
        // High bits of movmskp{s|d}, pmovmskb are known zero.
        switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
          case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
          case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
          case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
@@ -12814,7 +12766,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
  /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
  static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
-                                        bool HasAVX2) {
+                                        const X86Subtarget* Subtarget) {
    DebugLoc dl = N->getDebugLoc();
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
    SDValue V1 = SVOp->getOperand(0);
@@ -12866,7 +12818,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
  
      // Emit a zeroed vector and insert the desired subvector on its
      // first half.
-    SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, HasAVX2, DAG, dl);
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
      SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
                           DAG.getConstant(0, MVT::i32), DAG, dl);
      return DCI.CombineTo(N, InsV);
@@ -12911,7 +12863,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
    if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
        N->getOpcode() == ISD::VECTOR_SHUFFLE)
-    return PerformShuffleCombine256(N, DAG, DCI, Subtarget->hasAVX2());
+    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  
    // Only handle 128 wide vector from here on.
    if (VT.getSizeInBits() != 128)
@@ -12927,6 +12879,82 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
  }
  
+
+/// PerformTruncateCombine - Rewrite a truncate of a 256-bit vector to a
+/// 128-bit vector (v4i64 -> v4i32, v8i32 -> v8i16) as a sequence of vector
+/// shuffles on the two 128-bit halves of the operand. Only applied when AVX
+/// is available and before ops are legalized; else returns an empty SDValue.
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                          DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                          DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD: move elements 0 and 2 (low half of each i64) to slots 0, 1.
+    int ShufMask1[] = {0, 2, 0, 0};
+
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+                                ShufMask1);
+
+    // MOVLHPS: low two elements of OpLo, then low two elements of OpHi.
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+  }
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                          DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                          DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB: pack the low two bytes of each i32 into the low 8 bytes.
+    int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13, 
+                      -1, -1, -1, -1, -1, -1, -1, -1};
+
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS: low two elements of OpLo, then low two elements of OpHi.
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+  }
+
+  return SDValue();
+}
+
  /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
  /// generation and convert it from being a bunch of shuffles and extracts
  /// to a simple store and scalar loads to extract the elements.
@@ -13543,6 +13571,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
  ///                       when possible.
  static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
    EVT VT = N->getValueType(0);
    if (N->getOpcode() == ISD::SHL) {
@@ -13606,9 +13635,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
             BaseShAmt = InVec.getOperand(1);
         }
      }
-    if (BaseShAmt.getNode() == 0)
+    if (BaseShAmt.getNode() == 0) {
+      // Don't create instructions with illegal types after legalize
+      // types has run.
+      if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) &&
+          !DCI.isBeforeLegalize())
+        return SDValue();
+
        BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                                DAG.getIntPtrConstant(0));
+    }
    } else
      return SDValue();
  
@@ -14245,7 +14281,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        Ld = cast<LoadSDNode>(St->getChain());
      else if (St->getValue().hasOneUse() &&
               ChainVal->getOpcode() == ISD::TokenFactor) {
-      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
          if (ChainVal->getOperand(i).getNode() == LdVal) {
            TokenFactorIndex = i;
            Ld = cast<LoadSDNode>(St->getValue());
@@ -14542,6 +14578,56 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
    return SDValue();
  }
  
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) 
+    return SDValue();
+
+  // In AVX mode, lower the 256-bit sign extensions
+  //   v8i16 -> v8i32 and v4i32 -> v4i64
+  // as two 128-bit extensions:
+  //  1. Shuffle each half of the input into the low elements of its own
+  //     vector; for v4i32 the masks are {0, 1, -1, -1} and {2, 3, -1, -1}.
+  //  2. Sign extend each vector to half of the result type with
+  //     X86ISD::VSEXT_MOVL (v4i32 -> v2i64, v8i16 -> v4i32; vpmovsx).
+  //  3. Concatenate the two halves into the original VT.
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
+      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+
+    unsigned NumElems = OpVT.getVectorNumElements();
+    SmallVector<int,8> ShufMask1(NumElems, -1);
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask1.data());
+
+    SmallVector<int,8> ShufMask2(NumElems, -1);
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask2.data());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 
+                                  VT.getVectorNumElements()/2);
+
+    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 
+    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+  }
+  return SDValue(); // Not one of the handled type pairs.
+}
+
  static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
    // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
@@ -14584,8 +14670,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
      if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16))  ||
        ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
  
-      SDValue ZeroVec = getZeroVector(OpVT, Subtarget->hasSSE2(), Subtarget->hasAVX2(), 
-        DAG, dl);
+      SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
        SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
        SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
  
@@ -14772,7 +14857,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
    case ISD::SHL:
    case ISD::SRA:
-  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
+  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
    case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
    case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
    case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
@@ -14787,6 +14872,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
    case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
    case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
    case X86ISD::SHUFP:       // Handle all target specific shuffles
    case X86ISD::PALIGN: