Remove some patterns for matching vector_shuffle instructions since vector_shuffles...

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 05c0ebd5ca1af90a4a3434e0ff583608e6155b33..1343b571ece4934b0bf7185be30418dc6d1b9286 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39,7 +39,6 @@
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
@@ -3714,7 +3713,7 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
  /// to the same elements of the low, but to the higher half of the source.
  /// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
  static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
    if (!HasAVX)
      return false;
@@ -3743,35 +3742,6 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
    return true;
  }
  
-/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
-static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned LaneSize = NumElts/NumLanes;
-
-  // Although the mask is equal for both lanes do it twice to get the cases
-  // where a mask will match because the same mask element is undef on the
-  // first half but valid on the second. This would get pathological cases
-  // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
-  unsigned Shift = (LaneSize == 4) ? 2 : 1;
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int MaskElt = SVOp->getMaskElt(i);
-    if (MaskElt < 0)
-      continue;
-    MaskElt %= LaneSize;
-    unsigned Shamt = i;
-    // VPERMILPSY, the mask of the first half must be equal to the second one
-    if (NumElts == 8) Shamt %= LaneSize;
-    Mask |= MaskElt << (Shamt*Shift);
-  }
-
-  return Mask;
-}
-
  /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
  /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
  /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4275,23 +4245,12 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
  
  /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
  /// that point to V2 points to its first element.
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  bool Changed = false;
-  SmallVector<int, 8> MaskVec(SVOp->getMask().begin(), SVOp->getMask().end());
-
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
    for (unsigned i = 0; i != NumElems; ++i) {
-    if (MaskVec[i] > (int)NumElems) {
-      MaskVec[i] = NumElems;
-      Changed = true;
+    if (Mask[i] > (int)NumElems) {
+      Mask[i] = NumElems;
      }
    }
-  if (Changed)
-    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
-                                SVOp->getOperand(1), &MaskVec[0]);
-  return SDValue(SVOp, 0);
  }
  
  /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4459,14 +4418,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
      if (Index < 0)
        return DAG.getUNDEF(VT.getVectorElementType());
  
-    int NumElems = VT.getVectorNumElements();
-    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0)
+                                           : SV->getOperand(1);
      return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
    }
  
    // Recurse into target specific vector shuffles to find scalars.
    if (isTargetShuffle(Opcode)) {
-    int NumElems = VT.getVectorNumElements();
+    unsigned NumElems = VT.getVectorNumElements();
      SmallVector<unsigned, 16> ShuffleMask;
      SDValue ImmN;
  
@@ -4489,9 +4449,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        DecodeMOVLHPSMask(NumElems, ShuffleMask);
        break;
      case X86ISD::PSHUFD:
+    case X86ISD::VPERMILP:
        ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodePSHUFMask(NumElems,
-                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
+      DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
        break;
      case X86ISD::PSHUFHW:
@@ -4513,14 +4473,9 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                   Depth+1);
      }
-    case X86ISD::VPERMILP:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                        ShuffleMask);
-      break;
      case X86ISD::VPERM2X128:
        ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                             ShuffleMask);
        break;
      case X86ISD::MOVDDUP:
@@ -4531,16 +4486,15 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
      case X86ISD::MOVSLDUP:
      case X86ISD::PALIGN:
        return SDValue(); // Not yet implemented.
-    default:
-      assert(0 && "unknown target shuffle node");
-      return SDValue();
+    default: llvm_unreachable("unknown target shuffle node");
      }
  
      Index = ShuffleMask[Index];
      if (Index < 0)
        return DAG.getUNDEF(VT.getVectorElementType());
  
-    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
+    SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0)
+                                           : N->getOperand(1);
      return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                                 Depth+1);
    }
@@ -5220,9 +5174,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
    // For AVX-length vectors, build the individual 128-bit pieces and use
    // shuffles to put them in place.
-  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+  if (VT.getSizeInBits() == 256) {
      SmallVector<SDValue, 32> V;
-    for (unsigned i = 0; i < NumElems; ++i)
+    for (unsigned i = 0; i != NumElems; ++i)
        V.push_back(Op.getOperand(i));
  
      EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
@@ -5446,7 +5400,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // mask values count as coming from any quadword, for better codegen.
    unsigned LoQuad[] = { 0, 0, 0, 0 };
    unsigned HiQuad[] = { 0, 0, 0, 0 };
-  BitVector InputQuads(4);
+  std::bitset<4> InputQuads;
    for (unsigned i = 0; i < 8; ++i) {
      unsigned *Quad = i < 4 ? LoQuad : HiQuad;
      int EltIdx = SVOp->getMaskElt(i);
@@ -5488,8 +5442,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    bool V2Used = InputQuads[2] || InputQuads[3];
    if (Subtarget->hasSSSE3()) {
      if (InputQuads.count() == 2 && V1Used && V2Used) {
-      BestLoQuad = InputQuads.find_first();
-      BestHiQuad = InputQuads.find_next(BestLoQuad);
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
      }
      if (InputQuads.count() > 2) {
        BestLoQuad = -1;
@@ -5842,7 +5796,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
    unsigned NewWidth = (NumElems == 4) ? 2 : 4;
    EVT NewVT;
    switch (VT.getSimpleVT().SimpleTy) {
-  default: assert(false && "Unexpected!");
+  default: llvm_unreachable("Unexpected!");
    case MVT::v4f32: NewVT = MVT::v2f64; break;
    case MVT::v4i32: NewVT = MVT::v2i64; break;
    case MVT::v8i16: NewVT = MVT::v4i32; break;
@@ -6502,6 +6456,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
      unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
  
+    if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
+
      if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
        return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
  
@@ -6567,18 +6524,16 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    V1IsSplat = isSplatVector(V1.getNode());
    V2IsSplat = isSplatVector(V2.getNode());
  
+  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
    // Canonicalize the splat or undef, if present, to be on the RHS.
-  if (V1IsSplat && !V2IsSplat) {
-    Op = CommuteVectorShuffle(SVOp, DAG);
-    SVOp = cast<ShuffleVectorSDNode>(Op);
-    V1 = SVOp->getOperand(0);
-    V2 = SVOp->getOperand(1);
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
      std::swap(V1IsSplat, V2IsSplat);
      Commuted = true;
    }
  
-  ArrayRef<int> M = SVOp->getMask();
-
    if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
      // Shuffling low element of v1 into undef, just return v1.
      if (V2IsUndef)
@@ -6598,29 +6553,29 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (V2IsSplat) {
      // Normalize mask so all entries that point to V2 points to its first
      // element then try to match unpck{h|l} again. If match, return a
-    // new vector_shuffle with the corrected mask.
-    SDValue NewMask = NormalizeMask(SVOp, DAG);
-    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
-    if (NSVOp != SVOp) {
-      if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) {
-        return NewMask;
-      }
+    // target-specific UNPCKL/UNPCKH shuffle node.
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
      }
    }
  
    if (Commuted) {
      // Commute is back and try unpck* again.
      // FIXME: this seems wrong.
-    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
-    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = false;
  
-    if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
+    if (isUNPCKLMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-    if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
+    if (isUNPCKHMask(M, VT, HasAVX2))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
    }
  
    // Normalize the node to match x86 shuffle ops if needed
@@ -6671,9 +6626,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
  
    // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasAVX))
+  if (isVPERMILPMask(M, VT, HasAVX)) {
+    if (HasAVX2 && VT == MVT::v8i32)
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+                                  X86::getShuffleSHUFImmediate(SVOp), DAG);
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
-                                getShuffleVPERMILPImmediate(SVOp), DAG);
+                                X86::getShuffleSHUFImmediate(SVOp), DAG);
+  }
  
    // Handle VPERM2F128/VPERM2I128 permutations
    if (isVPERM2X128Mask(M, VT, HasAVX))
@@ -7562,12 +7521,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    LLVMContext *Context = DAG.getContext();
  
    // Build some magic constants.
-  SmallVector<Constant*,4> CV0;
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
-  Constant *C0 = ConstantVector::get(CV0);
+  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
    SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
    SmallVector<Constant*,2> CV1;
@@ -8319,7 +8274,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    if (isFP) {
      unsigned SSECC = 8;
      EVT EltVT = Op0.getValueType().getVectorElementType();
-    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
  
      bool Swap = false;
  
@@ -9455,6 +9410,10 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    }
  
    // Arithmetic intrinsics.
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_avx2_pmulu_dq:
+    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_sse3_hadd_ps:
    case Intrinsic::x86_sse3_hadd_pd:
    case Intrinsic::x86_avx_hadd_ps_256:
@@ -9516,6 +9475,12 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx2_vperm2i128:
      return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::x86_avx_vpermil_ps:
+  case Intrinsic::x86_avx_vpermil_pd:
+  case Intrinsic::x86_avx_vpermil_ps_256:
+  case Intrinsic::x86_avx_vpermil_pd_256:
+    return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
  
    // ptest and testp intrinsics. The intrinsic these come from are designed to
    // return an integer value, not just an instruction so lower it to the ptest
@@ -10114,78 +10079,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
    if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
      return Lower256IntArith(Op, DAG);
  
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+         "Only know how to lower V2I64/V4I64 multiply");
+
    DebugLoc dl = Op.getDebugLoc();
  
+  //  Ahi = psrlqi(a, 32);
+  //  Bhi = psrlqi(b, 32);
+  //
+  //  AloBlo = pmuludq(a, b);
+  //  AloBhi = pmuludq(a, Bhi);
+  //  AhiBlo = pmuludq(Ahi, b);
+
+  //  AloBhi = psllqi(AloBhi, 32);
+  //  AhiBlo = psllqi(AhiBlo, 32);
+  //  return AloBlo + AloBhi + AhiBlo;
+
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);
  
-  if (VT == MVT::v4i64) {
-    assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
+  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
  
-    //  ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
-    //  ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
-    //  ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
-    //  ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
-    //  ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
-    //
-    //  AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
-    //  AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
-    //  return AloBlo + AloBhi + AhiBlo;
-
-    SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
-                              DAG.getConstant(32, MVT::i32));
-    SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
-                              DAG.getConstant(32, MVT::i32));
-    SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         A, B);
-    SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         A, Bhi);
-    SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                         DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
-                         Ahi, B);
-    AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
-                         DAG.getConstant(32, MVT::i32));
-    AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
-                         DAG.getConstant(32, MVT::i32));
-    SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
-    Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
-    return Res;
-  }
+  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
+  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
  
-  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+  // Bitcast to vectors of 32-bit elements for PMULUDQ.
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
  
-  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
-  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
-  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
-  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
-  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
-  //
-  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
-  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
-  //  return AloBlo + AloBhi + AhiBlo;
+  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
+  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
  
-  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
-                            DAG.getConstant(32, MVT::i32));
-  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
-                            DAG.getConstant(32, MVT::i32));
-  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       A, B);
-  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       A, Bhi);
-  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
-                       Ahi, B);
-  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
-                       DAG.getConstant(32, MVT::i32));
-  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
-                       DAG.getConstant(32, MVT::i32));
    SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
-  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
-  return Res;
+  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  }
  
  SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
@@ -10312,8 +10245,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
      Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
                       DAG.getConstant(23, MVT::i32));
  
-    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
-    Constant *C = ConstantVector::getSplat(4, CI);
+    const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
+    Constant *C = ConstantDataVector::get(*Context, CV);
      SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
      SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                   MachinePointerInfo::getConstantPool(),
@@ -10634,8 +10567,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    unsigned Reg = 0;
    unsigned size = 0;
    switch(T.getSimpleVT().SimpleTy) {
-  default:
-    assert(false && "Invalid value type!");
+  default: llvm_unreachable("Invalid value type!");
    case MVT::i8:  Reg = X86::AL;  size = 1; break;
    case MVT::i16: Reg = X86::AX;  size = 2; break;
    case MVT::i32: Reg = X86::EAX; size = 4; break;
@@ -10753,7 +10685,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
    unsigned Opc;
    bool ExtraOp = false;
    switch (Op.getOpcode()) {
-  default: assert(0 && "Invalid code");
+  default: llvm_unreachable("Invalid code");
    case ISD::ADDC: Opc = X86ISD::ADD; break;
    case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
    case ISD::SUBC: Opc = X86ISD::SUB; break;
@@ -10895,8 +10827,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    DebugLoc dl = N->getDebugLoc();
    switch (N->getOpcode()) {
    default:
-    assert(false && "Do not know how to custom type legalize this operation!");
-    return;
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
    case ISD::SIGN_EXTEND_INREG:
    case ISD::ADDC:
    case ISD::ADDE:
@@ -11121,6 +11052,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
    case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -12052,28 +11984,39 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    return EndMBB;
  }
  
-// Check whether the given instruction should have had a kill marker on
-// the EFLAGS operand.
-static bool shouldHaveEFlagsKill(MachineBasicBlock::iterator SelectItr,
-                                 MachineBasicBlock* BB) {
-  for (MachineBasicBlock::iterator miI(llvm::next(SelectItr)), miE = BB->end();
-       miI != miE; ++miI) {
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
      const MachineInstr& mi = *miI;
-    if (mi.readsRegister(X86::EFLAGS)) {
+    if (mi.readsRegister(X86::EFLAGS))
        return false;
-    }
-    if (mi.definesRegister(X86::EFLAGS)) {
-      // Should have kill-flag - update below.
-      break;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
      }
    }
  
-  // We found a def, or hit the end of the basic block. SelectMI should have a
-  // kill flag on EFLAGS.
-  MachineInstr& SelectMI = *SelectItr;
-  MachineOperand* EFlagsOp = SelectMI.findRegisterUseOperand(X86::EFLAGS);
-  assert(EFlagsOp != 0 && "No EFLAGS operand on select instruction?");
-  EFlagsOp->setIsKill();
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
    return true;
  }
  
@@ -12106,11 +12049,11 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  
    // If the EFLAGS register isn't dead in the terminator, then claim that it's
    // live into the sink and copy blocks.
-  if (!MI->killsRegister(X86::EFLAGS)) {
-    if (!shouldHaveEFlagsKill(MI, BB)) {
-      copy0MBB->addLiveIn(X86::EFLAGS);
-      sinkMBB->addLiveIn(X86::EFLAGS);
-    }
+  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
    }
  
    // Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -12367,11 +12310,11 @@ MachineBasicBlock *
  X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
    switch (MI->getOpcode()) {
-  default: assert(0 && "Unexpected instr type to insert");
+  default: llvm_unreachable("Unexpected instr type to insert");
    case X86::TAILJMPd64:
    case X86::TAILJMPr64:
    case X86::TAILJMPm64:
-    assert(0 && "TAILJMP64 would not be touched here.");
+    llvm_unreachable("TAILJMP64 would not be touched here.");
    case X86::TCRETURNdi64:
    case X86::TCRETURNri64:
    case X86::TCRETURNmi64:
@@ -12744,6 +12687,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
      case Intrinsic::x86_avx2_pmovmskb: {
        // High bits of movmskp{s|d}, pmovmskb are known zero.
        switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
          case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
          case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
          case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
@@ -14640,41 +14584,42 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX()) return SDValue();
+  if (!Subtarget->hasAVX()) 
+    return SDValue();
  
-   // Optimize vectors in AVX mode
-   // Sign extend  v8i16 to v8i32 and
-   //              v4i32 to v4i64
-   //
-   // Divide input vector into two parts
-   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
-   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
-   // concat the vectors to original VT
+  // Optimize vectors in AVX mode
+  // Sign extend  v8i16 to v8i32 and
+  //              v4i32 to v4i64
+  //
+  // Divide input vector into two parts
+  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+  // concat the vectors to original VT
  
    EVT VT = N->getValueType(0);
    SDValue Op = N->getOperand(0);
    EVT OpVT = Op.getValueType();
    DebugLoc dl = N->getDebugLoc();
  
-  if (((VT == MVT::v4i64) && (OpVT == MVT::v4i32)) ||
-    ((VT == MVT::v8i32) && (OpVT == MVT::v8i16))) {
+  if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
+      (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
  
      unsigned NumElems = OpVT.getVectorNumElements();
      SmallVector<int,8> ShufMask1(NumElems, -1);
-    for (unsigned i=0; i< NumElems/2; i++) ShufMask1[i] = i;
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
  
      SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                ShufMask1.data());
+                                        ShufMask1.data());
  
      SmallVector<int,8> ShufMask2(NumElems, -1);
-    for (unsigned i=0; i< NumElems/2; i++) ShufMask2[i] = i+NumElems/2;
+    for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
  
      SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                ShufMask2.data());
+                                        ShufMask2.data());
  
      EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 
-      VT.getVectorNumElements()/2);
-    
+                                  VT.getVectorNumElements()/2);
+
      OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 
      OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);