PerformSubCombine to work on integers larger than i128. Fixes a crasher.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b00b00d047b171de0d149658aa646e6dd01fc3ff..d2b7690489380289230cdc4b6b1a6b6558971fa8 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -883,7 +883,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    }
  
-  if (Subtarget->hasSSE41()) {
+  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
      setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
      setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
      setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
@@ -922,10 +922,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (Subtarget->hasSSE2()) {
+  if (Subtarget->hasSSE2() || Subtarget->hasAVX()) {
      setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
      setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
      setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
  
      setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
      setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
@@ -935,7 +936,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
    }
  
-  if (Subtarget->hasSSE42())
+  if (Subtarget->hasSSE42() || Subtarget->hasAVX())
      setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
  
    if (!UseSoftFloat && Subtarget->hasAVX()) {
@@ -975,6 +976,28 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
  
+    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
+    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
+
+    setOperationAction(ISD::VSETCC,            MVT::v32i8, Custom);
+    setOperationAction(ISD::VSETCC,            MVT::v16i16, Custom);
+    setOperationAction(ISD::VSETCC,            MVT::v8i32, Custom);
+    setOperationAction(ISD::VSETCC,            MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+
      // Custom lower several nodes for 256-bit types.
      for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
                    i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -2732,6 +2755,7 @@ static bool isTargetShuffle(unsigned Opcode) {
    case X86ISD::VPERMILPSY:
    case X86ISD::VPERMILPD:
    case X86ISD::VPERMILPDY:
+  case X86ISD::VPERM2F128:
      return true;
    }
    return false;
@@ -2774,6 +2798,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
    case X86ISD::PALIGN:
    case X86ISD::SHUFPD:
    case X86ISD::SHUFPS:
+  case X86ISD::VPERM2F128:
      return DAG.getNode(Opc, dl, VT, V1, V2,
                         DAG.getConstant(TargetMask, MVT::i8));
    }
@@ -3004,6 +3029,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
    return (Val < 0) || (Val >= Low && Val < Hi);
  }
  
+/// isUndefOrInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// range (L, L+Pos]. or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+                             int Pos, int Size, int Low, int Hi) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i)
+    if (!isUndefOrInRange(Mask[i], Low, Hi))
+      return false;
+  return true;
+}
+
  /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
  /// specified value.
  static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3012,6 +3048,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
    return false;
  }
  
+/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+                                       int Pos, int Size, int Low) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
  /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
  /// the second operand.
@@ -3423,6 +3470,67 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
    return ::isMOVLMask(M, N->getValueType(0));
  }
  
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                             const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  // The shuffle result is divided into half A and half B. In total the two
+  // sources have 4 halves, namely: C, D, E, F. The final values of A and
+  // B must come from C, D, E or F.
+  int HalfSize = VT.getVectorNumElements()/2;
+  bool MatchA = false, MatchB = false;
+
+  // Check if A comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  // Check if B comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int HalfSize = VT.getVectorNumElements()/2;
+
+  int FstHalf = 0, SndHalf = 0;
+  for (int i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (int i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
  /// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
  /// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -3488,9 +3596,12 @@ static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
    int LaneSize = NumElts/NumLanes;
    for (int i = 0; i < LaneSize; ++i) {
      int HighElt = i+LaneSize;
-    if (Mask[i] < 0 && (isUndefOrInRange(Mask[HighElt], LaneSize, NumElts)))
-      continue;
-    if (Mask[HighElt] < 0 && (isUndefOrInRange(Mask[i], 0, LaneSize)))
+    bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
+    bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
+
+    if (!HighValid || !LowValid)
+      return false;
+    if (Mask[i] < 0 || Mask[HighElt] < 0)
        continue;
      if (Mask[HighElt]-Mask[i] != LaneSize)
        return false;
@@ -3839,7 +3950,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  /// V1 (and in order), and the upper half elements should come from the upper
  /// half of V2 (and in order).
  static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
-  if (Op->getValueType(0).getVectorNumElements() != 4)
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+  if (VT.getVectorNumElements() != 4)
      return false;
    for (unsigned i = 0, e = 2; i != e; ++i)
      if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
@@ -3871,6 +3985,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  /// MOVLP, it must be either a vector load or a scalar load to vector.
  static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                                 ShuffleVectorSDNode *Op) {
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+
    if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
      return false;
    // Is V2 is a vector load, don't do this transformation. We will try to use
@@ -3878,7 +3996,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
    if (ISD::isNON_EXTLoad(V2))
      return false;
  
-  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    if (NumElems != 2 && NumElems != 4)
      return false;
@@ -4042,11 +4160,11 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
    return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
  }
  
-// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
  // a generic shuffle instruction because the target has no such instructions.
  // Generate shuffles which repeat i16 and i8 several times until they can be
  // represented by v4f32 and then be manipulated by target suported shuffles.
-static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
    EVT VT = V.getValueType();
    int NumElems = VT.getVectorNumElements();
    DebugLoc dl = V.getDebugLoc();
@@ -4089,36 +4207,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
    return DAG.getNode(ISD::BITCAST, dl, VT, V);
  }
  
-/// PromoteVectorToScalarSplat - Since there's no native support for
-/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
-/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
-/// shuffle before the insertion, this yields less instructions in the end.
-static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
-                                          SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
-  SDValue V1 = SV->getOperand(0);
-  DebugLoc dl = SV->getDebugLoc();
-  int NumElems = SrcVT.getVectorNumElements();
-
-  assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
-
-  SmallVector<int, 4> Mask;
-  for (int i = 0; i < NumElems/2; ++i)
-    Mask.push_back(SV->getMaskElt(i));
-
-  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
-                             NumElems/2);
-  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
-                                     DAG.getUNDEF(SVT), &Mask[0]);
-  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
-                                    DAG.getConstant(0, MVT::i32), DAG, dl);
-
-  return Insert128BitVector(InsV, SV1,
-                       DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
-}
-
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
-/// v8i32, v16i16 or v32i8 to v8f32.
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
  static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
    EVT SrcVT = SV->getValueType(0);
    SDValue V1 = SV->getOperand(0);
@@ -4137,9 +4226,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
        EltNo -= NumElems/2;
    }
  
-  // Make this 128-bit vector duplicate i8 and i16 elements
-  if (NumElems > 4)
-    V1 = PromoteSplatv8v16(V1, DAG, EltNo);
+  // All i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target suported shuffles. After the insertion of the
+  // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+  EVT EltVT = SrcVT.getVectorElementType();
+  if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
+    V1 = PromoteSplati8i16(V1, DAG, EltNo);
  
    // Recreate the 256-bit vector and place the same 128-bit vector
    // into the low and high part. This is necessary because we want
@@ -4285,6 +4379,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          ShuffleMask);
        break;
+    case X86ISD::VPERM2F128:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                           ShuffleMask);
+      break;
      default:
        assert("not implemented for target shuffle node");
        return SDValue();
@@ -4566,42 +4665,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
        return SDValue();
      }
  
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
      SDValue Chain = LD->getChain();
-    // Make sure the stack object alignment is at least 16.
+    // Make sure the stack object alignment is at least 16 or 32.
      MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-    if (DAG.InferPtrAlignment(Ptr) < 16) {
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
        if (MFI->isFixedObjectIndex(FI)) {
          // Can't change the alignment. FIXME: It's possible to compute
          // the exact stack offset and reference FI + adjust offset instead.
          // If someone *really* cares about this. That's the way to implement it.
          return SDValue();
        } else {
-        MFI->setObjectAlignment(FI, 16);
+        MFI->setObjectAlignment(FI, RequiredAlign);
        }
      }
  
-    // (Offset % 16) must be multiple of 4. Then address is then
+    // (Offset % 16 or 32) must be multiple of 4. Then address is then
      // Ptr + (Offset & ~15).
      if (Offset < 0)
        return SDValue();
-    if ((Offset % 16) & 3)
+    if ((Offset % RequiredAlign) & 3)
        return SDValue();
-    int64_t StartOffset = Offset & ~15;
+    int64_t StartOffset = Offset & ~(RequiredAlign-1);
      if (StartOffset)
        Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                          Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
  
      int EltNo = (Offset - StartOffset) >> 2;
-    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
-    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
-    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
+    int NumElems = VT.getVectorNumElements();
+
+    EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                               LD->getPointerInfo().getWithOffset(StartOffset),
                               false, false, 0);
-    // Canonicalize it to a v4i32 shuffle.
-    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
-                                            DAG.getUNDEF(MVT::v4i32),&Mask[0]));
+
+    // Canonicalize it to a v4i32 or v8i32 shuffle.
+    SmallVector<int, 8> Mask;
+    for (int i = 0; i < NumElems; ++i)
+      Mask.push_back(EltNo);
+
+    V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
+    return DAG.getNode(ISD::BITCAST, dl, NVT,
+                       DAG.getVectorShuffle(CanonVT, dl, V1,
+                                            DAG.getUNDEF(CanonVT),&Mask[0]));
    }
  
    return SDValue();
@@ -5038,7 +5147,6 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  
  SDValue
  X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
    EVT ResVT = Op.getValueType();
  
    assert(Op.getNumOperands() == 2);
@@ -5546,10 +5654,95 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
                                               OpVT, SrcOp)));
  }
  
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node referes to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+  int HalfSize = NumElems/2;
+  SmallVector<int, 16> M;
+  SVOp->getMask(M);
+  bool MatchA = false, MatchB = false;
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
  /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
  /// which could not be matched by any known target speficic shuffle
  static SDValue
  LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+    // If each half of a vector shuffle node referes to only one lane in the
+    // source vectors, extract each used 128-bit lane and shuffle them using
+    // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+    // the work to the legalizer.
+    DebugLoc dl = SVOp->getDebugLoc();
+    EVT VT = SVOp->getValueType(0);
+    int NumElems = VT.getVectorNumElements();
+    int HalfSize = NumElems/2;
+
+    // Extract the reference for each half
+    int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+    int FstVecOpNum = 0, SndVecOpNum = 0;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      FstVecOpNum = Elt/NumElems;
+      FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      SndVecOpNum = Elt/NumElems;
+      SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+
+    // Extract the subvectors
+    SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+                      DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+                      DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+    // Generate 128-bit shuffles
+    SmallVector<int, 16> MaskV1, MaskV2;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+
+    EVT NVT = V1.getValueType();
+    V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+    V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+    // Concatenate the result back
+    SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+                                   DAG.getConstant(0, MVT::i32), DAG, dl);
+    return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+                              DAG, dl);
+  }
+
    return SDValue();
  }
  
@@ -5914,7 +6107,9 @@ static inline unsigned getUNPCKLOpcode(EVT VT) {
    case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
    case MVT::v4f32: return X86ISD::UNPCKLPS;
    case MVT::v2f64: return X86ISD::UNPCKLPD;
+  case MVT::v8i32: // Use fp unit for int unpack.
    case MVT::v8f32: return X86ISD::VUNPCKLPSY;
+  case MVT::v4i64: // Use fp unit for int unpack.
    case MVT::v4f64: return X86ISD::VUNPCKLPDY;
    case MVT::v16i8: return X86ISD::PUNPCKLBW;
    case MVT::v8i16: return X86ISD::PUNPCKLWD;
@@ -5930,7 +6125,9 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
    case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
    case MVT::v4f32: return X86ISD::UNPCKHPS;
    case MVT::v2f64: return X86ISD::UNPCKHPD;
+  case MVT::v8i32: // Use fp unit for int unpack.
    case MVT::v8f32: return X86ISD::VUNPCKHPSY;
+  case MVT::v4i64: // Use fp unit for int unpack.
    case MVT::v4f64: return X86ISD::VUNPCKHPDY;
    case MVT::v16i8: return X86ISD::PUNPCKHBW;
    case MVT::v8i16: return X86ISD::PUNPCKHWD;
@@ -5956,6 +6153,48 @@ static inline unsigned getVPERMILOpcode(EVT VT) {
    return 0;
  }
  
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+  EVT VT = Op.getValueType();
+  bool Is256 = VT.getSizeInBits() == 256;
+
+  assert((VT.getSizeInBits() == 128 || Is256) &&
+         "Unsupported type for vbroadcast node");
+
+  SDValue V = Op;
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (Is256 && !(V.hasOneUse() &&
+                 V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                 V.getOperand(0).getOpcode() == ISD::UNDEF))
+    return false;
+
+  if (Is256)
+    V = V.getOperand(1);
+  if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+
+  // Check the source scalar_to_vector type. 256-bit broadcasts are
+  // supported for 32/64-bit sizes, while 128-bit ones are only supported
+  // for 32-bit scalars.
+  unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+  if (ScalarSize != 32 && ScalarSize != 64)
+    return false;
+  if (!Is256 && ScalarSize == 64)
+    return false;
+
+  V = V.getOperand(0);
+  if (!MayFoldLoad(V))
+    return false;
+
+  // Return the load node
+  Op = V;
+  return true;
+}
+
  static
  SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                 const TargetLowering &TLI,
@@ -5979,26 +6218,15 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
      if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
        return Op;
  
-    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
-    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
-    // idiom and do the shuffle before the insertion, this yields less
-    // instructions in the end.
-    if (VT.is256BitVector() &&
-        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
-        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
-        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return PromoteVectorToScalarSplat(SVOp, DAG);
+    // Use vbroadcast whenever the splat comes from a foldable load
+    if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
  
      // Handle splats by matching through known shuffle masks
-    if ((VT.is128BitVector() && NumElem <= 4) ||
-        (VT.is256BitVector() && NumElem <= 8))
+    if (VT.is128BitVector() && NumElem <= 4)
        return SDValue();
  
-    // All i16 and i8 vector types can't be used directly by a generic shuffle
-    // instruction because the target has no such instruction. Generate shuffles
-    // which repeat i16 and i8 several times until they fit in i32, and then can
-    // be manipulated by target suported shuffles. After the insertion of the
-    // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+    // All remaning splats are promoted to target supported vector shuffles.
      return PromoteSplat(SVOp, DAG);
    }
  
@@ -6291,6 +6519,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
                                  getShuffleVPERMILPDImmediate(SVOp), DAG);
  
+  // Handle VPERM2F128 permutations
+  if (isVPERM2F128Mask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+                                getShuffleVPERM2F128Immediate(SVOp), DAG);
+
    //===--------------------------------------------------------------------===//
    // Since no target specific shuffle was selected for this generic one,
    // lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -6721,12 +6954,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
    CodeModel::Model M = getTargetMachine().getCodeModel();
  
    if (Subtarget->isPICStyleRIPRel() &&
-      (M == CodeModel::Small || M == CodeModel::Kernel))
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
      WrapperKind = X86ISD::WrapperRIP;
-  else if (Subtarget->isPICStyleGOT())
-    OpFlag = X86II::MO_GOTOFF;
-  else if (Subtarget->isPICStyleStubPIC())
-    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  }
  
    SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
  
@@ -6743,6 +6981,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
                           Result);
    }
  
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, 0);
+
    return Result;
  }
  
@@ -7238,9 +7482,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
  
    // Load the 32-bit value into an XMM register.
    SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                         Op.getOperand(0),
-                                         DAG.getIntPtrConstant(0)));
+                             Op.getOperand(0));
  
    Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                       DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
@@ -7879,6 +8121,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
                       DAG.getConstant(X86CC, MVT::i8), EFLAGS);
  }
  
+// Lower256IntVETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::VSETCC &&
+         "Unsupported value type for operation");
+
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC = Op.getOperand(2);
+  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+
  SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond;
    SDValue Op0 = Op.getOperand(0);
@@ -7891,9 +8166,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    if (isFP) {
      unsigned SSECC = 8;
-    EVT VT0 = Op0.getValueType();
-    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
-    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+    EVT EltVT = Op0.getValueType().getVectorElementType();
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+    unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
      bool Swap = false;
  
      switch (SetCCOpcode) {
@@ -7940,6 +8216,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
      return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
    }
  
+  // Break 256-bit integer vector compare into smaller ones.
+  if (!isFP && VT.getSizeInBits() == 256)
+    return Lower256IntVETCC(Op, DAG);
+
    // We are handling one of the integer comparisons here.  Since SSE only has
    // GT and EQ comparisons for integer, swapping operands and multiple
    // operations may be required for some comparisons.
@@ -9188,11 +9468,51 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
    SDValue R = Op.getOperand(0);
    SDValue Amt = Op.getOperand(1);
-
    LLVMContext *Context = DAG.getContext();
  
-  // Must have SSE2.
-  if (!Subtarget->hasSSE2()) return SDValue();
+  if (!(Subtarget->hasSSE2() || Subtarget->hasAVX()))
+    return SDValue();
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.getSizeInBits() == 256) {
+    int NumElems = VT.getVectorNumElements();
+    MVT EltVT = VT.getVectorElementType().getSimpleVT();
+    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    // Extract the two vectors
+    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+                                     DAG, dl);
+
+    // Recreate the shift amount vectors
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (int i = 0; i < NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (int i = NumElems/2; i < NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt1Csts[0], NumElems/2);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt2Csts[0], NumElems/2);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+                                 DAG, dl);
+    }
+
+    // Issue new vector shifts for the smaller types
+    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+    // Concatenate the result back
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+  }
  
    // Optimize shl/srl/sra with constant shift amount.
    if (isSplatVector(Amt.getNode())) {
@@ -9243,9 +9563,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Lower SHL with variable shift amount.
-  // Cannot lower SHL without SSE2 or later.
-  if (!Subtarget->hasSSE2()) return SDValue();
-
    if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
      Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
@@ -9963,10 +10280,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
    case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
    case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILPS:          return "X86ISD::VPERMILPS";
    case X86ISD::VPERMILPSY:         return "X86ISD::VPERMILPSY";
    case X86ISD::VPERMILPD:          return "X86ISD::VPERMILPD";
    case X86ISD::VPERMILPDY:         return "X86ISD::VPERMILPDY";
+  case X86ISD::VPERM2F128:         return "X86ISD::VPERM2F128";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -11112,6 +11431,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::CMOV_V4F32:
    case X86::CMOV_V2F64:
    case X86::CMOV_V2I64:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
    case X86::CMOV_GR16:
    case X86::CMOV_GR32:
    case X86::CMOV_RFP32:
@@ -11460,6 +11782,38 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
    return TargetLowering::isGAPlusOffset(N, GA, Offset);
  }
  
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
  /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
  static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI) {
@@ -11468,6 +11822,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
  
    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
        V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -11492,7 +11847,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
      // To match the shuffle mask, the first half of the mask should
      // be exactly the first vector, and all the rest a splat with the
      // first element of the second one.
-    int NumElems = VT.getVectorNumElements();
      for (int i = 0; i < NumElems/2; ++i)
        if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
            !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
@@ -11506,12 +11860,34 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
      return DCI.CombineTo(N, InsV);
    }
  
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+                                    DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                             V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
    return SDValue();
  }
  
  /// PerformShuffleCombine - Performs several different shuffle combines.
  static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
-                                     TargetLowering::DAGCombinerInfo &DCI) {
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget *Subtarget) {
    DebugLoc dl = N->getDebugLoc();
    EVT VT = N->getValueType(0);
  
@@ -11520,8 +11896,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
      return SDValue();
  
-  // Only handle pure VECTOR_SHUFFLE nodes.
-  if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE)
+  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI);
  
    // Only handle 128 wide vector from here on.
@@ -11679,7 +12056,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // Converting this to a max would handle comparisons between positive
          // and negative zero incorrectly.
          if (!UnsafeFPMath &&
-            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
            break;
          Opcode = X86ISD::FMAX;
          break;
@@ -12092,7 +12469,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
    // all elements are shifted by the same amount.  We can't do this in legalize
    // because the a constant vector is typically transformed to a constant pool
    // so we have no knowledge of the shift amount.
-  if (!Subtarget->hasSSE2())
+  if (!(Subtarget->hasSSE2() || Subtarget->hasAVX()))
      return SDValue();
  
    if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
@@ -12291,16 +12668,17 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
  
    // Sometimes the operand may come from a insert_subvector building a 256-bit
    // allones vector
-  SDValue V1 = N->getOperand(0);
-  SDValue V2 = N->getOperand(1);
-
    if (VT.getSizeInBits() == 256 &&
-      N->getOpcode() == ISD::INSERT_SUBVECTOR &&
-      V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      V1.getOperand(0).getOpcode() == ISD::UNDEF &&
-      ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
-      ISD::isBuildVectorAllOnes(V2.getNode()))
-    return true;
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
  
    return false;
  }
@@ -12503,14 +12881,117 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
  static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  DebugLoc dl = St->getDebugLoc();
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatination of two XMM registers, perform two stores.
+  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
+  // 128-bit ones. If in the future the cost becomes only one memory access the
+  // first version would be better.
+  if (VT.getSizeInBits() == 256 &&
+    StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+    StoredVal.getNumOperands() == 2) {
+
+    SDValue Value0 = StoredVal.getOperand(0);
+    SDValue Value1 = StoredVal.getOperand(1);
+
+    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // accumulated smaller vector elements must be a multiple of bigger size.
+    if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+  }
+
+
    // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
    // the FP state in cases where an emms may be missing.
    // A preferable solution to the general problem is to figure out the right
    // places to insert EMMS.  This qualifies as a quick hack.
  
    // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  EVT VT = St->getValue().getValueType();
    if (VT.getSizeInBits() != 64)
      return SDValue();
  
@@ -12812,20 +13293,18 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
    // X86 can't encode an immediate LHS of a sub. See if we can push the
    // negation into a preceding instruction.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
-    uint64_t Op0C = C->getSExtValue();
-
      // If the RHS of the sub is a XOR with one use and a constant, invert the
      // immediate. Then add one to the LHS of the sub so we can turn
      // X-Y -> X+~Y+1, saving one register.
      if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
          isa<ConstantSDNode>(Op1.getOperand(1))) {
-      uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
        EVT VT = Op0.getValueType();
        SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
                                     Op1.getOperand(0),
                                     DAG.getConstant(~XorC, VT));
        return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
-                         DAG.getConstant(Op0C+1, VT));
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
      }
    }
  
@@ -12889,7 +13368,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERMILPSY:
    case X86ISD::VPERMILPD:
    case X86ISD::VPERMILPDY:
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+  case X86ISD::VPERM2F128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
    }
  
    return SDValue();