PerformSubCombine to work on integers larger than i128. Fixes a crasher.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index be3ecd7e531b2c8fef6da040acc7c69aba085751..d2b7690489380289230cdc4b6b1a6b6558971fa8 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -989,6 +989,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
      setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
  
+    setOperationAction(ISD::VSETCC,            MVT::v32i8, Custom);
+    setOperationAction(ISD::VSETCC,            MVT::v16i16, Custom);
      setOperationAction(ISD::VSETCC,            MVT::v8i32, Custom);
      setOperationAction(ISD::VSETCC,            MVT::v4i64, Custom);
  
@@ -2753,6 +2755,7 @@ static bool isTargetShuffle(unsigned Opcode) {
    case X86ISD::VPERMILPSY:
    case X86ISD::VPERMILPD:
    case X86ISD::VPERMILPDY:
+  case X86ISD::VPERM2F128:
      return true;
    }
    return false;
@@ -2795,6 +2798,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
    case X86ISD::PALIGN:
    case X86ISD::SHUFPD:
    case X86ISD::SHUFPS:
+  case X86ISD::VPERM2F128:
      return DAG.getNode(Opc, dl, VT, V1, V2,
                         DAG.getConstant(TargetMask, MVT::i8));
    }
@@ -3025,6 +3029,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
    return (Val < 0) || (Val >= Low && Val < Hi);
  }
  
+/// isUndefOrInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// range (L, L+Pos]. or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+                             int Pos, int Size, int Low, int Hi) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i)
+    if (!isUndefOrInRange(Mask[i], Low, Hi))
+      return false;
+  return true;
+}
+
  /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
  /// specified value.
  static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3033,6 +3048,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
    return false;
  }
  
+/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+                                       int Pos, int Size, int Low) {
+  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
  /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
  /// the second operand.
@@ -3444,6 +3470,67 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
    return ::isMOVLMask(M, N->getValueType(0));
  }
  
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                             const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+    return false;
+
+  // The shuffle result is divided into half A and half B. In total the two
+  // sources have 4 halves, namely: C, D, E, F. The final values of A and
+  // B must come from C, D, E or F.
+  int HalfSize = VT.getVectorNumElements()/2;
+  bool MatchA = false, MatchB = false;
+
+  // Check if A comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  // Check if B comes from one of C, D, E, F.
+  for (int Half = 0; Half < 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int HalfSize = VT.getVectorNumElements()/2;
+
+  int FstHalf = 0, SndHalf = 0;
+  for (int i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (int i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
  /// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
  /// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -3863,7 +3950,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  /// V1 (and in order), and the upper half elements should come from the upper
  /// half of V2 (and in order).
  static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
-  if (Op->getValueType(0).getVectorNumElements() != 4)
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+  if (VT.getVectorNumElements() != 4)
      return false;
    for (unsigned i = 0, e = 2; i != e; ++i)
      if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
@@ -3895,6 +3985,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  /// MOVLP, it must be either a vector load or a scalar load to vector.
  static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                                 ShuffleVectorSDNode *Op) {
+  EVT VT = Op->getValueType(0);
+  if (VT.getSizeInBits() != 128)
+    return false;
+
    if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
      return false;
    // Is V2 is a vector load, don't do this transformation. We will try to use
@@ -3902,7 +3996,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
    if (ISD::isNON_EXTLoad(V2))
      return false;
  
-  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    if (NumElems != 2 && NumElems != 4)
      return false;
@@ -4113,36 +4207,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
    return DAG.getNode(ISD::BITCAST, dl, VT, V);
  }
  
-/// PromoteVectorToScalarSplat - Since there's no native support for
-/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
-/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
-/// shuffle before the insertion, this yields less instructions in the end.
-static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
-                                          SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
-  SDValue V1 = SV->getOperand(0);
-  DebugLoc dl = SV->getDebugLoc();
-  int NumElems = SrcVT.getVectorNumElements();
-
-  assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
-  assert(SV->isSplat() && "shuffle must be a splat");
-
-  int SplatIdx = SV->getSplatIndex();
-  const int Mask[4] = { SplatIdx, SplatIdx, SplatIdx, SplatIdx };
-
-  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
-                             NumElems/2);
-  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
-                                     DAG.getUNDEF(SVT), Mask);
-  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
-                                    DAG.getConstant(0, MVT::i32), DAG, dl);
-
-  return Insert128BitVector(InsV, SV1,
-                       DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
-}
-
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
-/// v8i32, v16i16 or v32i8 to v8f32.
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
  static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
    EVT SrcVT = SV->getValueType(0);
    SDValue V1 = SV->getOperand(0);
@@ -4161,7 +4226,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
        EltNo -= NumElems/2;
    }
  
-  // Make this 128-bit vector duplicate i8 and i16 elements
+  // All i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target suported shuffles. After the insertion of the
+  // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
    EVT EltVT = SrcVT.getVectorElementType();
    if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
      V1 = PromoteSplati8i16(V1, DAG, EltNo);
@@ -4310,6 +4379,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          ShuffleMask);
        break;
+    case X86ISD::VPERM2F128:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                           ShuffleMask);
+      break;
      default:
        assert("not implemented for target shuffle node");
        return SDValue();
@@ -5580,10 +5654,95 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
                                               OpVT, SrcOp)));
  }
  
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node referes to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+  int HalfSize = NumElems/2;
+  SmallVector<int, 16> M;
+  SVOp->getMask(M);
+  bool MatchA = false, MatchB = false;
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  for (int l = 0; l < NumElems*2; l += HalfSize) {
+    if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
  /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
  /// which could not be matched by any known target speficic shuffle
  static SDValue
  LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+    // If each half of a vector shuffle node referes to only one lane in the
+    // source vectors, extract each used 128-bit lane and shuffle them using
+    // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+    // the work to the legalizer.
+    DebugLoc dl = SVOp->getDebugLoc();
+    EVT VT = SVOp->getValueType(0);
+    int NumElems = VT.getVectorNumElements();
+    int HalfSize = NumElems/2;
+
+    // Extract the reference for each half
+    int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+    int FstVecOpNum = 0, SndVecOpNum = 0;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      FstVecOpNum = Elt/NumElems;
+      FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      if (SVOp->getMaskElt(i) < 0)
+        continue;
+      SndVecOpNum = Elt/NumElems;
+      SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+      break;
+    }
+
+    // Extract the subvectors
+    SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+                      DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+    SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+                      DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+    // Generate 128-bit shuffles
+    SmallVector<int, 16> MaskV1, MaskV2;
+    for (int i = 0; i < HalfSize; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+    for (int i = HalfSize; i < NumElems; ++i) {
+      int Elt = SVOp->getMaskElt(i);
+      MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+    }
+
+    EVT NVT = V1.getValueType();
+    V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+    V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+    // Concatenate the result back
+    SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+                                   DAG.getConstant(0, MVT::i32), DAG, dl);
+    return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+                              DAG, dl);
+  }
+
    return SDValue();
  }
  
@@ -5994,6 +6153,48 @@ static inline unsigned getVPERMILOpcode(EVT VT) {
    return 0;
  }
  
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+  EVT VT = Op.getValueType();
+  bool Is256 = VT.getSizeInBits() == 256;
+
+  assert((VT.getSizeInBits() == 128 || Is256) &&
+         "Unsupported type for vbroadcast node");
+
+  SDValue V = Op;
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (Is256 && !(V.hasOneUse() &&
+                 V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                 V.getOperand(0).getOpcode() == ISD::UNDEF))
+    return false;
+
+  if (Is256)
+    V = V.getOperand(1);
+  if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+
+  // Check the source scalar_to_vector type. 256-bit broadcasts are
+  // supported for 32/64-bit sizes, while 128-bit ones are only supported
+  // for 32-bit scalars.
+  unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+  if (ScalarSize != 32 && ScalarSize != 64)
+    return false;
+  if (!Is256 && ScalarSize == 64)
+    return false;
+
+  V = V.getOperand(0);
+  if (!MayFoldLoad(V))
+    return false;
+
+  // Return the load node
+  Op = V;
+  return true;
+}
+
  static
  SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                 const TargetLowering &TLI,
@@ -6017,25 +6218,15 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
      if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
        return Op;
  
-    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
-    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
-    // idiom and do the shuffle before the insertion, this yields less
-    // instructions in the end.
-    if (VT.is256BitVector() &&
-        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
-        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
-        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return PromoteVectorToScalarSplat(SVOp, DAG);
+    // Use vbroadcast whenever the splat comes from a foldable load
+    if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
  
      // Handle splats by matching through known shuffle masks
      if (VT.is128BitVector() && NumElem <= 4)
        return SDValue();
  
-    // All i16 and i8 vector types can't be used directly by a generic shuffle
-    // instruction because the target has no such instruction. Generate shuffles
-    // which repeat i16 and i8 several times until they fit in i32, and then can
-    // be manipulated by target suported shuffles. After the insertion of the
-    // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+    // All remaning splats are promoted to target supported vector shuffles.
      return PromoteSplat(SVOp, DAG);
    }
  
@@ -6328,6 +6519,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
                                  getShuffleVPERMILPDImmediate(SVOp), DAG);
  
+  // Handle VPERM2F128 permutations
+  if (isVPERM2F128Mask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+                                getShuffleVPERM2F128Immediate(SVOp), DAG);
+
    //===--------------------------------------------------------------------===//
    // Since no target specific shuffle was selected for this generic one,
    // lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -7925,6 +8121,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
                       DAG.getConstant(X86CC, MVT::i8), EFLAGS);
  }
  
+// Lower256IntVETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::VSETCC &&
+         "Unsupported value type for operation");
+
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC = Op.getOperand(2);
+  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+
  SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond;
    SDValue Op0 = Op.getOperand(0);
@@ -7987,8 +8216,9 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
      return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
    }
  
+  // Break 256-bit integer vector compare into smaller ones.
    if (!isFP && VT.getSizeInBits() == 256)
-    return SDValue();
+    return Lower256IntVETCC(Op, DAG);
  
    // We are handling one of the integer comparisons here.  Since SSE only has
    // GT and EQ comparisons for integer, swapping operands and multiple
@@ -9255,17 +9485,26 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
                                       DAG, dl);
  
      // Recreate the shift amount vectors
-    SmallVector<SDValue, 4> Amt1Csts;
-    SmallVector<SDValue, 4> Amt2Csts;
-    for (int i = 0; i < NumElems/2; ++i)
-      Amt1Csts.push_back(Amt->getOperand(i));
-    for (int i = NumElems/2; i < NumElems; ++i)
-      Amt2Csts.push_back(Amt->getOperand(i));
-
-    SDValue Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                               &Amt1Csts[0], NumElems/2);
-    SDValue Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                               &Amt2Csts[0], NumElems/2);
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (int i = 0; i < NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (int i = NumElems/2; i < NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt1Csts[0], NumElems/2);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt2Csts[0], NumElems/2);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+                                 DAG, dl);
+    }
  
      // Issue new vector shifts for the smaller types
      V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
@@ -10041,10 +10280,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
    case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
    case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILPS:          return "X86ISD::VPERMILPS";
    case X86ISD::VPERMILPSY:         return "X86ISD::VPERMILPSY";
    case X86ISD::VPERMILPD:          return "X86ISD::VPERMILPD";
    case X86ISD::VPERMILPDY:         return "X86ISD::VPERMILPDY";
+  case X86ISD::VPERM2F128:         return "X86ISD::VPERM2F128";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -11541,6 +11782,38 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
    return TargetLowering::isGAPlusOffset(N, GA, Offset);
  }
  
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
  /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
  static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI) {
@@ -11549,6 +11822,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    EVT VT = SVOp->getValueType(0);
+  int NumElems = VT.getVectorNumElements();
  
    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
        V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -11573,7 +11847,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
      // To match the shuffle mask, the first half of the mask should
      // be exactly the first vector, and all the rest a splat with the
      // first element of the second one.
-    int NumElems = VT.getVectorNumElements();
      for (int i = 0; i < NumElems/2; ++i)
        if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
            !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
@@ -11587,12 +11860,34 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
      return DCI.CombineTo(N, InsV);
    }
  
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+                                    DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+                             V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
    return SDValue();
  }
  
  /// PerformShuffleCombine - Performs several different shuffle combines.
  static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
-                                     TargetLowering::DAGCombinerInfo &DCI) {
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget *Subtarget) {
    DebugLoc dl = N->getDebugLoc();
    EVT VT = N->getValueType(0);
  
@@ -11601,8 +11896,9 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
      return SDValue();
  
-  // Only handle pure VECTOR_SHUFFLE nodes.
-  if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE)
+  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI);
  
    // Only handle 128 wide vector from here on.
@@ -12589,6 +12885,32 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    EVT VT = St->getValue().getValueType();
    EVT StVT = St->getMemoryVT();
    DebugLoc dl = St->getDebugLoc();
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatination of two XMM registers, perform two stores.
+  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
+  // 128-bit ones. If in the future the cost becomes only one memory access the
+  // first version would be better.
+  if (VT.getSizeInBits() == 256 &&
+    StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+    StoredVal.getNumOperands() == 2) {
+
+    SDValue Value0 = StoredVal.getOperand(0);
+    SDValue Value1 = StoredVal.getOperand(1);
+
+    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
  
    // Optimize trunc store (of multiple scalars) to shuffle and store.
    // First, pack all of the elements in one place. Next, store to memory
@@ -12971,20 +13293,18 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
    // X86 can't encode an immediate LHS of a sub. See if we can push the
    // negation into a preceding instruction.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
-    uint64_t Op0C = C->getSExtValue();
-
      // If the RHS of the sub is a XOR with one use and a constant, invert the
      // immediate. Then add one to the LHS of the sub so we can turn
      // X-Y -> X+~Y+1, saving one register.
      if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
          isa<ConstantSDNode>(Op1.getOperand(1))) {
-      uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
        EVT VT = Op0.getValueType();
        SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
                                     Op1.getOperand(0),
                                     DAG.getConstant(~XorC, VT));
        return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
-                         DAG.getConstant(Op0C+1, VT));
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
      }
    }
  
@@ -13048,7 +13368,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERMILPSY:
    case X86ISD::VPERMILPD:
    case X86ISD::VPERMILPDY:
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+  case X86ISD::VPERM2F128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
    }
  
    return SDValue();