Add hasSideEffects=0 to some forms of ROUND, RCP, and RSQRT.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 6b650726b622c158bf09dcc4c561298c88201f7a..31e69514fb2161ee6ae22f77b43caf580a6c670e 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -870,6 +870,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
      setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
      setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
      setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
      setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
      setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
@@ -1238,7 +1239,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  
-
    // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
    // handle type legalization for these operations here.
    //
@@ -1313,13 +1313,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
-
  EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
    if (!VT.isVector()) return MVT::i8;
    return VT.changeVectorElementTypeToInteger();
  }
  
-
  /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
  /// the desired ByVal argument alignment.
  static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
@@ -1524,7 +1522,6 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
    return true;
  }
  
-
  //===----------------------------------------------------------------------===//
  //               Return Value Calling Convention Implementation
  //===----------------------------------------------------------------------===//
@@ -1772,7 +1769,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
    return Chain;
  }
  
-
  //===----------------------------------------------------------------------===//
  //                C & StdCall & Fast Calling Convention implementation
  //===----------------------------------------------------------------------===//
@@ -2663,7 +2659,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           Ins, dl, DAG, InVals);
  }
  
-
  //===----------------------------------------------------------------------===//
  //                Fast Calling Convention (tail call) implementation
  //===----------------------------------------------------------------------===//
@@ -2972,7 +2967,6 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
    return X86::createFastISel(funcInfo, libInfo);
  }
  
-
  //===----------------------------------------------------------------------===//
  //                           Other Lowering Hooks
  //===----------------------------------------------------------------------===//
@@ -3083,7 +3077,6 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
    return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
  }
  
-
  bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                         bool hasSymbolicDisplacement) {
    // Offset should fit into 32 bit immediate field.
@@ -6996,7 +6989,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
                                  getShuffleCLImmediate(SVOp), DAG);
  
-
    //===--------------------------------------------------------------------===//
    // Since no target specific shuffle was selected for this generic one,
    // lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -7098,7 +7090,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
    return SDValue();
  }
  
-
  SDValue
  X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                             SelectionDAG &DAG) const {
@@ -7463,7 +7454,6 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc DL = Op.getDebugLoc();
    Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
-
    // With PIC, the address is actually $g + Offset.
    if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
        !Subtarget->is64Bit()) {
@@ -7850,7 +7840,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
    llvm_unreachable("TLS not implemented for this target.");
  }
  
-
  /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
  /// and take a 2 x i32 value to shift plus a shift amount.
  SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
@@ -9075,7 +9064,6 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
                       DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
  }
  
-
  SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond;
    SDValue Op0 = Op.getOperand(0);
@@ -9183,8 +9171,28 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    if (VT == MVT::v2i64) {
      if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
        return SDValue();
-    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
-      return SDValue();
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+      // pcmpeqd + pshufd + pand.
+      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+      // Do the compare.
+      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+      // Make sure the lower and upper halves are both all-ones.
+      const int Mask[] = { 1, 0, 3, 2 };
+      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
    }
  
    // Since SSE has no unsigned integer comparisons, we need to flip  the sign
@@ -9728,7 +9736,6 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
                       Chain, Dest, CC, Cond);
  }
  
-
  // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
  // Calls to _alloca is needed to probe the stack when allocating more than 4k
  // bytes in one go. Touching the stack at 4K increments is necessary to ensure
@@ -10154,6 +10161,40 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
                         Op.getOperand(1), Op.getOperand(2));
    }
  
+  // SSE2/SSE41/AVX2 integer max/min intrinsics.
+  case Intrinsic::x86_sse2_pmaxu_b:
+  case Intrinsic::x86_sse41_pmaxuw:
+  case Intrinsic::x86_sse41_pmaxud:
+  case Intrinsic::x86_avx2_pmaxu_b:
+  case Intrinsic::x86_avx2_pmaxu_w:
+  case Intrinsic::x86_avx2_pmaxu_d:
+    return DAG.getNode(X86ISD::UMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse2_pminu_b:
+  case Intrinsic::x86_sse41_pminuw:
+  case Intrinsic::x86_sse41_pminud:
+  case Intrinsic::x86_avx2_pminu_b:
+  case Intrinsic::x86_avx2_pminu_w:
+  case Intrinsic::x86_avx2_pminu_d:
+    return DAG.getNode(X86ISD::UMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse41_pmaxsb:
+  case Intrinsic::x86_sse2_pmaxs_w:
+  case Intrinsic::x86_sse41_pmaxsd:
+  case Intrinsic::x86_avx2_pmaxs_b:
+  case Intrinsic::x86_avx2_pmaxs_w:
+  case Intrinsic::x86_avx2_pmaxs_d:
+    return DAG.getNode(X86ISD::SMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse41_pminsb:
+  case Intrinsic::x86_sse2_pmins_w:
+  case Intrinsic::x86_sse41_pminsd:
+  case Intrinsic::x86_avx2_pmins_b:
+  case Intrinsic::x86_avx2_pmins_w:
+  case Intrinsic::x86_avx2_pmins_d:
+    return DAG.getNode(X86ISD::SMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
    // AVX2 variable shift intrinsics
    case Intrinsic::x86_avx2_psllv_d:
    case Intrinsic::x86_avx2_psllv_q:
@@ -10831,7 +10872,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  
-
    MachineMemOperand *MMO =
     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                             MachineMemOperand::MOStore, 2, 2);
@@ -10864,7 +10904,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                              DAG.getConstant(1, MVT::i16)),
                  DAG.getConstant(3, MVT::i16));
  
-
    return DAG.getNode((VT.getSizeInBits() < 16 ?
                        ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
  }
@@ -10993,17 +11032,43 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
  
  static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                          SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
    EVT VT = Op.getValueType();
  
    // Decompose 256-bit ops into smaller 128-bit ops.
    if (VT.is256BitVector() && !Subtarget->hasInt256())
      return Lower256IntArith(Op, DAG);
  
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
           "Only know how to lower V2I64/V4I64 multiply");
  
-  DebugLoc dl = Op.getDebugLoc();
-
    //  Ahi = psrlqi(a, 32);
    //  Bhi = psrlqi(b, 32);
    //
@@ -11015,9 +11080,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    //  AhiBlo = psllqi(AhiBlo, 32);
    //  return AloBlo + AloBhi + AhiBlo;
  
-  SDValue A = Op.getOperand(0);
-  SDValue B = Op.getOperand(1);
-
    SDValue ShAmt = DAG.getConstant(32, MVT::i32);
  
    SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
@@ -11394,7 +11456,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    }
  }
  
-
  static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
@@ -11479,7 +11540,6 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
  }
  
-
  static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
    EVT T = Op.getValueType();
@@ -11974,6 +12034,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
+  case X86ISD::UMAX:               return "X86ISD::UMAX";
+  case X86ISD::UMIN:               return "X86ISD::UMIN";
+  case X86ISD::SMAX:               return "X86ISD::SMAX";
+  case X86ISD::SMIN:               return "X86ISD::SMIN";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
    case X86ISD::FMAXC:              return "X86ISD::FMAXC";
@@ -12128,7 +12192,6 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
    return true;
  }
  
-
  bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
    if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
@@ -14326,7 +14389,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
  }
  
-
  /// PerformTruncateCombine - Converts truncate operation to
  /// a sequence of vector shuffle operations.
  /// It is possible when we truncate 256-bit vector to 128-bit vector
@@ -14346,51 +14408,42 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
  
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
      if (Subtarget->hasInt256()) {
-      // AVX2: v4i64 -> v4i32
-
-      // VPERMD
        static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
        Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
                                  ShufMask);
-
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
                           DAG.getIntPtrConstant(0));
      }
  
-    // AVX: v4i64 -> v4i32
+    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                                 DAG.getIntPtrConstant(0));
-
      SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                                 DAG.getIntPtrConstant(2));
  
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
  
-    // PSHUFD
+    // The PSHUFD mask:
      static const int ShufMask1[] = {0, 2, 0, 0};
-
      SDValue Undef = DAG.getUNDEF(VT);
      OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
      OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
  
-    // MOVLHPS
+    // The MOVLHPS mask:
      static const int ShufMask2[] = {0, 1, 4, 5};
-
      return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
    }
  
    if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
  
+    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
      if (Subtarget->hasInt256()) {
-      // AVX2: v8i32 -> v8i16
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
  
-      // PSHUFB
        SmallVector<SDValue,32> pshufbMask;
        for (unsigned i = 0; i < 2; ++i) {
          pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
@@ -14407,16 +14460,13 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
                                 &pshufbMask[0], 32);
        Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
  
        static const int ShufMask[] = {0,  2,  -1,  -1};
        Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
                                  &ShufMask[0]);
-
        Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                         DAG.getIntPtrConstant(0));
-
        return DAG.getNode(ISD::BITCAST, dl, VT, Op);
      }
  
@@ -14429,7 +14479,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
  
-    // PSHUFB
+    // The PSHUFB mask:
      static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                     -1, -1, -1, -1, -1, -1, -1, -1};
  
@@ -14440,9 +14490,8 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
  
-    // MOVLHPS
+    // The MOVLHPS mask:
      static const int ShufMask2[] = {0, 1, 4, 5};
-
      SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
    }
@@ -14641,6 +14690,76 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+/// \brief Matches a VSELECT onto min/max, or returns 0 if there is no match.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+                                   SDValue RHS, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  if (!VT.isVector())
+    return 0;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return 0;
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+    if (!Subtarget->hasAVX2()) // These three types are only accepted with AVX2.
+      return 0;
+  case MVT::v16i8: // Also reached by fall-through from the cases above.
+  case MVT::v8i16:
+  case MVT::v4i32:
+    if (!Subtarget->hasSSE2())
+      return 0;
+  }
+
+  // With SSE2 alone only unsigned v16i8 and signed v8i16 are matched; SSE4.1
+  bool hasUnsigned = Subtarget->hasSSE41() || // enables every remaining type.
+                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
+  bool hasSigned = Subtarget->hasSSE41() ||
+                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Check for x CC y ? x : y.
+  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    switch (CC) {
+    default: break; // Unhandled predicate: drop out to the final return 0.
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    }
+  // Check for x CC y ? y : x -- a min/max with reversed arms.
+  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+    switch (CC) {
+    default: break; // Unhandled predicate: drop out to the final return 0.
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    }
+  }
+
+  return 0;
+}
+
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
  /// nodes.
  static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -14980,6 +15099,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  // Try to match a min/max vector operation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -15277,7 +15402,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
-
  /// PerformMulCombine - Optimize a single multiply with constant into two
  /// in order to implement it with two cheaper instructions, e.g.
  /// LEA + SHL, LEA + LEA.
@@ -15366,7 +15490,6 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
      }
    }
  
-
    // Hardware support for vector shifts is sparse which makes us scalarize the
    // vector operations in many cases. Also, on sandybridge ADD is faster than
    // shl.
@@ -15510,7 +15633,6 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
    }
  }
  
-
  // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
  // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
  // and friends.  Likewise for OR -> CMPNEQSS.
@@ -15928,14 +16050,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    ISD::LoadExtType Ext = Ld->getExtensionType();
  
    // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSSE3 shuffles.
-  // SEXT loads are suppoted starting SSE41.
-  // We generate X86ISD::VSEXT for them.
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
    // TODO: It is possible to support ZExt by zeroing the undef values
    // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() &&
-      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
-       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
      assert(MemVT != RegVT && "Cannot extend to the same type");
      assert(MemVT.isVector() && "Must load a vector from memory");
  
@@ -16028,9 +16150,40 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      unsigned SizeRatio = RegSz/MemSz;
  
      if (Ext == ISD::SEXTLOAD) {
-      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-      return DCI.CombineTo(N, Sext, TF, true);
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      SmallVector<SDValue, 8> C(NumElems,
+                                DAG.getConstant(Amt, RegVT.getScalarType()));
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+      return DCI.CombineTo(N, Shuff, TF, true);
      }
+
      // Redistribute the loaded elements into the different locations.
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
@@ -16164,7 +16317,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                 Chains.size());
    }
  
-
    // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
    // the FP state in cases where an emms may be missing.
    // A preferable solution to the general problem is to figure out the right
@@ -16470,7 +16622,6 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
                       N->getOperand(0), N->getOperand(1));
  }
  
-
  /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
  static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
    // FAND(0.0, x) -> 0.0
@@ -17208,8 +17359,6 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    return false;
  }
  
-
-
  /// getConstraintType - Given a constraint letter, return the type of
  /// constraint it is for this target.
  X86TargetLowering::ConstraintType
@@ -17818,14 +17967,13 @@ X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
    return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
  }
  
-
  unsigned
  X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                unsigned Alignment,
                                                unsigned AddressSpace) const {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
-  assert(Opcode == Instruction::Load || Opcode == Instruction::Store &&
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
           "Invalid Opcode");
  
    const X86Subtarget &ST =