Add hasSideEffects=0 to some forms of ROUND, RCP, and RSQRT.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d4ee985ca6593c1490f8e69a2c6f16e1a5ff194f..31e69514fb2161ee6ae22f77b43caf580a6c670e 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -870,6 +870,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
      setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
      setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
      setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
      setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
      setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
@@ -1238,7 +1239,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  
-
    // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
    // handle type legalization for these operations here.
    //
@@ -1313,13 +1313,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
-
  EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
    if (!VT.isVector()) return MVT::i8;
    return VT.changeVectorElementTypeToInteger();
  }
  
-
  /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
  /// the desired ByVal argument alignment.
  static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
@@ -1383,7 +1381,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                         MachineFunction &MF) const {
    const Function *F = MF.getFunction();
    if ((!IsMemset || ZeroMemset) &&
-      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+      !F->getFnAttributes().hasAttribute(Attribute::NoImplicitFloat)) {
      if (Size >= 16 &&
          (Subtarget->isUnalignedMemAccessFast() ||
           ((DstAlign == 0 || DstAlign >= 16) &&
@@ -1479,10 +1477,10 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
  
  // FIXME: Why this routine is here? Move to RegInfo!
  std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(EVT VT) const{
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
    const TargetRegisterClass *RRC = 0;
    uint8_t Cost = 1;
-  switch (VT.getSimpleVT().SimpleTy) {
+  switch (VT.SimpleTy) {
    default:
      return TargetLowering::findRepresentativeClass(VT);
    case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
@@ -1524,7 +1522,6 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
    return true;
  }
  
-
  //===----------------------------------------------------------------------===//
  //               Return Value Calling Convention Implementation
  //===----------------------------------------------------------------------===//
@@ -1696,8 +1693,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
    return true;
  }
  
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
                                              ISD::NodeType ExtendKind) const {
    MVT ReturnMVT;
    // TODO: Is this also valid on 32-bit?
@@ -1706,7 +1703,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
    else
      ReturnMVT = MVT::i32;
  
-  EVT MinVT = getRegisterType(Context, ReturnMVT);
+  MVT MinVT = getRegisterType(ReturnMVT);
    return VT.bitsLT(MinVT) ? MinVT : VT;
  }
  
@@ -1772,7 +1769,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
    return Chain;
  }
  
-
  //===----------------------------------------------------------------------===//
  //                C & StdCall & Fast Calling Convention implementation
  //===----------------------------------------------------------------------===//
@@ -2066,7 +2062,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                                         TotalNumIntRegs);
  
        bool NoImplicitFloatOps = Fn->getFnAttributes().
-        hasAttribute(Attributes::NoImplicitFloat);
+        hasAttribute(Attribute::NoImplicitFloat);
        assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
               "SSE register cannot be used when SSE is disabled!");
        assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2545,7 +2541,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        } else if (Subtarget->isPICStyleRIPRel() &&
                   isa<Function>(GV) &&
                   cast<Function>(GV)->getFnAttributes().
-                   hasAttribute(Attributes::NonLazyBind)) {
+                   hasAttribute(Attribute::NonLazyBind)) {
          // If the function is marked as non-lazy, generate an indirect call
          // which loads from the GOT directly. This avoids runtime overhead
          // at the cost of eager binding (and one extra byte of encoding).
@@ -2663,7 +2659,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           Ins, dl, DAG, InVals);
  }
  
-
  //===----------------------------------------------------------------------===//
  //                Fast Calling Convention (tail call) implementation
  //===----------------------------------------------------------------------===//
@@ -2972,7 +2967,6 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
    return X86::createFastISel(funcInfo, libInfo);
  }
  
-
  //===----------------------------------------------------------------------===//
  //                           Other Lowering Hooks
  //===----------------------------------------------------------------------===//
@@ -3083,7 +3077,6 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
    return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
  }
  
-
  bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                         bool hasSymbolicDisplacement) {
    // Offset should fit into 32 bit immediate field.
@@ -6735,7 +6728,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool HasInt256   = Subtarget->hasInt256();
    MachineFunction &MF = DAG.getMachineFunction();
    bool OptForSize = MF.getFunction()->getFnAttributes().
-    hasAttribute(Attributes::OptimizeForSize);
+    hasAttribute(Attribute::OptimizeForSize);
  
    assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
  
@@ -6996,7 +6989,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
                                  getShuffleCLImmediate(SVOp), DAG);
  
-
    //===--------------------------------------------------------------------===//
    // Since no target specific shuffle was selected for this generic one,
    // lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -7098,7 +7090,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
    return SDValue();
  }
  
-
  SDValue
  X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                             SelectionDAG &DAG) const {
@@ -7463,7 +7454,6 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc DL = Op.getDebugLoc();
    Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
-
    // With PIC, the address is actually $g + Offset.
    if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
        !Subtarget->is64Bit()) {
@@ -7850,7 +7840,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
    llvm_unreachable("TLS not implemented for this target.");
  }
  
-
  /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
  /// and take a 2 x i32 value to shift plus a shift amount.
  SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
@@ -9075,7 +9064,6 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
                       DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
  }
  
-
  SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond;
    SDValue Op0 = Op.getOperand(0);
@@ -9183,8 +9171,28 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    if (VT == MVT::v2i64) {
      if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
        return SDValue();
-    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
-      return SDValue();
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+      // pcmpeqd + pshufd + pand.
+      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+      // Do the compare.
+      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+      // Make sure the lower and upper halves are both all-ones.
+      const int Mask[] = { 1, 0, 3, 2 };
+      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
    }
  
    // Since SSE has no unsigned integer comparisons, we need to flip  the sign
@@ -9728,7 +9736,6 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
                       Chain, Dest, CC, Cond);
  }
  
-
  // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
  // Calls to _alloca is needed to probe the stack when allocating more than 4k
  // bytes in one go. Touching the stack at 4K increments is necessary to ensure
@@ -9892,7 +9899,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
      assert(!getTargetMachine().Options.UseSoftFloat &&
             !(DAG.getMachineFunction()
                  .getFunction()->getFnAttributes()
-                .hasAttribute(Attributes::NoImplicitFloat)) &&
+                .hasAttribute(Attribute::NoImplicitFloat)) &&
             Subtarget->hasSSE1());
    }
  
@@ -10097,6 +10104,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  
+  // SSE2/AVX2 sub with unsigned saturation intrinsics
+  case Intrinsic::x86_sse2_psubus_b:
+  case Intrinsic::x86_sse2_psubus_w:
+  case Intrinsic::x86_avx2_psubus_b:
+  case Intrinsic::x86_avx2_psubus_w:
+    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
    // SSE3/AVX horizontal add/sub intrinsics
    case Intrinsic::x86_sse3_hadd_ps:
    case Intrinsic::x86_sse3_hadd_pd:
@@ -10146,6 +10161,40 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
                         Op.getOperand(1), Op.getOperand(2));
    }
  
+  // SSE2/SSE41/AVX2 integer max/min intrinsics.
+  case Intrinsic::x86_sse2_pmaxu_b:
+  case Intrinsic::x86_sse41_pmaxuw:
+  case Intrinsic::x86_sse41_pmaxud:
+  case Intrinsic::x86_avx2_pmaxu_b:
+  case Intrinsic::x86_avx2_pmaxu_w:
+  case Intrinsic::x86_avx2_pmaxu_d:
+    return DAG.getNode(X86ISD::UMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse2_pminu_b:
+  case Intrinsic::x86_sse41_pminuw:
+  case Intrinsic::x86_sse41_pminud:
+  case Intrinsic::x86_avx2_pminu_b:
+  case Intrinsic::x86_avx2_pminu_w:
+  case Intrinsic::x86_avx2_pminu_d:
+    return DAG.getNode(X86ISD::UMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse41_pmaxsb:
+  case Intrinsic::x86_sse2_pmaxs_w:
+  case Intrinsic::x86_sse41_pmaxsd:
+  case Intrinsic::x86_avx2_pmaxs_b:
+  case Intrinsic::x86_avx2_pmaxs_w:
+  case Intrinsic::x86_avx2_pmaxs_d:
+    return DAG.getNode(X86ISD::SMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_sse41_pminsb:
+  case Intrinsic::x86_sse2_pmins_w:
+  case Intrinsic::x86_sse41_pminsd:
+  case Intrinsic::x86_avx2_pmins_b:
+  case Intrinsic::x86_avx2_pmins_w:
+  case Intrinsic::x86_avx2_pmins_d:
+    return DAG.getNode(X86ISD::SMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
    // AVX2 variable shift intrinsics
    case Intrinsic::x86_avx2_psllv_d:
    case Intrinsic::x86_avx2_psllv_q:
@@ -10733,7 +10782,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
  
          for (FunctionType::param_iterator I = FTy->param_begin(),
               E = FTy->param_end(); I != E; ++I, ++Idx)
-          if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
+          if (Attrs.getParamAttributes(Idx).hasAttribute(Attribute::InReg))
              // FIXME: should only count parameters that are lowered to integers.
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
@@ -10823,7 +10872,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  
-
    MachineMemOperand *MMO =
     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                             MachineMemOperand::MOStore, 2, 2);
@@ -10856,7 +10904,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                              DAG.getConstant(1, MVT::i16)),
                  DAG.getConstant(3, MVT::i16));
  
-
    return DAG.getNode((VT.getSizeInBits() < 16 ?
                        ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
  }
@@ -10985,17 +11032,43 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
  
  static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                          SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
    EVT VT = Op.getValueType();
  
    // Decompose 256-bit ops into smaller 128-bit ops.
    if (VT.is256BitVector() && !Subtarget->hasInt256())
      return Lower256IntArith(Op, DAG);
  
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
           "Only know how to lower V2I64/V4I64 multiply");
  
-  DebugLoc dl = Op.getDebugLoc();
-
    //  Ahi = psrlqi(a, 32);
    //  Bhi = psrlqi(b, 32);
    //
@@ -11007,9 +11080,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    //  AhiBlo = psllqi(AhiBlo, 32);
    //  return AloBlo + AloBhi + AhiBlo;
  
-  SDValue A = Op.getOperand(0);
-  SDValue B = Op.getOperand(1);
-
    SDValue ShAmt = DAG.getConstant(32, MVT::i32);
  
    SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
@@ -11386,7 +11456,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    }
  }
  
-
  static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
@@ -11471,7 +11540,6 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
  }
  
-
  static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
    EVT T = Op.getValueType();
@@ -11961,10 +12029,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PSIGN:              return "X86ISD::PSIGN";
    case X86ISD::BLENDV:             return "X86ISD::BLENDV";
    case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
    case X86ISD::HADD:               return "X86ISD::HADD";
    case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
+  case X86ISD::UMAX:               return "X86ISD::UMAX";
+  case X86ISD::UMIN:               return "X86ISD::UMIN";
+  case X86ISD::SMAX:               return "X86ISD::SMAX";
+  case X86ISD::SMIN:               return "X86ISD::SMIN";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
    case X86ISD::FMAXC:              return "X86ISD::FMAXC";
@@ -12017,7 +12090,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::OR:                 return "X86ISD::OR";
    case X86ISD::XOR:                return "X86ISD::XOR";
    case X86ISD::AND:                return "X86ISD::AND";
-  case X86ISD::ANDN:               return "X86ISD::ANDN";
    case X86ISD::BLSI:               return "X86ISD::BLSI";
    case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
    case X86ISD::BLSR:               return "X86ISD::BLSR";
@@ -12120,7 +12192,6 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
    return true;
  }
  
-
  bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
    if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
@@ -14318,7 +14389,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
  }
  
-
  /// PerformTruncateCombine - Converts truncate operation to
  /// a sequence of vector shuffle operations.
  /// It is possible when we truncate 256-bit vector to 128-bit vector
@@ -14338,51 +14408,42 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
  
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
      if (Subtarget->hasInt256()) {
-      // AVX2: v4i64 -> v4i32
-
-      // VPERMD
        static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
        Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
                                  ShufMask);
-
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
                           DAG.getIntPtrConstant(0));
      }
  
-    // AVX: v4i64 -> v4i32
+    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
      SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                                 DAG.getIntPtrConstant(0));
-
      SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                                 DAG.getIntPtrConstant(2));
  
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
  
-    // PSHUFD
+    // The PSHUFD mask:
      static const int ShufMask1[] = {0, 2, 0, 0};
-
      SDValue Undef = DAG.getUNDEF(VT);
      OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
      OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
  
-    // MOVLHPS
+    // The MOVLHPS mask:
      static const int ShufMask2[] = {0, 1, 4, 5};
-
      return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
    }
  
    if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
  
+    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
      if (Subtarget->hasInt256()) {
-      // AVX2: v8i32 -> v8i16
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
  
-      // PSHUFB
        SmallVector<SDValue,32> pshufbMask;
        for (unsigned i = 0; i < 2; ++i) {
          pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
@@ -14399,16 +14460,13 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
                                 &pshufbMask[0], 32);
        Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
-
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
  
        static const int ShufMask[] = {0,  2,  -1,  -1};
        Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
                                  &ShufMask[0]);
-
        Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                         DAG.getIntPtrConstant(0));
-
        return DAG.getNode(ISD::BITCAST, dl, VT, Op);
      }
  
@@ -14421,7 +14479,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
  
-    // PSHUFB
+    // The PSHUFB mask:
      static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                     -1, -1, -1, -1, -1, -1, -1, -1};
  
@@ -14432,9 +14490,8 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
  
-    // MOVLHPS
+    // The MOVLHPS Mask:
      static const int ShufMask2[] = {0, 1, 4, 5};
-
      SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
    }
@@ -14633,6 +14690,76 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+/// \brief Matches VSELECT(Cond, LHS, RHS) to an X86ISD min/max opcode, else 0.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+                                   SDValue RHS, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  if (!VT.isVector())
+    return 0;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return 0; // Any vector type not listed below is rejected.
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+    if (!Subtarget->hasAVX2()) // 256-bit types need AVX2, then fall through.
+      return 0;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+    if (!Subtarget->hasSSE2()) // Also reached by fallthrough from above.
+      return 0;
+  }
+
+  // Without SSE4.1, SSE2 only has unsigned v16i8 and signed v8i16 min/max.
+  bool hasUnsigned = Subtarget->hasSSE41() ||
+                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
+  bool hasSigned = Subtarget->hasSSE41() ||
+                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Check for x CC y ? x : y -- less-than selects min, greater-than max.
+  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    switch (CC) {
+    default: break; // Other predicates reach the final return 0.
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    }
+  // Check for x CC y ? y : x -- reversed arms, so min and max are swapped.
+  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+    switch (CC) {
+    default: break; // Other predicates reach the final return 0.
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    }
+  }
+
+  return 0; // No min/max pattern matched.
+}
+
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
  /// nodes.
  static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -14913,6 +15040,71 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  // Match VSELECTs into subs with unsigned saturation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x >  y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      // If the RHS is a constant we have to reverse the const canonicalization.
+      // x > C-1 ? x+-C : 0 --> subus x, C
+      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (CondRHS.getConstantOperandVal(0) == -A-1) {
+          SmallVector<SDValue, 32> V(VT.getVectorNumElements(),
+                                     DAG.getConstant(-A, VT.getScalarType()));
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+                             DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
+                                         V.data(), V.size()));
+        }
+      }
+
+      // Another special case: If C was a sign bit, the sub has been
+      // canonicalized into a xor.
+      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+      //        it's safe to decanonicalize the xor?
+      // x s< 0 ? x^C : 0 --> subus x, C
+      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+          isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (A.isSignBit())
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+      }
+    }
+  }
+
+  // Try to match a min/max vector operation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -15210,7 +15402,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
-
  /// PerformMulCombine - Optimize a single multiply with constant into two
  /// in order to implement it with two cheaper instructions, e.g.
  /// LEA + SHL, LEA + LEA.
@@ -15299,7 +15490,6 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
      }
    }
  
-
    // Hardware support for vector shifts is sparse which makes us scalarize the
    // vector operations in many cases. Also, on sandybridge ADD is faster than
    // shl.
@@ -15443,7 +15633,6 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
    }
  }
  
-
  // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
  // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
  // and friends.  Likewise for OR -> CMPNEQSS.
@@ -15564,7 +15753,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
  
    EVT VT = N->getValueType(0);
  
-  // Create ANDN, BLSI, and BLSR instructions
+  // Create BLSI and BLSR instructions
    // BLSI is X & (-X)
    // BLSR is X & (X-1)
    if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
@@ -15572,13 +15761,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
      SDValue N1 = N->getOperand(1);
      DebugLoc DL = N->getDebugLoc();
  
-    // Check LHS for not
-    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
-    // Check RHS for not
-    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
-
      // Check LHS for neg
      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
          isZero(N0.getOperand(0)))
@@ -15868,11 +16050,14 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    ISD::LoadExtType Ext = Ld->getExtensionType();
  
    // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. We need SSSE3 shuffles.
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and an arithmetic shift.
    // TODO: It is possible to support ZExt by zeroing the undef values
    // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() &&
-      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
      assert(MemVT != RegVT && "Cannot extend to the same type");
      assert(MemVT.isVector() && "Must load a vector from memory");
  
@@ -15881,6 +16066,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      unsigned MemSz = MemVT.getSizeInBits();
      assert(RegSz > MemSz && "Register size must be greater than the mem size");
  
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
      // All sizes must be a power of two.
      if (!isPowerOf2_32(RegSz * MemSz * NumElems))
        return SDValue();
@@ -15904,16 +16092,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      // Calculate the number of scalar loads that we need to perform
      // in order to load our vector from memory.
      unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegZize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegZize /= 2;
  
      // Represent our vector as a sequence of elements which are the
      // largest scalar that we can load.
      EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-      RegSz/SclrLoadTy.getSizeInBits());
+      loadRegZize/SclrLoadTy.getSizeInBits());
  
      // Represent the data using the same element type that is stored in
      // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                                  RegSz/MemVT.getScalarType().getSizeInBits());
+    EVT WideVecVT = 
+         EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize/MemVT.getScalarType().getSizeInBits());
  
      assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
        "Invalid vector type");
@@ -15954,6 +16149,41 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
      unsigned SizeRatio = RegSz/MemSz;
  
+    if (Ext == ISD::SEXTLOAD) {
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      SmallVector<SDValue, 8> C(NumElems,
+                                DAG.getConstant(Amt, RegVT.getScalarType()));
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size());
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV);
+
+      return DCI.CombineTo(N, Shuff, TF, true);
+    }
+
      // Redistribute the loaded elements into the different locations.
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
@@ -16087,7 +16317,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                 Chains.size());
    }
  
-
    // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
    // the FP state in cases where an emms may be missing.
    // A preferable solution to the general problem is to figure out the right
@@ -16099,7 +16328,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
    const Function *F = DAG.getMachineFunction().getFunction();
    bool NoImplicitFloatOps = F->getFnAttributes().
-    hasAttribute(Attributes::NoImplicitFloat);
+    hasAttribute(Attribute::NoImplicitFloat);
    bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                       && Subtarget->hasSSE2();
    if ((VT.isVector() ||
@@ -16393,7 +16622,6 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
                       N->getOperand(0), N->getOperand(1));
  }
  
-
  /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
  static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
    // FAND(0.0, x) -> 0.0
@@ -17131,8 +17359,6 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    return false;
  }
  
-
-
  /// getConstraintType - Given a constraint letter, return the type of
  /// constraint it is for this target.
  X86TargetLowering::ConstraintType
@@ -17741,6 +17967,29 @@ X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
    return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
  }
  
+unsigned
+X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) const {
+  // Legalize the type; Alignment and AddressSpace do not affect the cost.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  const X86Subtarget &ST =
+  TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // Each load/store unit costs 1, so the base cost is LT.first.
+  unsigned Cost = LT.first * 1;
+
+  // Without AVX2 (e.g. Sandybridge), load/stores wider than 128 bits are
+  // double pumped, so they cost twice as much (not the case on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST.hasAVX2())
+    Cost*=2;
+
+  return Cost;
+}
+
  unsigned
  X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) const {
@@ -17806,10 +18055,10 @@ unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
      { ISD::SETCC,   MVT::v32i8,   1 },
    };
  
-  if (ST.hasSSE42()) {
-    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+  if (ST.hasAVX2()) {
+    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
      if (Idx != -1)
-      return LT.first * SSE42CostTbl[Idx].Cost;
+      return LT.first * AVX2CostTbl[Idx].Cost;
    }
  
    if (ST.hasAVX()) {
@@ -17818,10 +18067,10 @@ unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
        return LT.first * AVX1CostTbl[Idx].Cost;
    }
  
-  if (ST.hasAVX2()) {
-    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+  if (ST.hasSSE42()) {
+    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
      if (Idx != -1)
-      return LT.first * AVX2CostTbl[Idx].Cost;
+      return LT.first * SSE42CostTbl[Idx].Cost;
    }
  
    return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);