[CMake] X86AsmParser: Prune redundant LINK_LIBS.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index dc5c0d1e352bd31c4517e400d089c4d91471f3d1..517295aedbd7d7a454d8ca546c1ad141e90988f7 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -164,14 +164,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
  
    if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+    if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512())
+      // f32/f64 are legal, f80 is custom.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
+    else
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
      setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    } else if (!Subtarget->useSoftFloat()) {
      // We have an algorithm for SSE2->double, and we turn this into a
      // 64-bit FILD followed by conditional FADD for other targets.
      setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
      // We have an algorithm for SSE2, and we turn this into a 64-bit
-    // FILD for other targets.
+    // FILD or VCVTUSI2SS/SD for other targets.
      setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
    }
  
@@ -417,8 +421,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
    }
    setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
-  setOperationAction(ISD::CATCHRET        , MVT::Other, Custom);
-  setOperationAction(ISD::CLEANUPRET      , MVT::Other, Custom);
    // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
    // SjLj exception handling but a light-weight setjmp/longjmp replacement to
    // support continuation, user-level threading, and etc.. As a result, no
@@ -849,6 +851,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
      setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
  
+    setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
+    // ISD::CTTZ v2i64 - scalarization is faster.
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::v4i32, Custom);
+    // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
+
      // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
      for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
        MVT VT = (MVT::SimpleValueType)i;
@@ -1129,6 +1140,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
      setOperationAction(ISD::CTPOP,             MVT::v4i64, Custom);
  
+    setOperationAction(ISD::CTTZ,              MVT::v32i8, Custom);
+    setOperationAction(ISD::CTTZ,              MVT::v16i16, Custom);
+    setOperationAction(ISD::CTTZ,              MVT::v8i32, Custom);
+    setOperationAction(ISD::CTTZ,              MVT::v4i64, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v32i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v16i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v8i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v4i64, Custom);
+
      if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
        setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
        setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
@@ -1259,7 +1279,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      if (Subtarget->hasInt256())
        setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
  
-
      // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
      for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
        MVT VT = (MVT::SimpleValueType)i;
@@ -1337,13 +1356,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
      setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
  
-    // FIXME:  [US]INT_TO_FP are not legal for f80.
-    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
-    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
-    if (Subtarget->is64Bit()) {
-      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
-      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
-    }
      setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
      setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
      setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
@@ -1501,6 +1513,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i64, Legal);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v16i32, Legal);
+
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
      }
      if (Subtarget->hasVLX() && Subtarget->hasCDI()) {
        setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
@@ -1511,6 +1526,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v8i32, Legal);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v2i64, Legal);
        setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::v4i32, Legal);
+
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
      }
      if (Subtarget->hasDQI()) {
        setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
@@ -1696,7 +1716,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::UMULO, VT, Custom);
    }
  
-
    if (!Subtarget->is64Bit()) {
      // These libcalls are not available in 32-bit.
      setLibcallName(RTLIB::SHL_I128, nullptr);
@@ -1766,7 +1785,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    MaxStoresPerMemmoveOptSize = 4;
    setPrefLoopAlignment(4); // 2^4 bytes.
  
-  // Predictable cmov don't hurt on atom because it's in-order.
+  // A predictable cmov does not hurt on an in-order CPU.
+  // FIXME: Use a CPU attribute to trigger this, not a CPU model.
    PredictableSelectIsExpensive = !Subtarget->isAtom();
    EnableExtLdPromotion = true;
    setPrefFunctionAlignment(4); // 2^4 bytes.
@@ -2060,6 +2080,29 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
    return true;
  }
  
+/// Android provides a fixed TLS slot for the SafeStack pointer.
+/// See the definition of TLS_SLOT_SAFESTACK in
+/// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+bool X86TargetLowering::getSafeStackPointerLocation(unsigned &AddressSpace,
+                                                    unsigned &Offset) const {
+  if (!Subtarget->isTargetAndroid())
+    return false; // Non-Android target: both out-parameters are left untouched.
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x48;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256; // %gs-relative.
+    else
+      AddressSpace = 257; // %fs-relative.
+  } else {
+    // %gs:0x24 on i386
+    Offset = 0x24;
+    AddressSpace = 256; // %gs-relative.
+  }
+  return true; // AddressSpace and Offset now describe the slot.
+}
+
  bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
    assert(SrcAS != DestAS && "Expected different address spaces!");
@@ -2073,11 +2116,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
  
  #include "X86GenCallingConv.inc"
  
-bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
-                                  MachineFunction &MF, bool isVarArg,
-                        const SmallVectorImpl<ISD::OutputArg> &Outs,
-                        LLVMContext &Context) const {
+bool X86TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, RetCC_X86);
@@ -2510,15 +2551,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
    return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
  }
  
-SDValue
-X86TargetLowering::LowerFormalArguments(SDValue Chain,
-                                        CallingConv::ID CallConv,
-                                        bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                        SDLoc dl,
-                                        SelectionDAG &DAG,
-                                        SmallVectorImpl<SDValue> &InVals)
-                                          const {
+SDValue X86TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
@@ -3529,17 +3565,12 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
  
  /// Check whether the call is eligible for tail call optimization. Targets
  /// that want to do tail call optimization should implement this function.
-bool
-X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
-                                                     CallingConv::ID CalleeCC,
-                                                     bool isVarArg,
-                                                     bool isCalleeStructRet,
-                                                     bool isCallerStructRet,
-                                                     Type *RetTy,
-                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                    const SmallVectorImpl<SDValue> &OutVals,
-                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                                     SelectionDAG &DAG) const {
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
    if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
      return false;
  
@@ -6463,7 +6494,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op,
    return LowerAVXCONCAT_VECTORS(Op, DAG);
  }
  
-
  //===----------------------------------------------------------------------===//
  // Vector shuffle lowering
  //
@@ -7336,44 +7366,81 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
  ///
  /// Given a specific number of elements, element bit width, and extension
  /// stride, produce either a zero or any extension based on the available
-/// features of the subtarget.
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from an offset element index in the input; to avoid excess
+/// shuffling, the offset must either be in the bottom lane or at the
+/// start of a higher lane. All extended elements must be from the same
+/// lane.
  static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
+    SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
      ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
    assert(Scale > 1 && "Need a scale to extend.");
-  int NumElements = VT.getVectorNumElements();
    int EltBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = 128 / EltBits;
+  int OffsetLane = Offset / NumEltsPerLane;
    assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
           "Only 8, 16, and 32 bit elements can be extended.");
    assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+  assert(0 <= Offset && "Extension offset must be positive.");
+  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+         "Extension offset must be in the first lane or start an upper lane.");
+
+  // Check that an index is in same lane as the base offset.
+  auto SafeOffset = [&](int Idx) {
+    return OffsetLane == (Idx / NumEltsPerLane);
+  };
+
+  // Shift along an input so that the offset base moves to the first element.
+  auto ShuffleOffset = [&](SDValue V) {
+    if (!Offset)
+      return V;
+
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = 0; i * Scale < NumElements; ++i) {
+      int SrcIdx = i + Offset;
+      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+    }
+    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+  };
  
    // Found a valid zext mask! Try various lowering strategies based on the
    // input type and available ISA extensions.
    if (Subtarget->hasSSE41()) {
+    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
+    // PUNPCK will catch this in a later shuffle match.
+    if (Offset && Scale == 2 && VT.getSizeInBits() == 128)
+      return SDValue();
      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                   NumElements / Scale);
-    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+    return DAG.getBitcast(VT, InputV);
    }
  
+  assert(VT.getSizeInBits() == 128 && "Only 128-bit vectors can be extended.");
+
    // For any extends we can cheat for larger element sizes and use shuffle
    // instructions that can fold with a load and/or copy.
    if (AnyExt && EltBits == 32) {
-    int PSHUFDMask[4] = {0, -1, 1, -1};
+    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+                         -1};
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                          DAG.getBitcast(MVT::v4i32, InputV),
                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }
    if (AnyExt && EltBits == 16 && Scale > 2) {
-    int PSHUFDMask[4] = {0, -1, 0, -1};
+    int PSHUFDMask[4] = {Offset / 2, -1,
+                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
      InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                           DAG.getBitcast(MVT::v4i32, InputV),
                           getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
-    int PSHUFHWMask[4] = {1, -1, -1, -1};
+    int PSHUFWMask[4] = {1, -1, -1, -1};
+    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
      return DAG.getBitcast(
-        VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                          DAG.getBitcast(MVT::v8i16, InputV),
-                        getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
    }
  
    // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
@@ -7382,18 +7449,21 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
      assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
      assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
  
+    int LoIdx = Offset * EltBits;
      SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
                               DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                           DAG.getConstant(EltBits, DL, MVT::i8),
-                                         DAG.getConstant(0, DL, MVT::i8)));
-    if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+                                         DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+        !SafeOffset(Offset + 1))
        return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
  
-    SDValue Hi =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                    DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
-                                DAG.getConstant(EltBits, DL, MVT::i8),
-                                DAG.getConstant(EltBits, DL, MVT::i8)));
+    int HiIdx = (Offset + 1) * EltBits;
+    SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                             DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                         DAG.getConstant(EltBits, DL, MVT::i8),
+                                         DAG.getConstant(HiIdx, DL, MVT::i8)));
      return DAG.getNode(ISD::BITCAST, DL, VT,
                         DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
    }
@@ -7404,9 +7474,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
      assert(NumElements == 16 && "Unexpected byte vector width!");
      SDValue PSHUFBMask[16];
-    for (int i = 0; i < 16; ++i)
-      PSHUFBMask[i] =
-          DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
+    for (int i = 0; i < 16; ++i) {
+      int Idx = Offset + (i / Scale);
+      PSHUFBMask[i] = DAG.getConstant(
+          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+    }
      InputV = DAG.getBitcast(MVT::v16i8, InputV);
      return DAG.getBitcast(VT,
                            DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
@@ -7414,13 +7486,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
                                                    MVT::v16i8, PSHUFBMask)));
    }
  
+  // If we are extending from an offset, ensure we start on a boundary that
+  // we can unpack from.
+  int AlignToUnpack = Offset % (NumElements / Scale);
+  if (AlignToUnpack) {
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = AlignToUnpack; i < NumElements; ++i)
+      ShMask[i - AlignToUnpack] = i;
+    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+    Offset -= AlignToUnpack;
+  }
+
    // Otherwise emit a sequence of unpacks.
    do {
+    unsigned UnpackLoHi = X86ISD::UNPCKL;
+    if (Offset >= (NumElements / 2)) {
+      UnpackLoHi = X86ISD::UNPCKH;
+      Offset -= (NumElements / 2);
+    }
+
      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
      SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                           : getZeroVector(InputVT, Subtarget, DAG, DL);
      InputV = DAG.getBitcast(InputVT, InputV);
-    InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
      Scale /= 2;
      EltBits *= 2;
      NumElements /= 2;
@@ -7446,7 +7535,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  
    int Bits = VT.getSizeInBits();
+  int NumLanes = Bits / 128;
    int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = NumElements / NumLanes;
    assert(VT.getScalarSizeInBits() <= 32 &&
           "Exceeds 32-bit integer zero extension limit");
    assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
@@ -7456,8 +7547,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    auto Lower = [&](int Scale) -> SDValue {
      SDValue InputV;
      bool AnyExt = true;
+    int Offset = 0;
+    int Matches = 0;
      for (int i = 0; i < NumElements; ++i) {
-      if (Mask[i] == -1)
+      int M = Mask[i];
+      if (M == -1)
          continue; // Valid anywhere but doesn't tell us anything.
        if (i % Scale != 0) {
          // Each of the extended elements need to be zeroable.
@@ -7471,14 +7565,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
  
        // Each of the base elements needs to be consecutive indices into the
        // same input vector.
-      SDValue V = Mask[i] < NumElements ? V1 : V2;
-      if (!InputV)
+      SDValue V = M < NumElements ? V1 : V2;
+      M = M % NumElements;
+      if (!InputV) {
          InputV = V;
-      else if (InputV != V)
+        Offset = M - (i / Scale);
+      } else if (InputV != V)
          return SDValue(); // Flip-flopping inputs.
  
-      if (Mask[i] % NumElements != i / Scale)
+      // Offset must start in the lowest 128-bit lane or at the start of an
+      // upper lane.
+      // FIXME: Is it ever worth allowing a negative base offset?
+      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+            (Offset % NumEltsPerLane) == 0))
+        return SDValue();
+
+      // If we are offsetting, all referenced entries must come from the same
+      // lane.
+      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+        return SDValue();
+
+      if ((M % NumElements) != (Offset + (i / Scale)))
          return SDValue(); // Non-consecutive strided elements.
+      Matches++;
      }
  
      // If we fail to find an input, we have a zero-shuffle which should always
@@ -7487,8 +7596,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
      if (!InputV)
        return SDValue();
  
+    // If we are offsetting, don't extend if we only match a single input; we
+    // can always do better by using a basic PSHUF or PUNPCK.
+    if (Offset != 0 && Matches < 2)
+      return SDValue();
+
      return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-        DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
+        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
    };
  
    // The widest scale possible for extending is to a 64-bit integer.
@@ -10561,12 +10675,12 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                DL, VT, V1, V2, Mask, Subtarget, DAG))
        return Insertion;
  
-  // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
-  // check for those subtargets here and avoid much of the subtarget querying in
-  // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
-  // ability to manipulate a 256-bit vector with integer types. Since we'll use
-  // floating point types there eventually, just immediately cast everything to
-  // a float and operate entirely in that domain.
+  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+  // can check for those subtargets here and avoid much of the subtarget
+  // querying in the per-vector-type lowering routines. With AVX1 we have
+  // essentially *zero* ability to manipulate a 256-bit vector with integer
+  // types. Since we'll use floating point types there eventually, just
+  // immediately cast everything to a float and operate entirely in that domain.
    if (VT.isInteger() && !Subtarget->hasAVX2()) {
      int ElementBits = VT.getScalarSizeInBits();
      if (ElementBits < 32)
@@ -12492,6 +12606,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
  
    MVT SrcVT = N0.getSimpleValueType();
    MVT DstVT = Op.getSimpleValueType();
+
+  if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) {
+    // Conversions from unsigned i32 to f32/f64 are legal,
+    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
+    return Op;
+  }
+
    if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
      return LowerUINT_TO_FP_i64(Op, DAG);
    if (SrcVT == MVT::i32 && X86ScalarSSEf64)
@@ -12657,7 +12779,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
      // for DAG type consistency we have to match the FP operand type.
  
      APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
-    APFloat::opStatus Status = APFloat::opOK;
+    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
      bool LosesInfo = false;
      if (TheVT == MVT::f64)
        // The rounding mode is irrelevant as the conversion should be exact.
@@ -15697,6 +15819,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
        case X86ISD::CMPM:
        case X86ISD::CMPMU:
          return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+      case X86ISD::VFPCLASS:
+        return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
        case X86ISD::VTRUNC:
        case X86ISD::VTRUNCS:
        case X86ISD::VTRUNCUS:
@@ -15722,17 +15846,20 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                      SDValue PreservedSrc,
                                      const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
-    if (isAllOnes(Mask))
-      return Op;
+  if (isAllOnes(Mask))
+    return Op;
  
-    EVT VT = Op.getValueType();
-    SDLoc dl(Op);
-    // The mask should be of type MVT::i1
-    SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  // The mask should be of type MVT::i1
+  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
  
-    if (PreservedSrc.getOpcode() == ISD::UNDEF)
-      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
-    return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+  if (Op.getOpcode() == X86ISD::FSETCC)
+    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+
+  if (PreservedSrc.getOpcode() == ISD::UNDEF)
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+  return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
  }
  
  static int getSEHRegistrationNodeSize(const Function *Fn) {
@@ -15856,6 +15983,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                    Mask, PassThru, Subtarget, DAG);
      }
+    case INTR_TYPE_SCALAR_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue passThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+                                  Mask, passThru, Subtarget, DAG);
+    }
      case INTR_TYPE_SCALAR_MASK_RM: {
        SDValue Src1 = Op.getOperand(1);
        SDValue Src2 = Op.getOperand(2);
@@ -15906,7 +16041,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue Src2 = Op.getOperand(2);
        SDValue PassThru = Op.getOperand(3);
        SDValue Mask = Op.getOperand(4);
-      // We specify 2 possible modes for intrinsics, with/without rounding modes.
+      // We specify 2 possible modes for intrinsics, with/without rounding
+      // modes.
        // First, we check if the intrinsic have rounding mode (6 operands),
        // if not, we set rounding mode to "current".
        SDValue Rnd;
@@ -15936,7 +16072,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
        SDValue Imm = Op.getOperand(3);
        SDValue PassThru = Op.getOperand(4);
        SDValue Mask = Op.getOperand(5);
-      // We specify 2 possible modes for intrinsics, with/without rounding modes.
+      // We specify 2 possible modes for intrinsics, with/without rounding
+      // modes.
        // First, we check if the intrinsic have rounding mode (7 operands),
        // if not, we set rounding mode to "current".
        SDValue Rnd;
@@ -15949,7 +16086,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
          Mask, PassThru, Subtarget, DAG);
      }
      case INTR_TYPE_3OP_IMM8_MASK:
-    case INTR_TYPE_3OP_MASK: {
+    case INTR_TYPE_3OP_MASK:
+    case INSERT_SUBVEC: {
        SDValue Src1 = Op.getOperand(1);
        SDValue Src2 = Op.getOperand(2);
        SDValue Src3 = Op.getOperand(3);
@@ -15958,6 +16096,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
  
        if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
          Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+      else if (IntrData->Type == INSERT_SUBVEC) {
+        // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
+        assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
+        unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
+        Imm *= Src2.getValueType().getVectorNumElements();
+        Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
+      }
+
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have non-default rounding mode,
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -16014,6 +16160,25 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                Src1, Src2, Src3),
                                    Mask, PassThru, Subtarget, DAG);
      }
+    case FPCLASS: {
+      // FPclass intrinsics with mask
+       SDValue Src1 = Op.getOperand(1);
+       EVT VT = Src1.getValueType();
+       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                      VT.getVectorNumElements());
+       SDValue Imm = Op.getOperand(2);
+       SDValue Mask = Op.getOperand(3);
+       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                        Mask.getValueType().getSizeInBits());
+       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+       SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+                                                 DAG.getTargetConstant(0, dl, MaskVT),
+                                                 Subtarget, DAG);
+       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                 DAG.getUNDEF(BitcastVT), FPclassMask,
+                                 DAG.getIntPtrConstant(0, dl));
+       return DAG.getBitcast(Op.getValueType(), Res);
+    }
      case CMP_MASK:
      case CMP_MASK_CC: {
        // Comparison intrinsics with masks.
@@ -16064,6 +16229,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                  DAG.getIntPtrConstant(0, dl));
        return DAG.getBitcast(Op.getValueType(), Res);
      }
+    case CMP_MASK_SCALAR_CC: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+      SDValue Mask = Op.getOperand(4);
+
+      SDValue Cmp;
+      if (IntrData->Opc1 != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+            X86::STATIC_ROUNDING::CUR_DIRECTION)
+          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+      }
+      // Default rounding mode.
+      if(!Cmp.getNode())
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+                                             DAG.getTargetConstant(0, dl,
+                                                                   MVT::i1),
+                                             Subtarget, DAG);
+
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
+                         DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
+                         DAG.getValueType(MVT::i1));
+    }
      case COMI: { // Comparison intrinsics
        ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
        SDValue LHS = Op.getOperand(1);
@@ -16922,41 +17113,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
                       DAG.getRegister(StoreAddrReg, PtrVT));
  }
  
-SDValue X86TargetLowering::LowerCATCHRET(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  SDValue Dest = Op.getOperand(1);
-  SDLoc DL(Op);
-
-  MVT PtrVT = getPointerTy(DAG.getDataLayout());
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  if (isAsynchronousEHPersonality(
-          classifyEHPersonality(MF.getFunction()->getPersonalityFn()))) {
-    // For SEH, codegen catchret as a branch for now.
-    // FIXME: Insert something to restore the frame.
-    return DAG.getNode(ISD::BR, DL, MVT::Other, Chain, Dest);
-  }
-
-  unsigned ReturnReg = (PtrVT == MVT::i64 ? X86::RAX : X86::EAX);
-
-  // Load the address of the destination block.
-  // FIXME: Do this without creating a BlockAddress.
-  MachineBasicBlock *DestMBB = cast<BasicBlockSDNode>(Dest)->getBasicBlock();
-  BlockAddress *BA =
-      BlockAddress::get(const_cast<Function *>(MF.getFunction()),
-                        const_cast<BasicBlock *>(DestMBB->getBasicBlock()));
-  DestMBB->setHasAddressTaken();
-  SDValue BlockPtr = DAG.getBlockAddress(BA, PtrVT);
-  Chain = DAG.getCopyToReg(Chain, DL, ReturnReg, BlockPtr);
-  return DAG.getNode(X86ISD::CATCHRET, DL, MVT::Other, Chain,
-                     DAG.getRegister(ReturnReg, PtrVT));
-}
-
-SDValue X86TargetLowering::LowerCLEANUPRET(SDValue Op, SelectionDAG &DAG) const {
-  return DAG.getNode(X86ISD::CLEANUPRET, SDLoc(Op), MVT::Other,
-                     Op.getOperand(0));
-}
-
  SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                                 SelectionDAG &DAG) const {
    SDLoc DL(Op);
@@ -17259,13 +17415,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
  
  static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getSimpleValueType();
-  unsigned NumBits = VT.getSizeInBits();
+  unsigned NumBits = VT.getScalarSizeInBits();
    SDLoc dl(Op);
-  Op = Op.getOperand(0);
+
+  if (VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    SDValue N0 = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+
+    // lsb(x) = (x & -x)
+    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
+                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
+
+    // cttz_undef(x) = (width - 1) - ctlz(lsb)
+    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+        TLI.isOperationLegal(ISD::CTLZ, VT)) {
+      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
+                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
+    }
+
+    // cttz(x) = ctpop(lsb - 1)
+    SDValue One = DAG.getConstant(1, dl, VT);
+    return DAG.getNode(ISD::CTPOP, dl, VT,
+                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
+  }
+
+  assert(Op.getOpcode() == ISD::CTTZ &&
+         "Only scalar CTTZ requires custom lowering");
  
    // Issue a bsf (scan bits forward) which also sets EFLAGS.
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
  
    // If src is zero (i.e. bsf sets ZF), returns NumBits.
    SDValue Ops[] = {
@@ -19198,8 +19380,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                                  return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
-  case ISD::CLEANUPRET:         return LowerCLEANUPRET(Op, DAG);
-  case ISD::CATCHRET:           return LowerCATCHRET(Op, DAG);
    case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
    case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
    case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
@@ -19207,7 +19387,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
    case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
    case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
-  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
    case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
    case ISD::UMUL_LOHI:
    case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
@@ -19537,8 +19718,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
    case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
    case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
-  case X86ISD::CATCHRET:           return "X86ISD::CATCHRET";
-  case X86ISD::CLEANUPRET:         return "X86ISD::CLEANUPRET";
    case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
    case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
    case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
@@ -19683,6 +19862,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
    case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
    case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
+  case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
    }
    return nullptr;
  }
@@ -22013,26 +22193,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
  
    // See if we can recurse into the operand to combine more things.
    switch (Op.getOpcode()) {
-    case X86ISD::PSHUFB:
-      HasPSHUFB = true;
-    case X86ISD::PSHUFD:
-    case X86ISD::PSHUFHW:
-    case X86ISD::PSHUFLW:
-      if (Op.getOperand(0).hasOneUse() &&
-          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                        HasPSHUFB, DAG, DCI, Subtarget))
-        return true;
-      break;
+  case X86ISD::PSHUFB:
+    HasPSHUFB = true;
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+    if (Op.getOperand(0).hasOneUse() &&
+        combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                      HasPSHUFB, DAG, DCI, Subtarget))
+      return true;
+    break;
  
-    case X86ISD::UNPCKL:
-    case X86ISD::UNPCKH:
-      assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
-      // We can't check for single use, we have to check that this shuffle is the only user.
-      if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
-          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                        HasPSHUFB, DAG, DCI, Subtarget))
-          return true;
-      break;
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+    assert(Op.getOperand(0) == Op.getOperand(1) &&
+           "We only combine unary shuffles!");
+    // We can't check for single use, we have to check that this shuffle is the
+    // only user.
+    if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+        combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                      HasPSHUFB, DAG, DCI, Subtarget))
+      return true;
+    break;
    }
  
    // Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -22225,7 +22407,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    return V;
  }
  
-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
  ///
  /// We walk up the chain, skipping shuffles of the other half and looking
  /// through shuffles which switch halves trying to find a shuffle of the same
@@ -22714,9 +22897,9 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
        InputVector.getOpcode() == ISD::BITCAST &&
        dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
      uint64_t ExtractedElt =
-         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      uint64_t InputValue =
-         cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
      uint64_t Res = (InputValue >> ExtractedElt) & 1;
      return DAG.getConstant(Res, dl, MVT::i1);
    }
@@ -24252,6 +24435,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
    return DAG.getBitcast(N0.getValueType(), NewShuffle);
  }
  
+/// If both operands of an integer AND/OR/XOR are bitcasts from the same scalar
+/// FP type (f32 with SSE1, f64 with SSE2), perform the logic op in the FP
+/// domain to avoid unnecessary moves from SSE to integer registers.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+                                        const X86Subtarget *Subtarget) {
+  unsigned FPOpcode = ISD::DELETED_NODE;
+  if (N->getOpcode() == ISD::AND)
+    FPOpcode = X86ISD::FAND;
+  else if (N->getOpcode() == ISD::OR)
+    FPOpcode = X86ISD::FOR;
+  else if (N->getOpcode() == ISD::XOR)
+    FPOpcode = X86ISD::FXOR;
+
+  assert(FPOpcode != ISD::DELETED_NODE &&
+         "Unexpected input node for FP logic conversion");
+
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  // isFloatingPoint() also accepts FP vectors (e.g. v2f32 -> i64), and the two
+  // sources may differ in type, so require one matching scalar FP type whose
+  // logic ops the subtarget can select.
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+  EVT FPVT = N00.getValueType();
+  if (FPVT != N10.getValueType() ||
+      !((Subtarget->hasSSE1() && FPVT == MVT::f32) ||
+        (Subtarget->hasSSE2() && FPVT == MVT::f64)))
+    return SDValue();
+  return DAG.getBitcast(VT, DAG.getNode(FPOpcode, SDLoc(N), FPVT, N00, N10));
+}
+
  static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -24264,6 +24482,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
    if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
      return R;
  
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
    EVT VT = N->getValueType(0);
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
@@ -24324,6 +24545,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
      return R;
  
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
@@ -24566,6 +24790,9 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
      if (SDValue RV = performIntegerAbsCombine(N, DAG))
        return RV;
  
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
    return SDValue();
  }
  
@@ -24657,8 +24884,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
        ShuffleVec[i] = i * SizeRatio;
  
      // Can't shuffle using an illegal type.
-    assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
-           && "WideVecVT should be legal");
+    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+           "WideVecVT should be legal");
      WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                      DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
    }
@@ -24700,7 +24927,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
                                       ISD::NON_EXTLOAD);
    SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
    return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
-
  }
  /// PerformMSTORECombine - Resolve truncating stores
  static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
@@ -24750,8 +24976,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
      ShuffleVec[i] = i * SizeRatio;
  
    // Can't shuffle using an illegal type.
-  assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
-         && "WideVecVT should be legal");
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+         "WideVecVT should be legal");
  
    SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                          DAG.getUNDEF(WideVecVT),