lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
   62 STATISTIC(NumTailCalls, "Number of tail calls");
   63 // Hidden, default false: legalize vector types by widening, not promotion.
   64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
   65     "x86-experimental-vector-widening-legalization", cl::init(false),
   66     cl::desc("Enable an experimental vector type legalization through widening "
   67              "rather than promotion."),
   68     cl::Hidden);
   69 // Hidden, default true: take the experimental vector shuffle lowering path.
   70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
   71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
   72     cl::desc("Enable an experimental vector shuffle lowering code path."),
   73     cl::Hidden);
   74 // Hidden, default false: shuffle legality tied to the experimental lowering.
   75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
   76     "x86-experimental-vector-shuffle-legality", cl::init(false),
   77     cl::desc("Enable experimental shuffle legality based on the experimental "
   78              "shuffle lowering. Should only be used with the experimental "
   79              "shuffle lowering."),
   80     cl::Hidden);
   81 // Not hidden, default 1: Newton-Raphson steps refining reciprocal estimates.
   82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
   83     "x86-recip-refinement-steps", cl::init(1),
   84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
   85              "result of the hardware reciprocal estimate instruction."),
   86     cl::NotHidden);
   87
   88 // Forward declarations.
   89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
   90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
  574   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and source
1134     // memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 /// Implemented only for 64-bit MachO so far; every other target gets false.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() ? Subtarget->is64Bit() : false;
1730 }
1731
1732 /// Under experimental widening, widen non-i1 vectors with element count != 1.
1733 TargetLoweringBase::LegalizeTypeAction
1734 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1735   if (!ExperimentalVectorWideningLegalization ||
1736       VT.getVectorNumElements() == 1 ||
1737       VT.getVectorElementType().getSimpleVT() == MVT::i1)
1738     return TargetLoweringBase::getPreferredVectorAction(VT);
1739   return TypeWidenVector;
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the alignment for a ByVal aggregate argument of type Ty in the caller
1811 /// parameter area. 64-bit targets use the larger of 8 and Ty's ABI alignment;
1812 /// otherwise the result is 4 unless SSE1 is available and getMaxByValAlign
1813 /// raises it for an aggregate holding a 128-bit vector.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (!Subtarget->is64Bit()) {
1816     // Start at 4 bytes and let SSE vector members raise the requirement.
1817     unsigned Result = 4;
1818     if (Subtarget->hasSSE1())
1819       getMaxByValAlign(Ty, Result);
1820     return Result;
1821   }
1822
1823   // 64-bit: the ABI alignment of the type, but never less than 8.
1824   const unsigned MinAlign = 8;
1825   const unsigned ABIAlign = TD->getABITypeAlignment(Ty);
1826   return ABIAlign > MinAlign ? ABIAlign : MinAlign;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849                                        Attribute::NoImplicitFloat)) {
1850     if (Size >= 16 &&
1851         (Subtarget->isUnalignedMemAccessFast() ||
1852          ((DstAlign == 0 || DstAlign >= 16) &&
1853           (SrcAlign == 0 || SrcAlign >= 16)))) {
1854       if (Size >= 32) {
1855         if (Subtarget->hasInt256())
1856           return MVT::v8i32;
1857         if (Subtarget->hasFp256())
1858           return MVT::v8f32;
1859       }
1860       if (Subtarget->hasSSE2())
1861         return MVT::v4i32;
1862       if (Subtarget->hasSSE1())
1863         return MVT::v4f32;
1864     } else if (!MemcpyStrSrc && Size >= 8 &&
1865                !Subtarget->is64Bit() &&
1866                Subtarget->hasSSE2()) {
1867       // Do not use f64 to lower memcpy if source is string constant. It's
1868       // better to use i32 to avoid the loads.
1869       return MVT::f64;
1870     }
1871   }
1872   if (Subtarget->is64Bit() && Size >= 8)
1873     return MVT::i64;
1874   return MVT::i32;
1875 }
1876
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878   if (VT == MVT::f32)
1879     return X86ScalarSSEf32;
1880   else if (VT == MVT::f64)
1881     return X86ScalarSSEf64;
1882   return true;
1883 }
1884
1885 bool
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887                                                   unsigned,
1888                                                   unsigned,
1889                                                   bool *Fast) const {
1890   if (Fast)
1891     *Fast = Subtarget->isUnalignedMemAccessFast();
1892   return true;
1893 }
1894
1895 /// Return the entry encoding for a jump table in the
1896 /// current function.  The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900   // symbol.
1901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902       Subtarget->isPICStyleGOT())
1903     return MachineJumpTableInfo::EK_Custom32;
1904
1905   // Otherwise, use the normal jump table encoding heuristics.
1906   return TargetLowering::getJumpTableEncoding();
1907 }
1908
1909 const MCExpr *
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911                                              const MachineBasicBlock *MBB,
1912                                              unsigned uid,MCContext &Ctx) const{
1913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914          Subtarget->isPICStyleGOT());
1915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916   // entries.
1917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919 }
1920
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923                                                     SelectionDAG &DAG) const {
1924   if (!Subtarget->is64Bit())
1925     // This doesn't have SDLoc associated with it, but is not really the
1926     // same as a Register.
1927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928   return Table;
1929 }
1930
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935                              MCContext &Ctx) const {
1936   // X86-64 uses RIP relative addressing based on the jump table label.
1937   if (Subtarget->isPICStyleRIPRel())
1938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940   // Otherwise, the reference is relative to the PIC base.
1941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942 }
1943
1944 // FIXME: Why this routine is here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947   const TargetRegisterClass *RRC = nullptr;
1948   uint8_t Cost = 1;
1949   switch (VT.SimpleTy) {
1950   default:
1951     return TargetLowering::findRepresentativeClass(VT);
1952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954     break;
1955   case MVT::x86mmx:
1956     RRC = &X86::VR64RegClass;
1957     break;
1958   case MVT::f32: case MVT::f64:
1959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960   case MVT::v4f32: case MVT::v2f64:
1961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962   case MVT::v4f64:
1963     RRC = &X86::VR128RegClass;
1964     break;
1965   }
1966   return std::make_pair(RRC, Cost);
1967 }
1968
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970                                                unsigned &Offset) const {
1971   if (!Subtarget->isTargetLinux())
1972     return false;
1973
1974   if (Subtarget->is64Bit()) {
1975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976     Offset = 0x28;
1977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978       AddressSpace = 256;
1979     else
1980       AddressSpace = 257;
1981   } else {
1982     // %gs:0x14 on i386
1983     Offset = 0x14;
1984     AddressSpace = 256;
1985   }
1986   return true;
1987 }
1988
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990                                             unsigned DestAS) const {
1991   assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993   return SrcAS < 256 && DestAS < 256;
1994 }
1995
1996 //===----------------------------------------------------------------------===//
1997 //               Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
1999
2000 #include "X86GenCallingConv.inc"
2001
2002 bool
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004                                   MachineFunction &MF, bool isVarArg,
2005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2006                         LLVMContext &Context) const {
2007   SmallVector<CCValAssign, 16> RVLocs;
2008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009   return CCInfo.CheckReturn(Outs, RetCC_X86);
2010 }
2011
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014   return ScratchRegs;
2015 }
2016
2017 SDValue
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019                                CallingConv::ID CallConv, bool isVarArg,
2020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2021                                const SmallVectorImpl<SDValue> &OutVals,
2022                                SDLoc dl, SelectionDAG &DAG) const {
2023   MachineFunction &MF = DAG.getMachineFunction();
2024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030   SDValue Flag;
2031   SmallVector<SDValue, 6> RetOps;
2032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033   // Operand #1 = Bytes To Pop
2034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035                    MVT::i16));
2036
2037   // Copy the result values into the output registers.
2038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039     CCValAssign &VA = RVLocs[i];
2040     assert(VA.isRegLoc() && "Can only return in registers!");
2041     SDValue ValToCopy = OutVals[i];
2042     EVT ValVT = ValToCopy.getValueType();
2043
2044     // Promote values to the appropriate types.
2045     if (VA.getLocInfo() == CCValAssign::SExt)
2046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047     else if (VA.getLocInfo() == CCValAssign::ZExt)
2048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049     else if (VA.getLocInfo() == CCValAssign::AExt)
2050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051     else if (VA.getLocInfo() == CCValAssign::BCvt)
2052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055            "Unexpected FP-extend for return value.");
2056
2057     // If this is x86-64, and we disabled SSE, we can't return FP values,
2058     // or SSE or MMX vectors.
2059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062       report_fatal_error("SSE register return with SSE disabled");
2063     }
2064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2065     // llvm-gcc has never done it right and no one has noticed, so this
2066     // should be OK for now.
2067     if (ValVT == MVT::f64 &&
2068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069       report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072     // the RET instruction and handled by the FP Stackifier.
2073     if (VA.getLocReg() == X86::FP0 ||
2074         VA.getLocReg() == X86::FP1) {
2075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076       // change the value to the FP stack register class.
2077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079       RetOps.push_back(ValToCopy);
2080       // Don't emit a copytoreg.
2081       continue;
2082     }
2083
2084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085     // which is returned in RAX / RDX.
2086     if (Subtarget->is64Bit()) {
2087       if (ValVT == MVT::x86mmx) {
2088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091                                   ValToCopy);
2092           // If we don't have SSE2 available, convert to v4f32 so the generated
2093           // register is legal.
2094           if (!Subtarget->hasSSE2())
2095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096         }
2097       }
2098     }
2099
2100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101     Flag = Chain.getValue(1);
2102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103   }
2104
2105   // The x86-64 ABIs require that for returning structs by value we copy
2106   // the sret argument into %rax/%eax (depending on ABI) for the return.
2107   // Win32 requires us to put the sret argument to %eax as well.
2108   // We saved the argument into a virtual register in the entry block,
2109   // so now we copy the value out and into %rax/%eax.
2110   //
2111   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2112   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2113   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2114   // either case FuncInfo->setSRetReturnReg() will have been called.
2115   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2116     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2117            "No need for an sret register");
2118     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2119
2120     unsigned RetValReg
2121         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2122           X86::RAX : X86::EAX;
2123     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2124     Flag = Chain.getValue(1);
2125
2126     // RAX/EAX now acts like a return value.
2127     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2128   }
2129
2130   RetOps[0] = Chain;  // Update chain.
2131
2132   // Add the flag if we have it.
2133   if (Flag.getNode())
2134     RetOps.push_back(Flag);
2135
2136   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2137 }
2138
2139 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2140   if (N->getNumValues() != 1)
2141     return false;
2142   if (!N->hasNUsesOfValue(1, 0))
2143     return false;
2144
2145   SDValue TCChain = Chain;
2146   SDNode *Copy = *N->use_begin();
2147   if (Copy->getOpcode() == ISD::CopyToReg) {
2148     // If the copy has a glue operand, we conservatively assume it isn't safe to
2149     // perform a tail call.
2150     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2151       return false;
2152     TCChain = Copy->getOperand(0);
2153   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2154     return false;
2155
2156   bool HasRet = false;
2157   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2158        UI != UE; ++UI) {
2159     if (UI->getOpcode() != X86ISD::RET_FLAG)
2160       return false;
2161     // If we are returning more than one value, we can definitely
2162     // not make a tail call see PR19530
2163     if (UI->getNumOperands() > 4)
2164       return false;
2165     if (UI->getNumOperands() == 4 &&
2166         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2167       return false;
2168     HasRet = true;
2169   }
2170
2171   if (!HasRet)
2172     return false;
2173
2174   Chain = TCChain;
2175   return true;
2176 }
2177
2178 EVT
2179 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2180                                             ISD::NodeType ExtendKind) const {
2181   MVT ReturnMVT;
2182   // TODO: Is this also valid on 32-bit?
2183   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2184     ReturnMVT = MVT::i8;
2185   else
2186     ReturnMVT = MVT::i32;
2187
2188   EVT MinVT = getRegisterType(Context, ReturnMVT);
2189   return VT.bitsLT(MinVT) ? MinVT : VT;
2190 }
2191
2192 /// Lower the result values of a call into the
2193 /// appropriate copies out of appropriate physical registers.
2194 ///
2195 SDValue
2196 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2197                                    CallingConv::ID CallConv, bool isVarArg,
2198                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2199                                    SDLoc dl, SelectionDAG &DAG,
2200                                    SmallVectorImpl<SDValue> &InVals) const {
2201
2202   // Assign locations to each value returned by this call.
2203   SmallVector<CCValAssign, 16> RVLocs;
2204   bool Is64Bit = Subtarget->is64Bit();
2205   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2206                  *DAG.getContext());
2207   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2208
2209   // Copy all of the result registers out of their specified physreg.
2210   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2211     CCValAssign &VA = RVLocs[i];
2212     EVT CopyVT = VA.getValVT();
2213
2214     // If this is x86-64, and we disabled SSE, we can't return FP values
2215     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2216         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2217       report_fatal_error("SSE register return with SSE disabled");
2218     }
2219
2220     // If we prefer to use the value in xmm registers, copy it out as f80 and
2221     // use a truncate to move it from fp stack reg to xmm reg.
2222     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2223         isScalarFPTypeInSSEReg(VA.getValVT()))
2224       CopyVT = MVT::f80;
2225
2226     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2227                                CopyVT, InFlag).getValue(1);
2228     SDValue Val = Chain.getValue(0);
2229
2230     if (CopyVT != VA.getValVT())
2231       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2232                         // This truncation won't change the value.
2233                         DAG.getIntPtrConstant(1));
2234
2235     InFlag = Chain.getValue(2);
2236     InVals.push_back(Val);
2237   }
2238
2239   return Chain;
2240 }
2241
2242 //===----------------------------------------------------------------------===//
2243 //                C & StdCall & Fast Calling Convention implementation
2244 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack. Symbols are also decorated
//  in a convention-specific way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2251
/// Classification of struct-return (sret) handling, as produced by
/// callIsStructReturn for calls and argsAreStructReturn for functions.
enum StructReturnType {
  /// There are no arguments, or the first argument is not flagged sret.
  NotStructReturn,
  /// The first argument is flagged both sret and inreg.
  RegStructReturn,
  /// The first argument is flagged sret but not inreg.
  StackStructReturn
};
2259 static StructReturnType
2260 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2261   if (Outs.empty())
2262     return NotStructReturn;
2263
2264   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2265   if (!Flags.isSRet())
2266     return NotStructReturn;
2267   if (Flags.isInReg())
2268     return RegStructReturn;
2269   return StackStructReturn;
2270 }
2271
2272 /// Determines whether a function uses struct return semantics.
2273 static StructReturnType
2274 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2275   if (Ins.empty())
2276     return NotStructReturn;
2277
2278   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2279   if (!Flags.isSRet())
2280     return NotStructReturn;
2281   if (Flags.isInReg())
2282     return RegStructReturn;
2283   return StackStructReturn;
2284 }
2285
2286 /// Make a copy of an aggregate at address specified by "Src" to address
2287 /// "Dst" with size and alignment information specified by the specific
2288 /// parameter attribute. The copy will be passed as a byval function parameter.
2289 static SDValue
2290 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2291                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2292                           SDLoc dl) {
2293   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2294
2295   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2296                        /*isVolatile*/false, /*AlwaysInline=*/true,
2297                        MachinePointerInfo(), MachinePointerInfo());
2298 }
2299
2300 /// Return true if the calling convention is one that
2301 /// supports tail call optimization.
2302 static bool IsTailCallConvention(CallingConv::ID CC) {
2303   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2304           CC == CallingConv::HiPE);
2305 }
2306
2307 /// \brief Return true if the calling convention is a C calling convention.
2308 static bool IsCCallConvention(CallingConv::ID CC) {
2309   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2310           CC == CallingConv::X86_64_SysV);
2311 }
2312
2313 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2314   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2315     return false;
2316
2317   CallSite CS(CI);
2318   CallingConv::ID CalleeCC = CS.getCallingConv();
2319   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2320     return false;
2321
2322   return true;
2323 }
2324
2325 /// Return true if the function is being made into
2326 /// a tailcall target by changing its ABI.
2327 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2328                                    bool GuaranteedTailCallOpt) {
2329   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2330 }
2331
2332 SDValue
2333 X86TargetLowering::LowerMemArgument(SDValue Chain,
2334                                     CallingConv::ID CallConv,
2335                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2336                                     SDLoc dl, SelectionDAG &DAG,
2337                                     const CCValAssign &VA,
2338                                     MachineFrameInfo *MFI,
2339                                     unsigned i) const {
2340   // Create the nodes corresponding to a load from this parameter slot.
2341   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2342   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2343       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2344   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2345   EVT ValVT;
2346
2347   // If value is passed by pointer we have address passed instead of the value
2348   // itself.
2349   if (VA.getLocInfo() == CCValAssign::Indirect)
2350     ValVT = VA.getLocVT();
2351   else
2352     ValVT = VA.getValVT();
2353
2354   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2355   // changed with more analysis.
2356   // In case of tail call optimization mark all arguments mutable. Since they
2357   // could be overwritten by lowering of arguments in case of a tail call.
2358   if (Flags.isByVal()) {
2359     unsigned Bytes = Flags.getByValSize();
2360     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2361     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2362     return DAG.getFrameIndex(FI, getPointerTy());
2363   } else {
2364     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2365                                     VA.getLocMemOffset(), isImmutable);
2366     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2367     return DAG.getLoad(ValVT, dl, Chain, FIN,
2368                        MachinePointerInfo::getFixedStack(FI),
2369                        false, false, false, 0);
2370   }
2371 }
2372
2373 // FIXME: Get this from tablegen.
2374 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2375                                                 const X86Subtarget *Subtarget) {
2376   assert(Subtarget->is64Bit());
2377
2378   if (Subtarget->isCallingConvWin64(CallConv)) {
2379     static const MCPhysReg GPR64ArgRegsWin64[] = {
2380       X86::RCX, X86::RDX, X86::R8,  X86::R9
2381     };
2382     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2383   }
2384
2385   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2386     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2387   };
2388   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2389 }
2390
2391 // FIXME: Get this from tablegen.
2392 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2393                                                 CallingConv::ID CallConv,
2394                                                 const X86Subtarget *Subtarget) {
2395   assert(Subtarget->is64Bit());
2396   if (Subtarget->isCallingConvWin64(CallConv)) {
2397     // The XMM registers which might contain var arg parameters are shadowed
2398     // in their paired GPR.  So we only need to save the GPR to their home
2399     // slots.
2400     // TODO: __vectorcall will change this.
2401     return None;
2402   }
2403
2404   const Function *Fn = MF.getFunction();
2405   bool NoImplicitFloatOps = Fn->getAttributes().
2406       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2407   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2408          "SSE register cannot be used when SSE is disabled!");
2409   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2410       !Subtarget->hasSSE1())
2411     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2412     // registers.
2413     return None;
2414
2415   static const MCPhysReg XMMArgRegs64Bit[] = {
2416     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2417     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2418   };
2419   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2420 }
2421
2422 SDValue
2423 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2424                                         CallingConv::ID CallConv,
2425                                         bool isVarArg,
2426                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2427                                         SDLoc dl,
2428                                         SelectionDAG &DAG,
2429                                         SmallVectorImpl<SDValue> &InVals)
2430                                           const {
2431   MachineFunction &MF = DAG.getMachineFunction();
2432   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2433
2434   const Function* Fn = MF.getFunction();
2435   if (Fn->hasExternalLinkage() &&
2436       Subtarget->isTargetCygMing() &&
2437       Fn->getName() == "main")
2438     FuncInfo->setForceFramePointer(true);
2439
2440   MachineFrameInfo *MFI = MF.getFrameInfo();
2441   bool Is64Bit = Subtarget->is64Bit();
2442   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2443
2444   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2445          "Var args not supported with calling convention fastcc, ghc or hipe");
2446
2447   // Assign locations to all of the incoming arguments.
2448   SmallVector<CCValAssign, 16> ArgLocs;
2449   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2450
2451   // Allocate shadow area for Win64
2452   if (IsWin64)
2453     CCInfo.AllocateStack(32, 8);
2454
2455   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2456
2457   unsigned LastVal = ~0U;
2458   SDValue ArgValue;
2459   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2460     CCValAssign &VA = ArgLocs[i];
2461     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2462     // places.
2463     assert(VA.getValNo() != LastVal &&
2464            "Don't support value assigned to multiple locs yet");
2465     (void)LastVal;
2466     LastVal = VA.getValNo();
2467
2468     if (VA.isRegLoc()) {
2469       EVT RegVT = VA.getLocVT();
2470       const TargetRegisterClass *RC;
2471       if (RegVT == MVT::i32)
2472         RC = &X86::GR32RegClass;
2473       else if (Is64Bit && RegVT == MVT::i64)
2474         RC = &X86::GR64RegClass;
2475       else if (RegVT == MVT::f32)
2476         RC = &X86::FR32RegClass;
2477       else if (RegVT == MVT::f64)
2478         RC = &X86::FR64RegClass;
2479       else if (RegVT.is512BitVector())
2480         RC = &X86::VR512RegClass;
2481       else if (RegVT.is256BitVector())
2482         RC = &X86::VR256RegClass;
2483       else if (RegVT.is128BitVector())
2484         RC = &X86::VR128RegClass;
2485       else if (RegVT == MVT::x86mmx)
2486         RC = &X86::VR64RegClass;
2487       else if (RegVT == MVT::i1)
2488         RC = &X86::VK1RegClass;
2489       else if (RegVT == MVT::v8i1)
2490         RC = &X86::VK8RegClass;
2491       else if (RegVT == MVT::v16i1)
2492         RC = &X86::VK16RegClass;
2493       else if (RegVT == MVT::v32i1)
2494         RC = &X86::VK32RegClass;
2495       else if (RegVT == MVT::v64i1)
2496         RC = &X86::VK64RegClass;
2497       else
2498         llvm_unreachable("Unknown argument type!");
2499
2500       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2501       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2502
2503       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2504       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2505       // right size.
2506       if (VA.getLocInfo() == CCValAssign::SExt)
2507         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2508                                DAG.getValueType(VA.getValVT()));
2509       else if (VA.getLocInfo() == CCValAssign::ZExt)
2510         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2511                                DAG.getValueType(VA.getValVT()));
2512       else if (VA.getLocInfo() == CCValAssign::BCvt)
2513         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2514
2515       if (VA.isExtInLoc()) {
2516         // Handle MMX values passed in XMM regs.
2517         if (RegVT.isVector())
2518           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2519         else
2520           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2521       }
2522     } else {
2523       assert(VA.isMemLoc());
2524       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2525     }
2526
2527     // If value is passed via pointer - do a load.
2528     if (VA.getLocInfo() == CCValAssign::Indirect)
2529       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2530                              MachinePointerInfo(), false, false, false, 0);
2531
2532     InVals.push_back(ArgValue);
2533   }
2534
2535   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2536     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2537       // The x86-64 ABIs require that for returning structs by value we copy
2538       // the sret argument into %rax/%eax (depending on ABI) for the return.
2539       // Win32 requires us to put the sret argument to %eax as well.
2540       // Save the argument into a virtual register so that we can access it
2541       // from the return points.
2542       if (Ins[i].Flags.isSRet()) {
2543         unsigned Reg = FuncInfo->getSRetReturnReg();
2544         if (!Reg) {
2545           MVT PtrTy = getPointerTy();
2546           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2547           FuncInfo->setSRetReturnReg(Reg);
2548         }
2549         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2550         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2551         break;
2552       }
2553     }
2554   }
2555
2556   unsigned StackSize = CCInfo.getNextStackOffset();
2557   // Align stack specially for tail calls.
2558   if (FuncIsMadeTailCallSafe(CallConv,
2559                              MF.getTarget().Options.GuaranteedTailCallOpt))
2560     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2561
2562   // If the function takes variable number of arguments, make a frame index for
2563   // the start of the first vararg value... for expansion of llvm.va_start. We
2564   // can skip this if there are no va_start calls.
2565   if (MFI->hasVAStart() &&
2566       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2567                    CallConv != CallingConv::X86_ThisCall))) {
2568     FuncInfo->setVarArgsFrameIndex(
2569         MFI->CreateFixedObject(1, StackSize, true));
2570   }
2571
2572   // Figure out if XMM registers are in use.
2573   assert(!(MF.getTarget().Options.UseSoftFloat &&
2574            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2575                                             Attribute::NoImplicitFloat)) &&
2576          "SSE register cannot be used when SSE is disabled!");
2577
2578   // 64-bit calling conventions support varargs and register parameters, so we
2579   // have to do extra work to spill them in the prologue.
2580   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2581     // Find the first unallocated argument registers.
2582     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2583     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2584     unsigned NumIntRegs =
2585         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2586     unsigned NumXMMRegs =
2587         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2588     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2589            "SSE register cannot be used when SSE is disabled!");
2590
2591     // Gather all the live in physical registers.
2592     SmallVector<SDValue, 6> LiveGPRs;
2593     SmallVector<SDValue, 8> LiveXMMRegs;
2594     SDValue ALVal;
2595     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2596       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2597       LiveGPRs.push_back(
2598           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2599     }
2600     if (!ArgXMMs.empty()) {
2601       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2602       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2603       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2604         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2605         LiveXMMRegs.push_back(
2606             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2607       }
2608     }
2609
2610     if (IsWin64) {
2611       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2612       // Get to the caller-allocated home save location.  Add 8 to account
2613       // for the return address.
2614       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2615       FuncInfo->setRegSaveFrameIndex(
2616           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2617       // Fixup to set vararg frame on shadow area (4 x i64).
2618       if (NumIntRegs < 4)
2619         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2620     } else {
2621       // For X86-64, if there are vararg parameters that are passed via
2622       // registers, then we must store them to their spots on the stack so
2623       // they may be loaded by dereferencing the result of va_next.
2624       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2625       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2626       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2627           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2628     }
2629
2630     // Store the integer parameter registers.
2631     SmallVector<SDValue, 8> MemOps;
2632     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2633                                       getPointerTy());
2634     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2635     for (SDValue Val : LiveGPRs) {
2636       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2637                                 DAG.getIntPtrConstant(Offset));
2638       SDValue Store =
2639         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2640                      MachinePointerInfo::getFixedStack(
2641                        FuncInfo->getRegSaveFrameIndex(), Offset),
2642                      false, false, 0);
2643       MemOps.push_back(Store);
2644       Offset += 8;
2645     }
2646
2647     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2648       // Now store the XMM (fp + vector) parameter registers.
2649       SmallVector<SDValue, 12> SaveXMMOps;
2650       SaveXMMOps.push_back(Chain);
2651       SaveXMMOps.push_back(ALVal);
2652       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2653                              FuncInfo->getRegSaveFrameIndex()));
2654       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2655                              FuncInfo->getVarArgsFPOffset()));
2656       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2657                         LiveXMMRegs.end());
2658       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2659                                    MVT::Other, SaveXMMOps));
2660     }
2661
2662     if (!MemOps.empty())
2663       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2664   }
2665
2666   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2667     // Find the largest legal vector type.
2668     MVT VecVT = MVT::Other;
2669     // FIXME: Only some x86_32 calling conventions support AVX512.
2670     if (Subtarget->hasAVX512() &&
2671         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2672                      CallConv == CallingConv::Intel_OCL_BI)))
2673       VecVT = MVT::v16f32;
2674     else if (Subtarget->hasAVX())
2675       VecVT = MVT::v8f32;
2676     else if (Subtarget->hasSSE2())
2677       VecVT = MVT::v4f32;
2678
2679     // We forward some GPRs and some vector types.
2680     SmallVector<MVT, 2> RegParmTypes;
2681     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2682     RegParmTypes.push_back(IntVT);
2683     if (VecVT != MVT::Other)
2684       RegParmTypes.push_back(VecVT);
2685
2686     // Compute the set of forwarded registers. The rest are scratch.
2687     SmallVectorImpl<ForwardedRegister> &Forwards =
2688         FuncInfo->getForwardedMustTailRegParms();
2689     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2690
2691     // Conservatively forward AL on x86_64, since it might be used for varargs.
2692     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2693       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2694       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2695     }
2696
2697     // Copy all forwards from physical to virtual registers.
2698     for (ForwardedRegister &F : Forwards) {
2699       // FIXME: Can we use a less constrained schedule?
2700       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2701       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2702       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2703     }
2704   }
2705
2706   // Some CCs need callee pop.
2707   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2708                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2709     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2710   } else {
2711     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2712     // If this is an sret function, the return should pop the hidden pointer.
2713     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2714         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2715         argsAreStructReturn(Ins) == StackStructReturn)
2716       FuncInfo->setBytesToPopOnReturn(4);
2717   }
2718
2719   if (!Is64Bit) {
2720     // RegSaveFrameIndex is X86-64 only.
2721     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2722     if (CallConv == CallingConv::X86_FastCall ||
2723         CallConv == CallingConv::X86_ThisCall)
2724       // fastcc functions can't have varargs.
2725       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2726   }
2727
2728   FuncInfo->setArgumentStackSize(StackSize);
2729
2730   return Chain;
2731 }
2732
2733 SDValue
2734 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2735                                     SDValue StackPtr, SDValue Arg,
2736                                     SDLoc dl, SelectionDAG &DAG,
2737                                     const CCValAssign &VA,
2738                                     ISD::ArgFlagsTy Flags) const {
2739   unsigned LocMemOffset = VA.getLocMemOffset();
2740   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2741   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2742   if (Flags.isByVal())
2743     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2744
2745   return DAG.getStore(Chain, dl, Arg, PtrOff,
2746                       MachinePointerInfo::getStack(LocMemOffset),
2747                       false, false, 0);
2748 }
2749
2750 /// Emit a load of return address if tail call
2751 /// optimization is performed and it is required.
2752 SDValue
2753 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2754                                            SDValue &OutRetAddr, SDValue Chain,
2755                                            bool IsTailCall, bool Is64Bit,
2756                                            int FPDiff, SDLoc dl) const {
2757   // Adjust the Return address stack slot.
2758   EVT VT = getPointerTy();
2759   OutRetAddr = getReturnAddressFrameIndex(DAG);
2760
2761   // Load the "old" Return address.
2762   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2763                            false, false, false, 0);
2764   return SDValue(OutRetAddr.getNode(), 1);
2765 }
2766
2767 /// Emit a store of the return address if tail call
2768 /// optimization is performed and it is required (FPDiff!=0).
2769 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2770                                         SDValue Chain, SDValue RetAddrFrIdx,
2771                                         EVT PtrVT, unsigned SlotSize,
2772                                         int FPDiff, SDLoc dl) {
2773   // Store the return address to the appropriate stack slot.
2774   if (!FPDiff) return Chain;
2775   // Calculate the new stack slot for the return address.
2776   int NewReturnAddrFI =
2777     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2778                                          false);
2779   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2780   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2781                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2782                        false, false, 0);
2783   return Chain;
2784 }
2785
/// Lower an outgoing call described by \p CLI into SelectionDAG nodes.
///
/// The lowering proceeds in the order the code below is written:
///   1. Decide whether the call is emitted as a tail call (musttail calls are
///      forced; other candidates go through
///      IsEligibleForTailCallOptimization) and whether it is a sibcall.
///   2. Assign a location to every outgoing operand with CC_X86, compute the
///      number of stack bytes, and open the call sequence with CALLSEQ_START
///      unless this is a sibcall.
///   3. Extend/convert each argument as its CCValAssign requires and either
///      queue it in RegsToPass or store it to the outgoing argument area.
///   4. Add the implicit register operands (EBX for GOT-style PIC, AL for
///      64-bit non-Win64 varargs, forwarded registers for varargs musttail).
///   5. For non-sibcall tail calls, store memory arguments to fixed stack
///      slots and store the return address to its new slot.
///   6. Turn the callee into a target node where possible and emit either
///      X86ISD::TC_RETURN (tail call) or X86ISD::CALL followed by CALLSEQ_END
///      and LowerCallResult.
///
/// \param CLI    Call description. CLI.IsTailCall is updated in place to
///               reflect the final tail-call decision.
/// \param InVals Passed through to LowerCallResult for the call's results.
/// \returns the TC_RETURN node for tail calls, otherwise the value returned by
///          LowerCallResult.
2786 SDValue
2787 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2788                              SmallVectorImpl<SDValue> &InVals) const {
2789   SelectionDAG &DAG                     = CLI.DAG;
2790   SDLoc &dl                             = CLI.DL;
2791   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2792   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2793   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2794   SDValue Chain                         = CLI.Chain;
2795   SDValue Callee                        = CLI.Callee;
2796   CallingConv::ID CallConv              = CLI.CallConv;
2797   bool &isTailCall                      = CLI.IsTailCall;
2798   bool isVarArg                         = CLI.IsVarArg;
2799
2800   MachineFunction &MF = DAG.getMachineFunction();
2801   bool Is64Bit        = Subtarget->is64Bit();
2802   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2803   StructReturnType SR = callIsStructReturn(Outs);
2804   bool IsSibcall      = false;
2805   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2806
  // The DisableTailCalls target option vetoes a tail call before any
  // eligibility analysis is done.
2807   if (MF.getTarget().Options.DisableTailCalls)
2808     isTailCall = false;
2809
2810   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2811   if (IsMustTail) {
2812     // Force this to be a tail call.  The verifier rules are enough to ensure
2813     // that we can lower this successfully without moving the return address
2814     // around.
2815     isTailCall = true;
2816   } else if (isTailCall) {
2817     // Check if it's really possible to do a tail call.
2818     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2819                     isVarArg, SR != NotStructReturn,
2820                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2821                     Outs, OutVals, Ins, DAG);
2822
2823     // Sibcalls are automatically detected tailcalls which do not require
2824     // ABI changes.
2825     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2826       IsSibcall = true;
2827
2828     if (isTailCall)
2829       ++NumTailCalls;
2830   }
2831
2832   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2833          "Var args not supported with calling convention fastcc, ghc or hipe");
2834
2835   // Analyze operands of the call, assigning locations to each operand.
2836   SmallVector<CCValAssign, 16> ArgLocs;
2837   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2838
2839   // Allocate shadow area for Win64
2840   if (IsWin64)
2841     CCInfo.AllocateStack(32, 8);
2842
2843   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844
2845   // Get a count of how many bytes are to be pushed on the stack.
2846   unsigned NumBytes = CCInfo.getNextStackOffset();
2847   if (IsSibcall)
2848     // This is a sibcall. The memory operands are available in caller's
2849     // own caller's stack.
2850     NumBytes = 0;
2851   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2852            IsTailCallConvention(CallConv))
2853     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2854
  // FPDiff stays 0 for sibcalls, musttail calls and ordinary calls; it is only
  // computed for the remaining tail calls.
2855   int FPDiff = 0;
2856   if (isTailCall && !IsSibcall && !IsMustTail) {
2857     // Lower arguments at fp - stackoffset + fpdiff.
2858     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2859
2860     FPDiff = NumBytesCallerPushed - NumBytes;
2861
2862     // Set the delta of movement of the returnaddr stackslot.
2863     // But only set it if the new delta is smaller than the recorded one.
2864     if (FPDiff < X86Info->getTCReturnAddrDelta())
2865       X86Info->setTCReturnAddrDelta(FPDiff);
2866   }
2867
  // NumBytesToPush feeds CALLSEQ_START and NumBytesToPop feeds CALLSEQ_END;
  // the two only differ for inalloca calls, handled right below.
2868   unsigned NumBytesToPush = NumBytes;
2869   unsigned NumBytesToPop = NumBytes;
2870
2871   // If we have an inalloca argument, all stack space has already been allocated
2872   // for us and is right at the top of the stack.  We don't support multiple
2873   // arguments passed in memory when using inalloca.
2874   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2875     NumBytesToPush = 0;
2876     if (!ArgLocs.back().isMemLoc())
2877       report_fatal_error("cannot use inalloca attribute on a register "
2878                          "parameter");
2879     if (ArgLocs.back().getLocMemOffset() != 0)
2880       report_fatal_error("any parameter with the inalloca attribute must be "
2881                          "the only memory argument");
2882   }
2883
2884   if (!IsSibcall)
2885     Chain = DAG.getCALLSEQ_START(
2886         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2887
2888   SDValue RetAddrFrIdx;
2889   // Load return address for tail calls.
2890   if (isTailCall && FPDiff)
2891     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2892                                     Is64Bit, FPDiff, dl);
2893
2894   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2895   SmallVector<SDValue, 8> MemOpChains;
2896   SDValue StackPtr;
2897
2898   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2899   // of tail call optimization, arguments are handled later.
2900   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2901   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902     // Skip inalloca arguments, they have already been written.
2903     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904     if (Flags.isInAlloca())
2905       continue;
2906
2907     CCValAssign &VA = ArgLocs[i];
2908     EVT RegVT = VA.getLocVT();
2909     SDValue Arg = OutVals[i];
2910     bool isByVal = Flags.isByVal();
2911
2912     // Promote the value if needed.
2913     switch (VA.getLocInfo()) {
2914     default: llvm_unreachable("Unknown loc info!");
2915     case CCValAssign::Full: break;
2916     case CCValAssign::SExt:
2917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918       break;
2919     case CCValAssign::ZExt:
2920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921       break;
2922     case CCValAssign::AExt:
2923       if (RegVT.is128BitVector()) {
2924         // Special case: passing MMX values in XMM registers.
2925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928       } else
2929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930       break;
2931     case CCValAssign::BCvt:
2932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933       break;
2934     case CCValAssign::Indirect: {
2935       // Store the argument.
      // The value is spilled to a stack temporary and the address of that
      // temporary is what gets passed from here on.
2936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939                            MachinePointerInfo::getFixedStack(FI),
2940                            false, false, 0);
2941       Arg = SpillSlot;
2942       break;
2943     }
2944     }
2945
2946     if (VA.isRegLoc()) {
2947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948       if (isVarArg && IsWin64) {
2949         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2950         // shadow reg if callee is a varargs function.
2951         unsigned ShadowReg = 0;
2952         switch (VA.getLocReg()) {
2953         case X86::XMM0: ShadowReg = X86::RCX; break;
2954         case X86::XMM1: ShadowReg = X86::RDX; break;
2955         case X86::XMM2: ShadowReg = X86::R8; break;
2956         case X86::XMM3: ShadowReg = X86::R9; break;
2957         }
2958         if (ShadowReg)
2959           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960       }
2961     } else if (!IsSibcall && (!isTailCall || isByVal)) {
      // Memory argument of a non-tail call, or a byval argument of a
      // non-sibcall tail call: write it to the outgoing argument area now.
2962       assert(VA.isMemLoc());
2963       if (!StackPtr.getNode())
2964         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965                                       getPointerTy());
2966       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967                                              dl, DAG, VA, Flags));
2968     }
2969   }
2970
2971   if (!MemOpChains.empty())
2972     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974   if (Subtarget->isPICStyleGOT()) {
2975     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976     // GOT pointer.
2977     if (!isTailCall) {
2978       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980     } else {
2981       // If we are tail calling and generating PIC/GOT style code load the
2982       // address of the callee into ECX. The value in ecx is used as target of
2983       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984       // for tail calls on PIC/GOT architectures. Normally we would just put the
2985       // address of GOT into ebx and then call target@PLT. But for tail calls
2986       // ebx would be restored (since ebx is callee saved) before jumping to the
2987       // target@PLT.
2988
2989       // Note: The actual moving to ECX is done further down.
2990       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992           !G->getGlobal()->hasProtectedVisibility())
2993         Callee = LowerGlobalAddress(Callee, DAG);
2994       else if (isa<ExternalSymbolSDNode>(Callee))
2995         Callee = LowerExternalSymbol(Callee, DAG);
2996     }
2997   }
2998
2999   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000     // From AMD64 ABI document:
3001     // For calls that may call functions that use varargs or stdargs
3002     // (prototype-less calls or calls to functions containing ellipsis (...) in
3003     // the declaration) %al is used as hidden argument to specify the number
3004     // of SSE registers used. The contents of %al do not need to match exactly
3005     // the number of registers, but must be an upper bound on the number of SSE
3006     // registers used and is in the range 0 - 8 inclusive.
3007
3008     // Count the number of XMM registers allocated.
3009     static const MCPhysReg XMMArgRegs[] = {
3010       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012     };
3013     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015            && "SSE registers cannot be used when SSE is disabled");
3016
3017     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3019   }
3020
  // For a varargs musttail call, pass along every register recorded in the
  // function's forwarded-musttail list by copying it out of its virtual
  // register and queueing it for its physical register.
3021   if (isVarArg && IsMustTail) {
3022     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023     for (const auto &F : Forwards) {
3024       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026     }
3027   }
3028
3029   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3030   // don't need this because the eligibility check rejects calls that require
3031   // shuffling arguments passed in memory.
3032   if (!IsSibcall && isTailCall) {
3033     // Force all the incoming stack arguments to be loaded from the stack
3034     // before any new outgoing arguments are stored to the stack, because the
3035     // outgoing stack slots may alias the incoming argument stack slots, and
3036     // the alias isn't otherwise explicit. This is slightly more conservative
3037     // than necessary, because it means that each store effectively depends
3038     // on every argument instead of just those arguments it would clobber.
3039     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041     SmallVector<SDValue, 8> MemOpChains2;
3042     SDValue FIN;
3043     int FI = 0;
3044     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045       CCValAssign &VA = ArgLocs[i];
3046       if (VA.isRegLoc())
3047         continue;
3048       assert(VA.isMemLoc());
      // NOTE(review): this reads OutVals[i] directly, so any extension or
      // bitcast applied to the local Arg in the first argument loop is not
      // reflected here -- TODO confirm this is intended.
3049       SDValue Arg = OutVals[i];
3050       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051       // Skip inalloca arguments.  They don't require any work.
3052       if (Flags.isInAlloca())
3053         continue;
3054       // Create frame index.
3055       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058       FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060       if (Flags.isByVal()) {
3061         // Copy relative to framepointer.
3062         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063         if (!StackPtr.getNode())
3064           StackPtr = DAG.getCopyFromReg(Chain, dl,
3065                                         RegInfo->getStackRegister(),
3066                                         getPointerTy());
3067         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070                                                          ArgChain,
3071                                                          Flags, DAG, dl));
3072       } else {
3073         // Store relative to framepointer.
3074         MemOpChains2.push_back(
3075           DAG.getStore(ArgChain, dl, Arg, FIN,
3076                        MachinePointerInfo::getFixedStack(FI),
3077                        false, false, 0));
3078       }
3079     }
3080
3081     if (!MemOpChains2.empty())
3082       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084     // Store the return address to the appropriate stack slot.
3085     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086                                      getPointerTy(), RegInfo->getSlotSize(),
3087                                      FPDiff, dl);
3088   }
3089
3090   // Build a sequence of copy-to-reg nodes chained together with token chain
3091   // and flag operands which copy the outgoing args into registers.
3092   SDValue InFlag;
3093   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095                              RegsToPass[i].second, InFlag);
3096     InFlag = Chain.getValue(1);
3097   }
3098
3099   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101     // In the 64-bit large code model, we have to make all calls
3102     // through a register, since the call instruction's 32-bit
3103     // pc-relative offset may not be large enough to hold the whole
3104     // address.
3105   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106     // If the callee is a GlobalAddress node (quite common, every direct call
3107     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108     // it.
3109     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111     // We should use extra load for direct calls to dllimported functions in
3112     // non-JIT mode.
3113     const GlobalValue *GV = G->getGlobal();
3114     if (!GV->hasDLLImportStorageClass()) {
3115       unsigned char OpFlags = 0;
3116       bool ExtraLoad = false;
3117       unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120       // external symbols must go through the PLT in PIC mode.  If the symbol
3121       // has hidden or protected visibility, or if it is static or local, then
3122       // we don't need to use the PLT - we can directly call it.
3123       if (Subtarget->isTargetELF() &&
3124           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126         OpFlags = X86II::MO_PLT;
3127       } else if (Subtarget->isPICStyleStubAny() &&
3128                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129                  (!Subtarget->getTargetTriple().isMacOSX() ||
3130                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131         // PC-relative references to external symbols should go through $stub,
3132         // unless we're building with the leopard linker or later, which
3133         // automatically synthesizes these stubs.
3134         OpFlags = X86II::MO_DARWIN_STUB;
3135       } else if (Subtarget->isPICStyleRIPRel() &&
3136                  isa<Function>(GV) &&
3137                  cast<Function>(GV)->getAttributes().
3138                    hasAttribute(AttributeSet::FunctionIndex,
3139                                 Attribute::NonLazyBind)) {
3140         // If the function is marked as non-lazy, generate an indirect call
3141         // which loads from the GOT directly. This avoids runtime overhead
3142         // at the cost of eager binding (and one extra byte of encoding).
3143         OpFlags = X86II::MO_GOTPCREL;
3144         WrapperKind = X86ISD::WrapperRIP;
3145         ExtraLoad = true;
3146       }
3147
3148       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149                                           G->getOffset(), OpFlags);
3150
3151       // Add a wrapper if needed.
      // NOTE(review): the opcode below is hard-coded rather than taken from
      // WrapperKind; the two agree because X86ISD::WrapperRIP is the only
      // kind assigned above.
3152       if (WrapperKind != ISD::DELETED_NODE)
3153         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154       // Add extra indirection if needed.
3155       if (ExtraLoad)
3156         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157                              MachinePointerInfo::getGOT(),
3158                              false, false, false, 0);
3159     }
3160   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161     unsigned char OpFlags = 0;
3162
3163     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164     // external symbols should go through the PLT.
3165     if (Subtarget->isTargetELF() &&
3166         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167       OpFlags = X86II::MO_PLT;
3168     } else if (Subtarget->isPICStyleStubAny() &&
3169                (!Subtarget->getTargetTriple().isMacOSX() ||
3170                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171       // PC-relative references to external symbols should go through $stub,
3172       // unless we're building with the leopard linker or later, which
3173       // automatically synthesizes these stubs.
3174       OpFlags = X86II::MO_DARWIN_STUB;
3175     }
3176
3177     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178                                          OpFlags);
3179   } else if (Subtarget->isTarget64BitILP32() &&
3180              Callee->getValueType(0) == MVT::i32) {
3181     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3182     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3183   }
3184
3185   // Returns a chain & a flag for retval copy to use.
3186   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3187   SmallVector<SDValue, 8> Ops;
3188
  // For a non-sibcall tail call the call sequence is closed here, ahead of
  // the TC_RETURN node, because this function returns as soon as that node
  // has been built.
3189   if (!IsSibcall && isTailCall) {
3190     Chain = DAG.getCALLSEQ_END(Chain,
3191                                DAG.getIntPtrConstant(NumBytesToPop, true),
3192                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3193     InFlag = Chain.getValue(1);
3194   }
3195
3196   Ops.push_back(Chain);
3197   Ops.push_back(Callee);
3198
  // Tail calls carry FPDiff as an extra operand right after the callee.
3199   if (isTailCall)
3200     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3201
3202   // Add argument registers to the end of the list so that they are known live
3203   // into the call.
3204   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3205     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3206                                   RegsToPass[i].second.getValueType()));
3207
3208   // Add a register mask operand representing the call-preserved registers.
3209   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3210   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3211   assert(Mask && "Missing call preserved mask for calling convention");
3212   Ops.push_back(DAG.getRegisterMask(Mask));
3213
3214   if (InFlag.getNode())
3215     Ops.push_back(InFlag);
3216
3217   if (isTailCall) {
3218     // We used to do:
3219     //// If this is the first return lowered for this function, add the regs
3220     //// to the liveout set for the function.
3221     // This isn't right, although it's probably harmless on x86; liveouts
3222     // should be computed from returns not tail calls.  Consider a void
3223     // function making a tail call to a function returning int.
3224     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3225   }
3226
3227   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3228   InFlag = Chain.getValue(1);
3229
3230   // Create the CALLSEQ_END node.
3231   unsigned NumBytesForCalleeToPop;
3232   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3233                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3234     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3235   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3236            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3237            SR == StackStructReturn)
3238     // If this is a call to a struct-return function, the callee
3239     // pops the hidden struct pointer, so we have to push it back.
3240     // This is common for Darwin/X86, Linux & Mingw32 targets.
3241     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3242     NumBytesForCalleeToPop = 4;
3243   else
3244     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3245
3246   // Returns a flag for retval copy to use.
3247   if (!IsSibcall) {
3248     Chain = DAG.getCALLSEQ_END(Chain,
3249                                DAG.getIntPtrConstant(NumBytesToPop, true),
3250                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3251                                                      true),
3252                                InFlag, dl);
3253     InFlag = Chain.getValue(1);
3254   }
3255
3256   // Handle result values, copying them out of physregs into vregs that we
3257   // return.
3258   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3259                          Ins, dl, DAG, InVals);
3260 }
3261
3262 //===----------------------------------------------------------------------===//
3263 //                Fast Calling Convention (tail call) implementation
3264 //===----------------------------------------------------------------------===//
3265
3266 //  Like stdcall, the callee cleans up the arguments, except that ECX is
3267 //  reserved for storing the tail called function address. Only 2 registers are
3268 //  free for argument passing (inreg). Tail call optimization is performed
3269 //  provided:
3270 //                * tailcallopt is enabled
3271 //                * caller/callee are fastcc
3272 //  On X86_64 architecture with GOT-style position independent code only local
3273 //  (within module) calls are supported at the moment.
3274 //  To keep the stack aligned according to platform abi the function
3275 //  GetAlignedArgumentStackSize ensures that argument delta is always a multiple
3276 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3277 //  If the tail called callee has more arguments than the caller, the
3278 //  caller needs to make sure that there is room to move the RETADDR to. This is
3279 //  achieved by reserving an area the size of the argument delta right after the
3280 //  original RETADDR, but before the saved framepointer or the spilled registers
3281 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3282 //  stack layout:
3283 //    arg1
3284 //    arg2
3285 //    RETADDR
3286 //    [ new RETADDR
3287 //      move area ]
3288 //    (possible EBP)
3289 //    ESI
3290 //    EDI
3291 //    local1 ..
3292
3293 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3294 /// for a 16 byte align requirement.
3295 unsigned
3296 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3297                                                SelectionDAG& DAG) const {
3298   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3299   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3300   unsigned StackAlignment = TFI.getStackAlignment();
3301   uint64_t AlignMask = StackAlignment - 1;
3302   int64_t Offset = StackSize;
3303   unsigned SlotSize = RegInfo->getSlotSize();
3304   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3305     // Number smaller than 12 so just add the difference.
3306     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3307   } else {
3308     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3309     Offset = ((~AlignMask) & Offset) + StackAlignment +
3310       (StackAlignment-SlotSize);
3311   }
3312   return Offset;
3313 }
3314
3315 /// MatchingStackOffset - Return true if the given stack call argument is
3316 /// already available in the same position (relatively) of the caller's
3317 /// incoming argument stack.
3318 static
3319 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3320                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3321                          const X86InstrInfo *TII) {
3322   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3323   int FI = INT_MAX;
3324   if (Arg.getOpcode() == ISD::CopyFromReg) {
3325     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3326     if (!TargetRegisterInfo::isVirtualRegister(VR))
3327       return false;
3328     MachineInstr *Def = MRI->getVRegDef(VR);
3329     if (!Def)
3330       return false;
3331     if (!Flags.isByVal()) {
3332       if (!TII->isLoadFromStackSlot(Def, FI))
3333         return false;
3334     } else {
3335       unsigned Opcode = Def->getOpcode();
3336       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3337            Opcode == X86::LEA64_32r) &&
3338           Def->getOperand(1).isFI()) {
3339         FI = Def->getOperand(1).getIndex();
3340         Bytes = Flags.getByValSize();
3341       } else
3342         return false;
3343     }
3344   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3345     if (Flags.isByVal())
3346       // ByVal argument is passed in as a pointer but it's now being
3347       // dereferenced. e.g.
3348       // define @foo(%struct.X* %A) {
3349       //   tail call @bar(%struct.X* byval %A)
3350       // }
3351       return false;
3352     SDValue Ptr = Ld->getBasePtr();
3353     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3354     if (!FINode)
3355       return false;
3356     FI = FINode->getIndex();
3357   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3358     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3359     FI = FINode->getIndex();
3360     Bytes = Flags.getByValSize();
3361   } else
3362     return false;
3363
3364   assert(FI != INT_MAX);
3365   if (!MFI->isFixedObjectIndex(FI))
3366     return false;
3367   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3368 }
3369
3370 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3371 /// for tail call optimization. Targets which want to do tail call
3372 /// optimization should implement this function.
     ///
     /// When GuaranteedTailCallOpt is set the decision depends only on the caller
     /// and callee calling conventions. Otherwise the call is accepted only if a
     /// series of conservative "sibcall" checks pass: no dynamic stack
     /// realignment, no struct-return on either side, compatible return value
     /// locations, stack arguments that already sit in the caller's matching fixed
     /// stack objects, and (on 32-bit) enough of EAX/EDX/ECX left to hold the call
     /// address. Returns true if the call may be lowered as a tail call.
3373 bool
3374 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3375                                                      CallingConv::ID CalleeCC,
3376                                                      bool isVarArg,
3377                                                      bool isCalleeStructRet,
3378                                                      bool isCallerStructRet,
3379                                                      Type *RetTy,
3380                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
3381                                     const SmallVectorImpl<SDValue> &OutVals,
3382                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3383                                                      SelectionDAG &DAG) const {
       // Only callees using a tail-call convention or the C calling convention are
       // considered at all.
3384   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3385     return false;
3386
3387   // Fetch the caller; its return type and calling convention are checked below.
3388   const MachineFunction &MF = DAG.getMachineFunction();
3389   const Function *CallerF = MF.getFunction();
3390
3391   // If the function return type is x86_fp80 and the callee return type is not,
3392   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3393   // perform a tailcall optimization here.
3394   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3395     return false;
3396
3397   CallingConv::ID CallerCC = CallerF->getCallingConv();
3398   bool CCMatch = CallerCC == CalleeCC;
3399   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3400   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3401
       // If -tailcallopt is specified (GuaranteedTailCallOpt), the call is eligible
       // exactly when the callee uses a tail-call convention that matches the
       // caller's; none of the sibcall checks below are consulted.
3402   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3403     if (IsTailCallConvention(CalleeCC) && CCMatch)
3404       return true;
3405     return false;
3406   }
3407
3408   // Look for obvious safe cases to perform tail call optimization that do not
3409   // require ABI changes. This is what gcc calls sibcall.
3410
3411   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3412   // emit a special epilogue.
3413   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3414   if (RegInfo->needsStackRealignment(MF))
3415     return false;
3416
3417   // Also avoid sibcall optimization if either caller or callee uses struct
3418   // return semantics.
3419   if (isCalleeStructRet || isCallerStructRet)
3420     return false;
3421
3422   // An stdcall/thiscall caller is expected to clean up its arguments; the
3423   // callee isn't going to do that.
3424   // FIXME: this is more restrictive than needed. We could produce a tailcall
3425   // when the stack adjustment matches. For example, with a thiscall that takes
3426   // only one argument.
3427   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3428                    CallerCC == CallingConv::X86_ThisCall))
3429     return false;
3430
3431   // Do not sibcall optimize vararg calls unless all arguments are passed via
3432   // registers.
3433   if (isVarArg && !Outs.empty()) {
3434
3435     // Optimizing for varargs on Win64 is unlikely to be safe without
3436     // additional testing.
3437     if (IsCalleeWin64 || IsCallerWin64)
3438       return false;
3439
3440     SmallVector<CCValAssign, 16> ArgLocs;
3441     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3442                    *DAG.getContext());
3443
3444     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3445     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3446       if (!ArgLocs[i].isRegLoc())
3447         return false;
3448   }
3449
3450   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3451   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3452   // this into a sibcall.
3453   bool Unused = false;
3454   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3455     if (!Ins[i].Used) {
3456       Unused = true;
3457       break;
3458     }
3459   }
3460   if (Unused) {
         // At least one result is unused: reject if any result is assigned to
         // FP0 or FP1.
3461     SmallVector<CCValAssign, 16> RVLocs;
3462     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3463                    *DAG.getContext());
3464     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3465     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3466       CCValAssign &VA = RVLocs[i];
3467       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3468         return false;
3469     }
3470   }
3471
3472   // If the calling conventions do not match, then we'd better make sure the
3473   // results are returned in the same way as what the caller expects.
3474   if (!CCMatch) {
         // Assign result locations once under the callee's convention (RVLocs1)
         // and once under the caller's (RVLocs2), then compare them element-wise.
3475     SmallVector<CCValAssign, 16> RVLocs1;
3476     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3477                     *DAG.getContext());
3478     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3479
3480     SmallVector<CCValAssign, 16> RVLocs2;
3481     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3482                     *DAG.getContext());
3483     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3484
3485     if (RVLocs1.size() != RVLocs2.size())
3486       return false;
3487     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3488       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3489         return false;
3490       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3491         return false;
3492       if (RVLocs1[i].isRegLoc()) {
3493         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3494           return false;
3495       } else {
3496         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3497           return false;
3498       }
3499     }
3500   }
3501
3502   // If the callee takes no arguments there is nothing left to check; otherwise
3503   // vet its stack arguments and the registers left for the call address.
3504   if (!Outs.empty()) {
3505     // Check if stack adjustment is needed. For now, do not do this if any
3506     // argument is passed on the stack.
3507     SmallVector<CCValAssign, 16> ArgLocs;
3508     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3509                    *DAG.getContext());
3510
3511     // Allocate shadow area for Win64
3512     if (IsCalleeWin64)
3513       CCInfo.AllocateStack(32, 8);
3514
3515     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3516     if (CCInfo.getNextStackOffset()) {
           // Some arguments live on the stack. Bail out if the caller itself
           // pops bytes on return.
3517       MachineFunction &MF = DAG.getMachineFunction();
3518       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3519         return false;
3520
3521       // Check if the arguments are already laid out in the right way as
3522       // the caller's fixed stack objects.
3523       MachineFrameInfo *MFI = MF.getFrameInfo();
3524       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3525       const X86InstrInfo *TII = Subtarget->getInstrInfo();
3526       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3527         CCValAssign &VA = ArgLocs[i];
3528         SDValue Arg = OutVals[i];
3529         ISD::ArgFlagsTy Flags = Outs[i].Flags;
             // Indirectly passed arguments are never accepted here.
3530         if (VA.getLocInfo() == CCValAssign::Indirect)
3531           return false;
3532         if (!VA.isRegLoc()) {
3533           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3534                                    MFI, MRI, TII))
3535             return false;
3536         }
3537       }
3538     }
3539
3540     // If the tailcall address may be in a register, then make sure it's
3541     // possible to register allocate for it. In 32-bit, the call address can
3542     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3543     // callee-saved registers are restored. These happen to be the same
3544     // registers used to pass 'inreg' arguments so watch out for those.
3545     if (!Subtarget->is64Bit() &&
3546         ((!isa<GlobalAddressSDNode>(Callee) &&
3547           !isa<ExternalSymbolSDNode>(Callee)) ||
3548          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3549       unsigned NumInRegs = 0;
3550       // In PIC we need an extra register to formulate the address computation
3551       // for the callee.
3552       unsigned MaxInRegs =
3553         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3554
3555       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3556         CCValAssign &VA = ArgLocs[i];
3557         if (!VA.isRegLoc())
3558           continue;
3559         unsigned Reg = VA.getLocReg();
3560         switch (Reg) {
3561         default: break;
3562         case X86::EAX: case X86::EDX: case X86::ECX:
               // Reject as soon as MaxInRegs of these registers carry arguments:
               // none would remain for the call address.
3563           if (++NumInRegs == MaxInRegs)
3564             return false;
3565           break;
3566         }
3567       }
3568     }
3569   }
3570
3571   return true;
3572 }
3573
3574 FastISel *
3575 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3576                                   const TargetLibraryInfo *libInfo) const {
3577   return X86::createFastISel(funcInfo, libInfo);
3578 }
3579
3580 //===----------------------------------------------------------------------===//
3581 //                           Other Lowering Hooks
3582 //===----------------------------------------------------------------------===//
3583
3584 static bool MayFoldLoad(SDValue Op) {
3585   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3586 }
3587
3588 static bool MayFoldIntoStore(SDValue Op) {
3589   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3590 }
3591
3592 static bool isTargetShuffle(unsigned Opcode) {
3593   switch(Opcode) {
3594   default: return false;
3595   case X86ISD::BLENDI:
3596   case X86ISD::PSHUFB:
3597   case X86ISD::PSHUFD:
3598   case X86ISD::PSHUFHW:
3599   case X86ISD::PSHUFLW:
3600   case X86ISD::SHUFP:
3601   case X86ISD::PALIGNR:
3602   case X86ISD::MOVLHPS:
3603   case X86ISD::MOVLHPD:
3604   case X86ISD::MOVHLPS:
3605   case X86ISD::MOVLPS:
3606   case X86ISD::MOVLPD:
3607   case X86ISD::MOVSHDUP:
3608   case X86ISD::MOVSLDUP:
3609   case X86ISD::MOVDDUP:
3610   case X86ISD::MOVSS:
3611   case X86ISD::MOVSD:
3612   case X86ISD::UNPCKL:
3613   case X86ISD::UNPCKH:
3614   case X86ISD::VPERMILPI:
3615   case X86ISD::VPERM2X128:
3616   case X86ISD::VPERMI:
3617     return true;
3618   }
3619 }
3620
3621 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3622                                     SDValue V1, SelectionDAG &DAG) {
3623   switch(Opc) {
3624   default: llvm_unreachable("Unknown x86 shuffle node");
3625   case X86ISD::MOVSHDUP:
3626   case X86ISD::MOVSLDUP:
3627   case X86ISD::MOVDDUP:
3628     return DAG.getNode(Opc, dl, VT, V1);
3629   }
3630 }
3631
3632 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3633                                     SDValue V1, unsigned TargetMask,
3634                                     SelectionDAG &DAG) {
3635   switch(Opc) {
3636   default: llvm_unreachable("Unknown x86 shuffle node");
3637   case X86ISD::PSHUFD:
3638   case X86ISD::PSHUFHW:
3639   case X86ISD::PSHUFLW:
3640   case X86ISD::VPERMILPI:
3641   case X86ISD::VPERMI:
3642     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3643   }
3644 }
3645
3646 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3647                                     SDValue V1, SDValue V2, unsigned TargetMask,
3648                                     SelectionDAG &DAG) {
3649   switch(Opc) {
3650   default: llvm_unreachable("Unknown x86 shuffle node");
3651   case X86ISD::PALIGNR:
3652   case X86ISD::VALIGN:
3653   case X86ISD::SHUFP:
3654   case X86ISD::VPERM2X128:
3655     return DAG.getNode(Opc, dl, VT, V1, V2,
3656                        DAG.getConstant(TargetMask, MVT::i8));
3657   }
3658 }
3659
3660 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3661                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3662   switch(Opc) {
3663   default: llvm_unreachable("Unknown x86 shuffle node");
3664   case X86ISD::MOVLHPS:
3665   case X86ISD::MOVLHPD:
3666   case X86ISD::MOVHLPS:
3667   case X86ISD::MOVLPS:
3668   case X86ISD::MOVLPD:
3669   case X86ISD::MOVSS:
3670   case X86ISD::MOVSD:
3671   case X86ISD::UNPCKL:
3672   case X86ISD::UNPCKH:
3673     return DAG.getNode(Opc, dl, VT, V1, V2);
3674   }
3675 }
3676
3677 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3678   MachineFunction &MF = DAG.getMachineFunction();
3679   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3680   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3681   int ReturnAddrIndex = FuncInfo->getRAIndex();
3682
3683   if (ReturnAddrIndex == 0) {
3684     // Set up a frame object for the return address.
3685     unsigned SlotSize = RegInfo->getSlotSize();
3686     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3687                                                            -(int64_t)SlotSize,
3688                                                            false);
3689     FuncInfo->setRAIndex(ReturnAddrIndex);
3690   }
3691
3692   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3693 }
3694
3695 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3696                                        bool hasSymbolicDisplacement) {
3697   // Offset should fit into 32 bit immediate field.
3698   if (!isInt<32>(Offset))
3699     return false;
3700
3701   // If we don't have a symbolic displacement - we don't have any extra
3702   // restrictions.
3703   if (!hasSymbolicDisplacement)
3704     return true;
3705
3706   // FIXME: Some tweaks might be needed for medium code model.
3707   if (M != CodeModel::Small && M != CodeModel::Kernel)
3708     return false;
3709
3710   // For small code model we assume that latest object is 16MB before end of 31
3711   // bits boundary. We may also accept pretty large negative constants knowing
3712   // that all objects are in the positive half of address space.
3713   if (M == CodeModel::Small && Offset < 16*1024*1024)
3714     return true;
3715
3716   // For kernel code model we know that all object resist in the negative half
3717   // of 32bits address space. We may not accept negative offsets, since they may
3718   // be just off and we may accept pretty large positive ones.
3719   if (M == CodeModel::Kernel && Offset >= 0)
3720     return true;
3721
3722   return false;
3723 }
3724
3725 /// isCalleePop - Determines whether the callee is required to pop its
3726 /// own arguments. Callee pop is necessary to support tail calls.
3727 bool X86::isCalleePop(CallingConv::ID CallingConv,
3728                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3729   switch (CallingConv) {
3730   default:
3731     return false;
3732   case CallingConv::X86_StdCall:
3733   case CallingConv::X86_FastCall:
3734   case CallingConv::X86_ThisCall:
3735     return !is64Bit;
3736   case CallingConv::Fast:
3737   case CallingConv::GHC:
3738   case CallingConv::HiPE:
3739     if (IsVarArg)
3740       return false;
3741     return TailCallOpt;
3742   }
3743 }
3744
3745 /// \brief Return true if the condition is an unsigned comparison operation.
3746 static bool isX86CCUnsigned(unsigned X86CC) {
3747   switch (X86CC) {
3748   default: llvm_unreachable("Invalid integer condition!");
3749   case X86::COND_E:     return true;
3750   case X86::COND_G:     return false;
3751   case X86::COND_GE:    return false;
3752   case X86::COND_L:     return false;
3753   case X86::COND_LE:    return false;
3754   case X86::COND_NE:    return true;
3755   case X86::COND_B:     return true;
3756   case X86::COND_A:     return true;
3757   case X86::COND_BE:    return true;
3758   case X86::COND_AE:    return true;
3759   }
3760   llvm_unreachable("covered switch fell through?!");
3761 }
3762
3763 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3764 /// specific condition code, returning the condition code and the LHS/RHS of the
3765 /// comparison to make.
3766 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3767                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3768   if (!isFP) {
3769     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3770       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3771         // X > -1   -> X == 0, jump !sign.
3772         RHS = DAG.getConstant(0, RHS.getValueType());
3773         return X86::COND_NS;
3774       }
3775       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3776         // X < 0   -> X == 0, jump on sign.
3777         return X86::COND_S;
3778       }
3779       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3780         // X < 1   -> X <= 0
3781         RHS = DAG.getConstant(0, RHS.getValueType());
3782         return X86::COND_LE;
3783       }
3784     }
3785
3786     switch (SetCCOpcode) {
3787     default: llvm_unreachable("Invalid integer condition!");
3788     case ISD::SETEQ:  return X86::COND_E;
3789     case ISD::SETGT:  return X86::COND_G;
3790     case ISD::SETGE:  return X86::COND_GE;
3791     case ISD::SETLT:  return X86::COND_L;
3792     case ISD::SETLE:  return X86::COND_LE;
3793     case ISD::SETNE:  return X86::COND_NE;
3794     case ISD::SETULT: return X86::COND_B;
3795     case ISD::SETUGT: return X86::COND_A;
3796     case ISD::SETULE: return X86::COND_BE;
3797     case ISD::SETUGE: return X86::COND_AE;
3798     }
3799   }
3800
3801   // First determine if it is required or is profitable to flip the operands.
3802
3803   // If LHS is a foldable load, but RHS is not, flip the condition.
3804   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3805       !ISD::isNON_EXTLoad(RHS.getNode())) {
3806     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3807     std::swap(LHS, RHS);
3808   }
3809
3810   switch (SetCCOpcode) {
3811   default: break;
3812   case ISD::SETOLT:
3813   case ISD::SETOLE:
3814   case ISD::SETUGT:
3815   case ISD::SETUGE:
3816     std::swap(LHS, RHS);
3817     break;
3818   }
3819
3820   // On a floating point condition, the flags are set as follows:
3821   // ZF  PF  CF   op
3822   //  0 | 0 | 0 | X > Y
3823   //  0 | 0 | 1 | X < Y
3824   //  1 | 0 | 0 | X == Y
3825   //  1 | 1 | 1 | unordered
3826   switch (SetCCOpcode) {
3827   default: llvm_unreachable("Condcode should be pre-legalized away");
3828   case ISD::SETUEQ:
3829   case ISD::SETEQ:   return X86::COND_E;
3830   case ISD::SETOLT:              // flipped
3831   case ISD::SETOGT:
3832   case ISD::SETGT:   return X86::COND_A;
3833   case ISD::SETOLE:              // flipped
3834   case ISD::SETOGE:
3835   case ISD::SETGE:   return X86::COND_AE;
3836   case ISD::SETUGT:              // flipped
3837   case ISD::SETULT:
3838   case ISD::SETLT:   return X86::COND_B;
3839   case ISD::SETUGE:              // flipped
3840   case ISD::SETULE:
3841   case ISD::SETLE:   return X86::COND_BE;
3842   case ISD::SETONE:
3843   case ISD::SETNE:   return X86::COND_NE;
3844   case ISD::SETUO:   return X86::COND_P;
3845   case ISD::SETO:    return X86::COND_NP;
3846   case ISD::SETOEQ:
3847   case ISD::SETUNE:  return X86::COND_INVALID;
3848   }
3849 }
3850
3851 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3852 /// code. Current x86 isa includes the following FP cmov instructions:
3853 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3854 static bool hasFPCMov(unsigned X86CC) {
3855   switch (X86CC) {
3856   default:
3857     return false;
3858   case X86::COND_B:
3859   case X86::COND_BE:
3860   case X86::COND_E:
3861   case X86::COND_P:
3862   case X86::COND_A:
3863   case X86::COND_AE:
3864   case X86::COND_NE:
3865   case X86::COND_NP:
3866     return true;
3867   }
3868 }
3869
3870 /// isFPImmLegal - Returns true if the target can instruction select the
3871 /// specified FP immediate natively. If false, the legalizer will
3872 /// materialize the FP immediate as a load from a constant pool.
3873 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3874   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3875     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3876       return true;
3877   }
3878   return false;
3879 }
3880
3881 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3882                                               ISD::LoadExtType ExtTy,
3883                                               EVT NewVT) const {
3884   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3885   // relocation target a movq or addq instruction: don't let the load shrink.
3886   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3887   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3888     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3889       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3890   return true;
3891 }
3892
3893 /// \brief Returns true if it is beneficial to convert a load of a constant
3894 /// to just the constant itself.
3895 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3896                                                           Type *Ty) const {
3897   assert(Ty->isIntegerTy());
3898
3899   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3900   if (BitSize == 0 || BitSize > 64)
3901     return false;
3902   return true;
3903 }
3904
3905 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3906                                                 unsigned Index) const {
3907   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3908     return false;
3909
3910   return (Index == 0 || Index == ResVT.getVectorNumElements());
3911 }
3912
3913 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3914   // Speculate cttz only if we can directly use TZCNT.
3915   return Subtarget->hasBMI();
3916 }
3917
3918 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3919   // Speculate ctlz only if we can directly use LZCNT.
3920   return Subtarget->hasLZCNT();
3921 }
3922
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3928
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3934
3935 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3936 /// from position Pos and ending in Pos+Size, falls within the specified
3937 /// sequential range (Low, Low+Size]. or is undef.
3938 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3939                                        unsigned Pos, unsigned Size, int Low) {
3940   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3941     if (!isUndefOrEqual(Mask[i], Low))
3942       return false;
3943   return true;
3944 }
3945
3946 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3947 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3948 /// operand - by default will match for first operand.
3949 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3950                          bool TestSecondOperand = false) {
3951   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3952       VT != MVT::v2f64 && VT != MVT::v2i64)
3953     return false;
3954
3955   unsigned NumElems = VT.getVectorNumElements();
3956   unsigned Lo = TestSecondOperand ? NumElems : 0;
3957   unsigned Hi = Lo + NumElems;
3958
3959   for (unsigned i = 0; i < NumElems; ++i)
3960     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3961       return false;
3962
3963   return true;
3964 }
3965
3966 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3967 /// is suitable for input to PSHUFHW.
3968 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3969   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3970     return false;
3971
3972   // Lower quadword copied in order or undef.
3973   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3974     return false;
3975
3976   // Upper quadword shuffled.
3977   for (unsigned i = 4; i != 8; ++i)
3978     if (!isUndefOrInRange(Mask[i], 4, 8))
3979       return false;
3980
3981   if (VT == MVT::v16i16) {
3982     // Lower quadword copied in order or undef.
3983     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3984       return false;
3985
3986     // Upper quadword shuffled.
3987     for (unsigned i = 12; i != 16; ++i)
3988       if (!isUndefOrInRange(Mask[i], 12, 16))
3989         return false;
3990   }
3991
3992   return true;
3993 }
3994
3995 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3996 /// is suitable for input to PSHUFLW.
3997 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3998   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3999     return false;
4000
4001   // Upper quadword copied in order.
4002   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4003     return false;
4004
4005   // Lower quadword shuffled.
4006   for (unsigned i = 0; i != 4; ++i)
4007     if (!isUndefOrInRange(Mask[i], 0, 4))
4008       return false;
4009
4010   if (VT == MVT::v16i16) {
4011     // Upper quadword copied in order.
4012     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4013       return false;
4014
4015     // Lower quadword shuffled.
4016     for (unsigned i = 8; i != 12; ++i)
4017       if (!isUndefOrInRange(Mask[i], 8, 12))
4018         return false;
4019   }
4020
4021   return true;
4022 }
4023
4024 /// \brief Return true if the mask specifies a shuffle of elements that is
4025 /// suitable for input to intralane (palignr) or interlane (valign) vector
4026 /// right-shift.
     ///
     /// \param Mask shuffle mask; negative entries are treated as undef.
     /// \param VT vector type being shuffled.
     /// \param InterLane if true the whole vector is treated as a single lane,
     ///        otherwise every 128-bit chunk of VT is checked as its own lane.
4027 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4028   unsigned NumElts = VT.getVectorNumElements();
4029   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4030   unsigned NumLaneElts = NumElts/NumLanes;
4031
4032   // Do not handle 64-bit element shuffles with palignr.
4033   if (NumLaneElts == 2)
4034     return false;
4035
       // l is the index of the first element of the lane being checked.
4036   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
         // Find the first defined (non-undef) mask element of this lane.
4037     unsigned i;
4038     for (i = 0; i != NumLaneElts; ++i) {
4039       if (Mask[i+l] >= 0)
4040         break;
4041     }
4042
4043     // Lane is all undef, go to next lane
4044     if (i == NumLaneElts)
4045       continue;
4046
4047     int Start = Mask[i+l];
4048
4049     // Make sure it's in this lane in one of the sources
4050     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4051         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4052       return false;
4053
4054     // If not lane 0, then we must match lane 0
4055     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4056       return false;
4057
4058     // Correct second source to be contiguous with first source
4059     if (Start >= (int)NumElts)
4060       Start -= NumElts - NumLaneElts;
4061
4062     // Make sure we're shifting in the right direction.
4063     if (Start <= (int)(i+l))
4064       return false;
4065
         // Rebase Start to the index that lane element 0 would select, so that
         // lane element i is expected to be Start+i below.
4066     Start -= i;
4067
4068     // Check the rest of the elements to see if they are consecutive.
4069     for (++i; i != NumLaneElts; ++i) {
4070       int Idx = Mask[i+l];
4071
4072       // Make sure it's in this lane in one of the sources
4073       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4074           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4075         return false;
4076
4077       // If not lane 0, then we must match lane 0
4078       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4079         return false;
4080
           // Correct second source to be contiguous with first source
4081       if (Idx >= (int)NumElts)
4082         Idx -= NumElts - NumLaneElts;
4083
           // Undef elements are accepted; defined ones must continue the run.
4084       if (!isUndefOrEqual(Idx, Start+i))
4085         return false;
4086
4087     }
4088   }
4089
4090   return true;
4091 }
4092
4093 /// \brief Return true if the node specifies a shuffle of elements that is
4094 /// suitable for input to PALIGNR.
4095 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4096                           const X86Subtarget *Subtarget) {
4097   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4098       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4099       VT.is512BitVector())
4100     // FIXME: Add AVX512BW.
4101     return false;
4102
4103   return isAlignrMask(Mask, VT, false);
4104 }
4105
4106 /// \brief Return true if the node specifies a shuffle of elements that is
4107 /// suitable for input to VALIGN.
4108 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4109                           const X86Subtarget *Subtarget) {
4110   // FIXME: Add AVX512VL.
4111   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4112     return false;
4113   return isAlignrMask(Mask, VT, true);
4114 }
4115
4116 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4117 /// the two vector operands have swapped position.
4118 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4119                                      unsigned NumElems) {
4120   for (unsigned i = 0; i != NumElems; ++i) {
4121     int idx = Mask[i];
4122     if (idx < 0)
4123       continue;
4124     else if (idx < (int)NumElems)
4125       Mask[i] = idx + NumElems;
4126     else
4127       Mask[i] = idx - NumElems;
4128   }
4129 }
4130
4131 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4132 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4133 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4134 /// reverse of what x86 shuffles want.
4135 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4136
4137   unsigned NumElems = VT.getVectorNumElements();
4138   unsigned NumLanes = VT.getSizeInBits()/128;
4139   unsigned NumLaneElems = NumElems/NumLanes;
4140
4141   if (NumLaneElems != 2 && NumLaneElems != 4)
4142     return false;
4143
4144   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4145   bool symetricMaskRequired =
4146     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4147
4148   // VSHUFPSY divides the resulting vector into 4 chunks.
4149   // The sources are also splitted into 4 chunks, and each destination
4150   // chunk must come from a different source chunk.
4151   //
4152   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4153   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4154   //
4155   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4156   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4157   //
4158   // VSHUFPDY divides the resulting vector into 4 chunks.
4159   // The sources are also splitted into 4 chunks, and each destination
4160   // chunk must come from a different source chunk.
4161   //
4162   //  SRC1 =>      X3       X2       X1       X0
4163   //  SRC2 =>      Y3       Y2       Y1       Y0
4164   //
4165   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4166   //
4167   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4168   unsigned HalfLaneElems = NumLaneElems/2;
4169   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4170     for (unsigned i = 0; i != NumLaneElems; ++i) {
4171       int Idx = Mask[i+l];
4172       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4173       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4174         return false;
4175       // For VSHUFPSY, the mask of the second half must be the same as the
4176       // first but with the appropriate offsets. This works in the same way as
4177       // VPERMILPS works with masks.
4178       if (!symetricMaskRequired || Idx < 0)
4179         continue;
4180       if (MaskVal[i] < 0) {
4181         MaskVal[i] = Idx - l;
4182         continue;
4183       }
4184       if ((signed)(Idx - l) != MaskVal[i])
4185         return false;
4186     }
4187   }
4188
4189   return true;
4190 }
4191
4192 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4193 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4194 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4195   if (!VT.is128BitVector())
4196     return false;
4197
4198   unsigned NumElems = VT.getVectorNumElements();
4199
4200   if (NumElems != 4)
4201     return false;
4202
4203   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4204   return isUndefOrEqual(Mask[0], 6) &&
4205          isUndefOrEqual(Mask[1], 7) &&
4206          isUndefOrEqual(Mask[2], 2) &&
4207          isUndefOrEqual(Mask[3], 3);
4208 }
4209
4210 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4211 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4212 /// <2, 3, 2, 3>
4213 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4214   if (!VT.is128BitVector())
4215     return false;
4216
4217   unsigned NumElems = VT.getVectorNumElements();
4218
4219   if (NumElems != 4)
4220     return false;
4221
4222   return isUndefOrEqual(Mask[0], 2) &&
4223          isUndefOrEqual(Mask[1], 3) &&
4224          isUndefOrEqual(Mask[2], 2) &&
4225          isUndefOrEqual(Mask[3], 3);
4226 }
4227
4228 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4229 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4230 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4231   if (!VT.is128BitVector())
4232     return false;
4233
4234   unsigned NumElems = VT.getVectorNumElements();
4235
4236   if (NumElems != 2 && NumElems != 4)
4237     return false;
4238
4239   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4240     if (!isUndefOrEqual(Mask[i], i + NumElems))
4241       return false;
4242
4243   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4244     if (!isUndefOrEqual(Mask[i], i))
4245       return false;
4246
4247   return true;
4248 }
4249
4250 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4251 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4252 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4253   if (!VT.is128BitVector())
4254     return false;
4255
4256   unsigned NumElems = VT.getVectorNumElements();
4257
4258   if (NumElems != 2 && NumElems != 4)
4259     return false;
4260
4261   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4262     if (!isUndefOrEqual(Mask[i], i))
4263       return false;
4264
4265   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4266     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4267       return false;
4268
4269   return true;
4270 }
4271
4272 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4273 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4274 /// i. e: If all but one element come from the same vector.
4275 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4276   // TODO: Deal with AVX's VINSERTPS
4277   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4278     return false;
4279
4280   unsigned CorrectPosV1 = 0;
4281   unsigned CorrectPosV2 = 0;
4282   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4283     if (Mask[i] == -1) {
4284       ++CorrectPosV1;
4285       ++CorrectPosV2;
4286       continue;
4287     }
4288
4289     if (Mask[i] == i)
4290       ++CorrectPosV1;
4291     else if (Mask[i] == i + 4)
4292       ++CorrectPosV2;
4293   }
4294
4295   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4296     // We have 3 elements (undefs count as elements from any vector) from one
4297     // vector, and one from another.
4298     return true;
4299
4300   return false;
4301 }
4302
4303 //
4304 // Some special combinations that can be optimized.
4305 //
4306 static
4307 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4308                                SelectionDAG &DAG) {
4309   MVT VT = SVOp->getSimpleValueType(0);
4310   SDLoc dl(SVOp);
4311
4312   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4313     return SDValue();
4314
4315   ArrayRef<int> Mask = SVOp->getMask();
4316
4317   // These are the special masks that may be optimized.
4318   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4319   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4320   bool MatchEvenMask = true;
4321   bool MatchOddMask  = true;
4322   for (int i=0; i<8; ++i) {
4323     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4324       MatchEvenMask = false;
4325     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4326       MatchOddMask = false;
4327   }
4328
4329   if (!MatchEvenMask && !MatchOddMask)
4330     return SDValue();
4331
4332   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4333
4334   SDValue Op0 = SVOp->getOperand(0);
4335   SDValue Op1 = SVOp->getOperand(1);
4336
4337   if (MatchEvenMask) {
4338     // Shift the second operand right to 32 bits.
4339     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4340     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4341   } else {
4342     // Shift the first operand left to 32 bits.
4343     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4344     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4345   }
4346   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4347   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4348 }
4349
4350 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4351 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4352 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4353                          bool HasInt256, bool V2IsSplat = false) {
4354
4355   assert(VT.getSizeInBits() >= 128 &&
4356          "Unsupported vector type for unpckl");
4357
4358   unsigned NumElts = VT.getVectorNumElements();
4359   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4360       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4361     return false;
4362
4363   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4364          "Unsupported vector type for unpckh");
4365
4366   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4367   unsigned NumLanes = VT.getSizeInBits()/128;
4368   unsigned NumLaneElts = NumElts/NumLanes;
4369
4370   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4371     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4372       int BitI  = Mask[l+i];
4373       int BitI1 = Mask[l+i+1];
4374       if (!isUndefOrEqual(BitI, j))
4375         return false;
4376       if (V2IsSplat) {
4377         if (!isUndefOrEqual(BitI1, NumElts))
4378           return false;
4379       } else {
4380         if (!isUndefOrEqual(BitI1, j + NumElts))
4381           return false;
4382       }
4383     }
4384   }
4385
4386   return true;
4387 }
4388
4389 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4390 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4391 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4392                          bool HasInt256, bool V2IsSplat = false) {
4393   assert(VT.getSizeInBits() >= 128 &&
4394          "Unsupported vector type for unpckh");
4395
4396   unsigned NumElts = VT.getVectorNumElements();
4397   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4398       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4399     return false;
4400
4401   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4402          "Unsupported vector type for unpckh");
4403
4404   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4405   unsigned NumLanes = VT.getSizeInBits()/128;
4406   unsigned NumLaneElts = NumElts/NumLanes;
4407
4408   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4409     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4410       int BitI  = Mask[l+i];
4411       int BitI1 = Mask[l+i+1];
4412       if (!isUndefOrEqual(BitI, j))
4413         return false;
4414       if (V2IsSplat) {
4415         if (isUndefOrEqual(BitI1, NumElts))
4416           return false;
4417       } else {
4418         if (!isUndefOrEqual(BitI1, j+NumElts))
4419           return false;
4420       }
4421     }
4422   }
4423   return true;
4424 }
4425
4426 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4427 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4428 /// <0, 0, 1, 1>
4429 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4430   unsigned NumElts = VT.getVectorNumElements();
4431   bool Is256BitVec = VT.is256BitVector();
4432
4433   if (VT.is512BitVector())
4434     return false;
4435   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4436          "Unsupported vector type for unpckh");
4437
4438   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4439       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4440     return false;
4441
4442   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4443   // FIXME: Need a better way to get rid of this, there's no latency difference
4444   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4445   // the former later. We should also remove the "_undef" special mask.
4446   if (NumElts == 4 && Is256BitVec)
4447     return false;
4448
4449   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4450   // independently on 128-bit lanes.
4451   unsigned NumLanes = VT.getSizeInBits()/128;
4452   unsigned NumLaneElts = NumElts/NumLanes;
4453
4454   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4455     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4456       int BitI  = Mask[l+i];
4457       int BitI1 = Mask[l+i+1];
4458
4459       if (!isUndefOrEqual(BitI, j))
4460         return false;
4461       if (!isUndefOrEqual(BitI1, j))
4462         return false;
4463     }
4464   }
4465
4466   return true;
4467 }
4468
4469 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4470 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4471 /// <2, 2, 3, 3>
4472 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4473   unsigned NumElts = VT.getVectorNumElements();
4474
4475   if (VT.is512BitVector())
4476     return false;
4477
4478   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4479          "Unsupported vector type for unpckh");
4480
4481   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4482       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4483     return false;
4484
4485   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4486   // independently on 128-bit lanes.
4487   unsigned NumLanes = VT.getSizeInBits()/128;
4488   unsigned NumLaneElts = NumElts/NumLanes;
4489
4490   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4491     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4492       int BitI  = Mask[l+i];
4493       int BitI1 = Mask[l+i+1];
4494       if (!isUndefOrEqual(BitI, j))
4495         return false;
4496       if (!isUndefOrEqual(BitI1, j))
4497         return false;
4498     }
4499   }
4500   return true;
4501 }
4502
4503 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4504 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4505 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4506   if (!VT.is512BitVector())
4507     return false;
4508
4509   unsigned NumElts = VT.getVectorNumElements();
4510   unsigned HalfSize = NumElts/2;
4511   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4512     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4513       *Imm = 1;
4514       return true;
4515     }
4516   }
4517   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4518     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4519       *Imm = 0;
4520       return true;
4521     }
4522   }
4523   return false;
4524 }
4525
4526 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4527 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4528 /// MOVSD, and MOVD, i.e. setting the lowest element.
4529 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4530   if (VT.getVectorElementType().getSizeInBits() < 32)
4531     return false;
4532   if (!VT.is128BitVector())
4533     return false;
4534
4535   unsigned NumElts = VT.getVectorNumElements();
4536
4537   if (!isUndefOrEqual(Mask[0], NumElts))
4538     return false;
4539
4540   for (unsigned i = 1; i != NumElts; ++i)
4541     if (!isUndefOrEqual(Mask[i], i))
4542       return false;
4543
4544   return true;
4545 }
4546
4547 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4548 /// as permutations between 128-bit chunks or halves. As an example: this
4549 /// shuffle bellow:
4550 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4551 /// The first half comes from the second half of V1 and the second half from the
4552 /// the second half of V2.
4553 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4554   if (!HasFp256 || !VT.is256BitVector())
4555     return false;
4556
4557   // The shuffle result is divided into half A and half B. In total the two
4558   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4559   // B must come from C, D, E or F.
4560   unsigned HalfSize = VT.getVectorNumElements()/2;
4561   bool MatchA = false, MatchB = false;
4562
4563   // Check if A comes from one of C, D, E, F.
4564   for (unsigned Half = 0; Half != 4; ++Half) {
4565     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4566       MatchA = true;
4567       break;
4568     }
4569   }
4570
4571   // Check if B comes from one of C, D, E, F.
4572   for (unsigned Half = 0; Half != 4; ++Half) {
4573     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4574       MatchB = true;
4575       break;
4576     }
4577   }
4578
4579   return MatchA && MatchB;
4580 }
4581
4582 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4583 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4584 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4585   MVT VT = SVOp->getSimpleValueType(0);
4586
4587   unsigned HalfSize = VT.getVectorNumElements()/2;
4588
4589   unsigned FstHalf = 0, SndHalf = 0;
4590   for (unsigned i = 0; i < HalfSize; ++i) {
4591     if (SVOp->getMaskElt(i) > 0) {
4592       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4593       break;
4594     }
4595   }
4596   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4597     if (SVOp->getMaskElt(i) > 0) {
4598       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4599       break;
4600     }
4601   }
4602
4603   return (FstHalf | (SndHalf << 4));
4604 }
4605
4606 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4607 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4608   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4609   if (EltSize < 32)
4610     return false;
4611
4612   unsigned NumElts = VT.getVectorNumElements();
4613   Imm8 = 0;
4614   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4615     for (unsigned i = 0; i != NumElts; ++i) {
4616       if (Mask[i] < 0)
4617         continue;
4618       Imm8 |= Mask[i] << (i*2);
4619     }
4620     return true;
4621   }
4622
4623   unsigned LaneSize = 4;
4624   SmallVector<int, 4> MaskVal(LaneSize, -1);
4625
4626   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4627     for (unsigned i = 0; i != LaneSize; ++i) {
4628       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4629         return false;
4630       if (Mask[i+l] < 0)
4631         continue;
4632       if (MaskVal[i] < 0) {
4633         MaskVal[i] = Mask[i+l] - l;
4634         Imm8 |= MaskVal[i] << (i*2);
4635         continue;
4636       }
4637       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4638         return false;
4639     }
4640   }
4641   return true;
4642 }
4643
4644 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4645 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4646 /// Note that VPERMIL mask matching is different depending whether theunderlying
4647 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4648 /// to the same elements of the low, but to the higher half of the source.
4649 /// In VPERMILPD the two lanes could be shuffled independently of each other
4650 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4651 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4652   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4653   if (VT.getSizeInBits() < 256 || EltSize < 32)
4654     return false;
4655   bool symetricMaskRequired = (EltSize == 32);
4656   unsigned NumElts = VT.getVectorNumElements();
4657
4658   unsigned NumLanes = VT.getSizeInBits()/128;
4659   unsigned LaneSize = NumElts/NumLanes;
4660   // 2 or 4 elements in one lane
4661
4662   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4663   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4664     for (unsigned i = 0; i != LaneSize; ++i) {
4665       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4666         return false;
4667       if (symetricMaskRequired) {
4668         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4669           ExpectedMaskVal[i] = Mask[i+l] - l;
4670           continue;
4671         }
4672         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4673           return false;
4674       }
4675     }
4676   }
4677   return true;
4678 }
4679
4680 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4681 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4682 /// element of vector 2 and the other elements to come from vector 1 in order.
4683 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4684                                bool V2IsSplat = false, bool V2IsUndef = false) {
4685   if (!VT.is128BitVector())
4686     return false;
4687
4688   unsigned NumOps = VT.getVectorNumElements();
4689   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4690     return false;
4691
4692   if (!isUndefOrEqual(Mask[0], 0))
4693     return false;
4694
4695   for (unsigned i = 1; i != NumOps; ++i)
4696     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4697           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4698           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4699       return false;
4700
4701   return true;
4702 }
4703
4704 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4705 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4706 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4707 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4708                            const X86Subtarget *Subtarget) {
4709   if (!Subtarget->hasSSE3())
4710     return false;
4711
4712   unsigned NumElems = VT.getVectorNumElements();
4713
4714   if ((VT.is128BitVector() && NumElems != 4) ||
4715       (VT.is256BitVector() && NumElems != 8) ||
4716       (VT.is512BitVector() && NumElems != 16))
4717     return false;
4718
4719   // "i+1" is the value the indexed mask element must have
4720   for (unsigned i = 0; i != NumElems; i += 2)
4721     if (!isUndefOrEqual(Mask[i], i+1) ||
4722         !isUndefOrEqual(Mask[i+1], i+1))
4723       return false;
4724
4725   return true;
4726 }
4727
4728 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4729 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4730 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4731 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4732                            const X86Subtarget *Subtarget) {
4733   if (!Subtarget->hasSSE3())
4734     return false;
4735
4736   unsigned NumElems = VT.getVectorNumElements();
4737
4738   if ((VT.is128BitVector() && NumElems != 4) ||
4739       (VT.is256BitVector() && NumElems != 8) ||
4740       (VT.is512BitVector() && NumElems != 16))
4741     return false;
4742
4743   // "i" is the value the indexed mask element must have
4744   for (unsigned i = 0; i != NumElems; i += 2)
4745     if (!isUndefOrEqual(Mask[i], i) ||
4746         !isUndefOrEqual(Mask[i+1], i))
4747       return false;
4748
4749   return true;
4750 }
4751
4752 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4753 /// specifies a shuffle of elements that is suitable for input to 256-bit
4754 /// version of MOVDDUP.
4755 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4756   if (!HasFp256 || !VT.is256BitVector())
4757     return false;
4758
4759   unsigned NumElts = VT.getVectorNumElements();
4760   if (NumElts != 4)
4761     return false;
4762
4763   for (unsigned i = 0; i != NumElts/2; ++i)
4764     if (!isUndefOrEqual(Mask[i], 0))
4765       return false;
4766   for (unsigned i = NumElts/2; i != NumElts; ++i)
4767     if (!isUndefOrEqual(Mask[i], NumElts/2))
4768       return false;
4769   return true;
4770 }
4771
4772 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4773 /// specifies a shuffle of elements that is suitable for input to 128-bit
4774 /// version of MOVDDUP.
4775 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4776   if (!VT.is128BitVector())
4777     return false;
4778
4779   unsigned e = VT.getVectorNumElements() / 2;
4780   for (unsigned i = 0; i != e; ++i)
4781     if (!isUndefOrEqual(Mask[i], i))
4782       return false;
4783   for (unsigned i = 0; i != e; ++i)
4784     if (!isUndefOrEqual(Mask[e+i], i))
4785       return false;
4786   return true;
4787 }
4788
4789 /// isVEXTRACTIndex - Return true if the specified
4790 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4791 /// suitable for instruction that extract 128 or 256 bit vectors
4792 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4793   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4794   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4795     return false;
4796
4797   // The index should be aligned on a vecWidth-bit boundary.
4798   uint64_t Index =
4799     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4800
4801   MVT VT = N->getSimpleValueType(0);
4802   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4803   bool Result = (Index * ElSize) % vecWidth == 0;
4804
4805   return Result;
4806 }
4807
4808 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4809 /// operand specifies a subvector insert that is suitable for input to
4810 /// insertion of 128 or 256-bit subvectors
4811 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4812   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4813   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4814     return false;
4815   // The index should be aligned on a vecWidth-bit boundary.
4816   uint64_t Index =
4817     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4818
4819   MVT VT = N->getSimpleValueType(0);
4820   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4821   bool Result = (Index * ElSize) % vecWidth == 0;
4822
4823   return Result;
4824 }
4825
/// Return true if the subvector-insert index of \p N is suitable for a
/// 128-bit insert; forwards to isVINSERTIndex with a width of 128.
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}
4829
/// Return true if the subvector-insert index of \p N is suitable for a
/// 256-bit insert; forwards to isVINSERTIndex with a width of 256.
bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}
4833
/// Return true if the subvector-extract index of \p N is suitable for a
/// 128-bit extract; forwards to isVEXTRACTIndex with a width of 128.
bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}
4837
/// Return true if the subvector-extract index of \p N is suitable for a
/// 256-bit extract; forwards to isVEXTRACTIndex with a width of 256.
bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
4841
4842 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4843 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4844 /// Handles 128-bit and 256-bit.
4845 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4846   MVT VT = N->getSimpleValueType(0);
4847
4848   assert((VT.getSizeInBits() >= 128) &&
4849          "Unsupported vector type for PSHUF/SHUFP");
4850
4851   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4852   // independently on 128-bit lanes.
4853   unsigned NumElts = VT.getVectorNumElements();
4854   unsigned NumLanes = VT.getSizeInBits()/128;
4855   unsigned NumLaneElts = NumElts/NumLanes;
4856
4857   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4858          "Only supports 2, 4 or 8 elements per lane");
4859
4860   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4861   unsigned Mask = 0;
4862   for (unsigned i = 0; i != NumElts; ++i) {
4863     int Elt = N->getMaskElt(i);
4864     if (Elt < 0) continue;
4865     Elt &= NumLaneElts - 1;
4866     unsigned ShAmt = (i << Shift) % 8;
4867     Mask |= Elt << ShAmt;
4868   }
4869
4870   return Mask;
4871 }
4872
4873 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4874 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4875 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4876   MVT VT = N->getSimpleValueType(0);
4877
4878   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4879          "Unsupported vector type for PSHUFHW");
4880
4881   unsigned NumElts = VT.getVectorNumElements();
4882
4883   unsigned Mask = 0;
4884   for (unsigned l = 0; l != NumElts; l += 8) {
4885     // 8 nodes per lane, but we only care about the last 4.
4886     for (unsigned i = 0; i < 4; ++i) {
4887       int Elt = N->getMaskElt(l+i+4);
4888       if (Elt < 0) continue;
4889       Elt &= 0x3; // only 2-bits.
4890       Mask |= Elt << (i * 2);
4891     }
4892   }
4893
4894   return Mask;
4895 }
4896
4897 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4898 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4899 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4900   MVT VT = N->getSimpleValueType(0);
4901
4902   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4903          "Unsupported vector type for PSHUFHW");
4904
4905   unsigned NumElts = VT.getVectorNumElements();
4906
4907   unsigned Mask = 0;
4908   for (unsigned l = 0; l != NumElts; l += 8) {
4909     // 8 nodes per lane, but we only care about the first 4.
4910     for (unsigned i = 0; i < 4; ++i) {
4911       int Elt = N->getMaskElt(l+i);
4912       if (Elt < 0) continue;
4913       Elt &= 0x3; // only 2-bits
4914       Mask |= Elt << (i * 2);
4915     }
4916   }
4917
4918   return Mask;
4919 }
4920
4921 /// \brief Return the appropriate immediate to shuffle the specified
4922 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4923 /// VALIGN (if Interlane is true) instructions.
4924 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4925                                            bool InterLane) {
4926   MVT VT = SVOp->getSimpleValueType(0);
4927   unsigned EltSize = InterLane ? 1 :
4928     VT.getVectorElementType().getSizeInBits() >> 3;
4929
4930   unsigned NumElts = VT.getVectorNumElements();
4931   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4932   unsigned NumLaneElts = NumElts/NumLanes;
4933
4934   int Val = 0;
4935   unsigned i;
4936   for (i = 0; i != NumElts; ++i) {
4937     Val = SVOp->getMaskElt(i);
4938     if (Val >= 0)
4939       break;
4940   }
4941   if (Val >= (int)NumElts)
4942     Val -= NumElts - NumLaneElts;
4943
4944   assert(Val - i > 0 && "PALIGNR imm should be positive");
4945   return (Val - i) * EltSize;
4946 }
4947
4948 /// \brief Return the appropriate immediate to shuffle the specified
4949 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4950 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4951   return getShuffleAlignrImmediate(SVOp, false);
4952 }
4953
4954 /// \brief Return the appropriate immediate to shuffle the specified
4955 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4956 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4957   return getShuffleAlignrImmediate(SVOp, true);
4958 }
4959
4960
4961 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4962   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4963   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4964     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4965
4966   uint64_t Index =
4967     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4968
4969   MVT VecVT = N->getOperand(0).getSimpleValueType();
4970   MVT ElVT = VecVT.getVectorElementType();
4971
4972   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4973   return Index / NumElemsPerChunk;
4974 }
4975
4976 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4977   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4978   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4979     llvm_unreachable("Illegal insert subvector for VINSERT");
4980
4981   uint64_t Index =
4982     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4983
4984   MVT VecVT = N->getSimpleValueType(0);
4985   MVT ElVT = VecVT.getVectorElementType();
4986
4987   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4988   return Index / NumElemsPerChunk;
4989 }
4990
4991 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4992 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4993 /// and VINSERTI128 instructions.
4994 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4995   return getExtractVEXTRACTImmediate(N, 128);
4996 }
4997
4998 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4999 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5000 /// and VINSERTI64x4 instructions.
5001 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5002   return getExtractVEXTRACTImmediate(N, 256);
5003 }
5004
5005 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5006 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5007 /// and VINSERTI128 instructions.
5008 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5009   return getInsertVINSERTImmediate(N, 128);
5010 }
5011
5012 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5013 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5014 /// and VINSERTI64x4 instructions.
5015 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5016   return getInsertVINSERTImmediate(N, 256);
5017 }
5018
5019 /// isZero - Returns true if Elt is a constant integer zero
5020 static bool isZero(SDValue V) {
5021   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5022   return C && C->isNullValue();
5023 }
5024
5025 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5026 /// constant +0.0.
5027 bool X86::isZeroNode(SDValue Elt) {
5028   if (isZero(Elt))
5029     return true;
5030   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5031     return CFP->getValueAPF().isPosZero();
5032   return false;
5033 }
5034
5035 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5036 /// match movhlps. The lower half elements should come from upper half of
5037 /// V1 (and in order), and the upper half elements should come from the upper
5038 /// half of V2 (and in order).
5039 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5040   if (!VT.is128BitVector())
5041     return false;
5042   if (VT.getVectorNumElements() != 4)
5043     return false;
5044   for (unsigned i = 0, e = 2; i != e; ++i)
5045     if (!isUndefOrEqual(Mask[i], i+2))
5046       return false;
5047   for (unsigned i = 2; i != 4; ++i)
5048     if (!isUndefOrEqual(Mask[i], i+4))
5049       return false;
5050   return true;
5051 }
5052
5053 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5054 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5055 /// required.
5056 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5057   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5058     return false;
5059   N = N->getOperand(0).getNode();
5060   if (!ISD::isNON_EXTLoad(N))
5061     return false;
5062   if (LD)
5063     *LD = cast<LoadSDNode>(N);
5064   return true;
5065 }
5066
5067 // Test whether the given value is a vector value which will be legalized
5068 // into a load.
5069 static bool WillBeConstantPoolLoad(SDNode *N) {
5070   if (N->getOpcode() != ISD::BUILD_VECTOR)
5071     return false;
5072
5073   // Check for any non-constant elements.
5074   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5075     switch (N->getOperand(i).getNode()->getOpcode()) {
5076     case ISD::UNDEF:
5077     case ISD::ConstantFP:
5078     case ISD::Constant:
5079       break;
5080     default:
5081       return false;
5082     }
5083
5084   // Vectors of all-zeros and all-ones are materialized with special
5085   // instructions rather than being loaded.
5086   return !ISD::isBuildVectorAllZeros(N) &&
5087          !ISD::isBuildVectorAllOnes(N);
5088 }
5089
5090 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5091 /// match movlp{s|d}. The lower half elements should come from lower half of
5092 /// V1 (and in order), and the upper half elements should come from the upper
5093 /// half of V2 (and in order). And since V1 will become the source of the
5094 /// MOVLP, it must be either a vector load or a scalar load to vector.
5095 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5096                                ArrayRef<int> Mask, MVT VT) {
5097   if (!VT.is128BitVector())
5098     return false;
5099
5100   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5101     return false;
5102   // Is V2 is a vector load, don't do this transformation. We will try to use
5103   // load folding shufps op.
5104   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5105     return false;
5106
5107   unsigned NumElems = VT.getVectorNumElements();
5108
5109   if (NumElems != 2 && NumElems != 4)
5110     return false;
5111   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5112     if (!isUndefOrEqual(Mask[i], i))
5113       return false;
5114   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5115     if (!isUndefOrEqual(Mask[i], i+NumElems))
5116       return false;
5117   return true;
5118 }
5119
5120 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5121 /// to an zero vector.
5122 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5123 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5124   SDValue V1 = N->getOperand(0);
5125   SDValue V2 = N->getOperand(1);
5126   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5127   for (unsigned i = 0; i != NumElems; ++i) {
5128     int Idx = N->getMaskElt(i);
5129     if (Idx >= (int)NumElems) {
5130       unsigned Opc = V2.getOpcode();
5131       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5132         continue;
5133       if (Opc != ISD::BUILD_VECTOR ||
5134           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5135         return false;
5136     } else if (Idx >= 0) {
5137       unsigned Opc = V1.getOpcode();
5138       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5139         continue;
5140       if (Opc != ISD::BUILD_VECTOR ||
5141           !X86::isZeroNode(V1.getOperand(Idx)))
5142         return false;
5143     }
5144   }
5145   return true;
5146 }
5147
5148 /// getZeroVector - Returns a vector of specified type with all zero elements.
5149 ///
5150 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5151                              SelectionDAG &DAG, SDLoc dl) {
5152   assert(VT.isVector() && "Expected a vector type");
5153
5154   // Always build SSE zero vectors as <4 x i32> bitcasted
5155   // to their dest type. This ensures they get CSE'd.
5156   SDValue Vec;
5157   if (VT.is128BitVector()) {  // SSE
5158     if (Subtarget->hasSSE2()) {  // SSE2
5159       SDValue Cst = DAG.getConstant(0, MVT::i32);
5160       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5161     } else { // SSE1
5162       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5163       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5164     }
5165   } else if (VT.is256BitVector()) { // AVX
5166     if (Subtarget->hasInt256()) { // AVX2
5167       SDValue Cst = DAG.getConstant(0, MVT::i32);
5168       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5169       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5170     } else {
5171       // 256-bit logic and arithmetic instructions in AVX are all
5172       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5173       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5174       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5175       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5176     }
5177   } else if (VT.is512BitVector()) { // AVX-512
5178       SDValue Cst = DAG.getConstant(0, MVT::i32);
5179       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5180                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5181       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5182   } else if (VT.getScalarType() == MVT::i1) {
5183     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5184     SDValue Cst = DAG.getConstant(0, MVT::i1);
5185     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5186     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5187   } else
5188     llvm_unreachable("Unexpected vector type");
5189
5190   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5191 }
5192
5193 /// getOnesVector - Returns a vector of specified type with all bits set.
5194 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5195 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5196 /// Then bitcast to their original type, ensuring they get CSE'd.
5197 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5198                              SDLoc dl) {
5199   assert(VT.isVector() && "Expected a vector type");
5200
5201   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5202   SDValue Vec;
5203   if (VT.is256BitVector()) {
5204     if (HasInt256) { // AVX2
5205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5207     } else { // AVX
5208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5209       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5210     }
5211   } else if (VT.is128BitVector()) {
5212     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5213   } else
5214     llvm_unreachable("Unexpected vector type");
5215
5216   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5217 }
5218
5219 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5220 /// that point to V2 points to its first element.
5221 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5222   for (unsigned i = 0; i != NumElems; ++i) {
5223     if (Mask[i] > (int)NumElems) {
5224       Mask[i] = NumElems;
5225     }
5226   }
5227 }
5228
5229 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5230 /// operation of specified width.
5231 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5232                        SDValue V2) {
5233   unsigned NumElems = VT.getVectorNumElements();
5234   SmallVector<int, 8> Mask;
5235   Mask.push_back(NumElems);
5236   for (unsigned i = 1; i != NumElems; ++i)
5237     Mask.push_back(i);
5238   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5239 }
5240
5241 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5242 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5243                           SDValue V2) {
5244   unsigned NumElems = VT.getVectorNumElements();
5245   SmallVector<int, 8> Mask;
5246   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5247     Mask.push_back(i);
5248     Mask.push_back(i + NumElems);
5249   }
5250   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5251 }
5252
5253 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5254 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5255                           SDValue V2) {
5256   unsigned NumElems = VT.getVectorNumElements();
5257   SmallVector<int, 8> Mask;
5258   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5259     Mask.push_back(i + Half);
5260     Mask.push_back(i + NumElems + Half);
5261   }
5262   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5263 }
5264
5265 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5266 // a generic shuffle instruction because the target has no such instructions.
5267 // Generate shuffles which repeat i16 and i8 several times until they can be
5268 // represented by v4f32 and then be manipulated by target suported shuffles.
5269 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5270   MVT VT = V.getSimpleValueType();
5271   int NumElems = VT.getVectorNumElements();
5272   SDLoc dl(V);
5273
5274   while (NumElems > 4) {
5275     if (EltNo < NumElems/2) {
5276       V = getUnpackl(DAG, dl, VT, V, V);
5277     } else {
5278       V = getUnpackh(DAG, dl, VT, V, V);
5279       EltNo -= NumElems/2;
5280     }
5281     NumElems >>= 1;
5282   }
5283   return V;
5284 }
5285
5286 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5287 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5288   MVT VT = V.getSimpleValueType();
5289   SDLoc dl(V);
5290
5291   if (VT.is128BitVector()) {
5292     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5293     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5294     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5295                              &SplatMask[0]);
5296   } else if (VT.is256BitVector()) {
5297     // To use VPERMILPS to splat scalars, the second half of indicies must
5298     // refer to the higher part, which is a duplication of the lower one,
5299     // because VPERMILPS can only handle in-lane permutations.
5300     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5301                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5302
5303     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5304     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5305                              &SplatMask[0]);
5306   } else
5307     llvm_unreachable("Vector size not supported");
5308
5309   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5310 }
5311
5312 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5313 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5314   MVT SrcVT = SV->getSimpleValueType(0);
5315   SDValue V1 = SV->getOperand(0);
5316   SDLoc dl(SV);
5317
5318   int EltNo = SV->getSplatIndex();
5319   int NumElems = SrcVT.getVectorNumElements();
5320   bool Is256BitVec = SrcVT.is256BitVector();
5321
5322   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5323          "Unknown how to promote splat for type");
5324
5325   // Extract the 128-bit part containing the splat element and update
5326   // the splat element index when it refers to the higher register.
5327   if (Is256BitVec) {
5328     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5329     if (EltNo >= NumElems/2)
5330       EltNo -= NumElems/2;
5331   }
5332
5333   // All i16 and i8 vector types can't be used directly by a generic shuffle
5334   // instruction because the target has no such instruction. Generate shuffles
5335   // which repeat i16 and i8 several times until they fit in i32, and then can
5336   // be manipulated by target suported shuffles.
5337   MVT EltVT = SrcVT.getVectorElementType();
5338   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5339     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5340
5341   // Recreate the 256-bit vector and place the same 128-bit vector
5342   // into the low and high part. This is necessary because we want
5343   // to use VPERM* to shuffle the vectors
5344   if (Is256BitVec) {
5345     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5346   }
5347
5348   return getLegalSplat(DAG, V1, EltNo);
5349 }
5350
5351 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5352 /// vector of zero or undef vector.  This produces a shuffle where the low
5353 /// element of V2 is swizzled into the zero/undef vector, landing at element
5354 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5355 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5356                                            bool IsZero,
5357                                            const X86Subtarget *Subtarget,
5358                                            SelectionDAG &DAG) {
5359   MVT VT = V2.getSimpleValueType();
5360   SDValue V1 = IsZero
5361     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5362   unsigned NumElems = VT.getVectorNumElements();
5363   SmallVector<int, 16> MaskVec;
5364   for (unsigned i = 0; i != NumElems; ++i)
5365     // If this is the insertion idx, put the low elt of V2 here.
5366     MaskVec.push_back(i == Idx ? NumElems : i);
5367   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5368 }
5369
5370 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5371 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5372 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5373 /// shuffles which use a single input multiple times, and in those cases it will
5374 /// adjust the mask to only have indices within that single input.
5375 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5376                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5377   unsigned NumElems = VT.getVectorNumElements();
5378   SDValue ImmN;
5379
5380   IsUnary = false;
5381   bool IsFakeUnary = false;
5382   switch(N->getOpcode()) {
5383   case X86ISD::BLENDI:
5384     ImmN = N->getOperand(N->getNumOperands()-1);
5385     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5386     break;
5387   case X86ISD::SHUFP:
5388     ImmN = N->getOperand(N->getNumOperands()-1);
5389     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5390     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5391     break;
5392   case X86ISD::UNPCKH:
5393     DecodeUNPCKHMask(VT, Mask);
5394     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5395     break;
5396   case X86ISD::UNPCKL:
5397     DecodeUNPCKLMask(VT, Mask);
5398     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5399     break;
5400   case X86ISD::MOVHLPS:
5401     DecodeMOVHLPSMask(NumElems, Mask);
5402     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5403     break;
5404   case X86ISD::MOVLHPS:
5405     DecodeMOVLHPSMask(NumElems, Mask);
5406     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5407     break;
5408   case X86ISD::PALIGNR:
5409     ImmN = N->getOperand(N->getNumOperands()-1);
5410     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411     break;
5412   case X86ISD::PSHUFD:
5413   case X86ISD::VPERMILPI:
5414     ImmN = N->getOperand(N->getNumOperands()-1);
5415     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5416     IsUnary = true;
5417     break;
5418   case X86ISD::PSHUFHW:
5419     ImmN = N->getOperand(N->getNumOperands()-1);
5420     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5421     IsUnary = true;
5422     break;
5423   case X86ISD::PSHUFLW:
5424     ImmN = N->getOperand(N->getNumOperands()-1);
5425     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5426     IsUnary = true;
5427     break;
5428   case X86ISD::PSHUFB: {
5429     IsUnary = true;
5430     SDValue MaskNode = N->getOperand(1);
5431     while (MaskNode->getOpcode() == ISD::BITCAST)
5432       MaskNode = MaskNode->getOperand(0);
5433
5434     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5435       // If we have a build-vector, then things are easy.
5436       EVT VT = MaskNode.getValueType();
5437       assert(VT.isVector() &&
5438              "Can't produce a non-vector with a build_vector!");
5439       if (!VT.isInteger())
5440         return false;
5441
5442       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5443
5444       SmallVector<uint64_t, 32> RawMask;
5445       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5446         SDValue Op = MaskNode->getOperand(i);
5447         if (Op->getOpcode() == ISD::UNDEF) {
5448           RawMask.push_back((uint64_t)SM_SentinelUndef);
5449           continue;
5450         }
5451         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5452         if (!CN)
5453           return false;
5454         APInt MaskElement = CN->getAPIntValue();
5455
5456         // We now have to decode the element which could be any integer size and
5457         // extract each byte of it.
5458         for (int j = 0; j < NumBytesPerElement; ++j) {
5459           // Note that this is x86 and so always little endian: the low byte is
5460           // the first byte of the mask.
5461           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5462           MaskElement = MaskElement.lshr(8);
5463         }
5464       }
5465       DecodePSHUFBMask(RawMask, Mask);
5466       break;
5467     }
5468
5469     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5470     if (!MaskLoad)
5471       return false;
5472
5473     SDValue Ptr = MaskLoad->getBasePtr();
5474     if (Ptr->getOpcode() == X86ISD::Wrapper)
5475       Ptr = Ptr->getOperand(0);
5476
5477     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5478     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5479       return false;
5480
5481     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5482       DecodePSHUFBMask(C, Mask);
5483       break;
5484     }
5485
5486     return false;
5487   }
5488   case X86ISD::VPERMI:
5489     ImmN = N->getOperand(N->getNumOperands()-1);
5490     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5491     IsUnary = true;
5492     break;
5493   case X86ISD::MOVSS:
5494   case X86ISD::MOVSD:
5495     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5496     break;
5497   case X86ISD::VPERM2X128:
5498     ImmN = N->getOperand(N->getNumOperands()-1);
5499     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5500     if (Mask.empty()) return false;
5501     break;
5502   case X86ISD::MOVSLDUP:
5503     DecodeMOVSLDUPMask(VT, Mask);
5504     IsUnary = true;
5505     break;
5506   case X86ISD::MOVSHDUP:
5507     DecodeMOVSHDUPMask(VT, Mask);
5508     IsUnary = true;
5509     break;
5510   case X86ISD::MOVDDUP:
5511     DecodeMOVDDUPMask(VT, Mask);
5512     IsUnary = true;
5513     break;
5514   case X86ISD::MOVLHPD:
5515   case X86ISD::MOVLPD:
5516   case X86ISD::MOVLPS:
5517     // Not yet implemented
5518     return false;
5519   default: llvm_unreachable("unknown target shuffle node");
5520   }
5521
5522   // If we have a fake unary shuffle, the shuffle mask is spread across two
5523   // inputs that are actually the same node. Re-map the mask to always point
5524   // into the first input.
5525   if (IsFakeUnary)
5526     for (int &M : Mask)
5527       if (M >= (int)Mask.size())
5528         M -= Mask.size();
5529
5530   return true;
5531 }
5532
5533 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5534 /// element of the result of the vector shuffle.
5535 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5536                                    unsigned Depth) {
5537   if (Depth == 6)
5538     return SDValue();  // Limit search depth.
5539
5540   SDValue V = SDValue(N, 0);
5541   EVT VT = V.getValueType();
5542   unsigned Opcode = V.getOpcode();
5543
5544   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5545   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5546     int Elt = SV->getMaskElt(Index);
5547
5548     if (Elt < 0)
5549       return DAG.getUNDEF(VT.getVectorElementType());
5550
5551     unsigned NumElems = VT.getVectorNumElements();
5552     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5553                                          : SV->getOperand(1);
5554     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5555   }
5556
5557   // Recurse into target specific vector shuffles to find scalars.
5558   if (isTargetShuffle(Opcode)) {
5559     MVT ShufVT = V.getSimpleValueType();
5560     unsigned NumElems = ShufVT.getVectorNumElements();
5561     SmallVector<int, 16> ShuffleMask;
5562     bool IsUnary;
5563
5564     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5565       return SDValue();
5566
5567     int Elt = ShuffleMask[Index];
5568     if (Elt < 0)
5569       return DAG.getUNDEF(ShufVT.getVectorElementType());
5570
5571     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5572                                          : N->getOperand(1);
5573     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5574                                Depth+1);
5575   }
5576
5577   // Actual nodes that may contain scalar elements
5578   if (Opcode == ISD::BITCAST) {
5579     V = V.getOperand(0);
5580     EVT SrcVT = V.getValueType();
5581     unsigned NumElems = VT.getVectorNumElements();
5582
5583     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5584       return SDValue();
5585   }
5586
5587   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5588     return (Index == 0) ? V.getOperand(0)
5589                         : DAG.getUNDEF(VT.getVectorElementType());
5590
5591   if (V.getOpcode() == ISD::BUILD_VECTOR)
5592     return V.getOperand(Index);
5593
5594   return SDValue();
5595 }
5596
5597 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5598 /// shuffle operation which come from a consecutively from a zero. The
5599 /// search can start in two different directions, from left or right.
5600 /// We count undefs as zeros until PreferredNum is reached.
5601 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5602                                          unsigned NumElems, bool ZerosFromLeft,
5603                                          SelectionDAG &DAG,
5604                                          unsigned PreferredNum = -1U) {
5605   unsigned NumZeros = 0;
5606   for (unsigned i = 0; i != NumElems; ++i) {
5607     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5608     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5609     if (!Elt.getNode())
5610       break;
5611
5612     if (X86::isZeroNode(Elt))
5613       ++NumZeros;
5614     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5615       NumZeros = std::min(NumZeros + 1, PreferredNum);
5616     else
5617       break;
5618   }
5619
5620   return NumZeros;
5621 }
5622
5623 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5624 /// correspond consecutively to elements from one of the vector operands,
5625 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5626 static
5627 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5628                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5629                               unsigned NumElems, unsigned &OpNum) {
5630   bool SeenV1 = false;
5631   bool SeenV2 = false;
5632
5633   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5634     int Idx = SVOp->getMaskElt(i);
5635     // Ignore undef indicies
5636     if (Idx < 0)
5637       continue;
5638
5639     if (Idx < (int)NumElems)
5640       SeenV1 = true;
5641     else
5642       SeenV2 = true;
5643
5644     // Only accept consecutive elements from the same vector
5645     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5646       return false;
5647   }
5648
5649   OpNum = SeenV1 ? 0 : 1;
5650   return true;
5651 }
5652
5653 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5654 /// logical left shift of a vector.
5655 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5656                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5657   unsigned NumElems =
5658     SVOp->getSimpleValueType(0).getVectorNumElements();
5659   unsigned NumZeros = getNumOfConsecutiveZeros(
5660       SVOp, NumElems, false /* check zeros from right */, DAG,
5661       SVOp->getMaskElt(0));
5662   unsigned OpSrc;
5663
5664   if (!NumZeros)
5665     return false;
5666
5667   // Considering the elements in the mask that are not consecutive zeros,
5668   // check if they consecutively come from only one of the source vectors.
5669   //
5670   //               V1 = {X, A, B, C}     0
5671   //                         \  \  \    /
5672   //   vector_shuffle V1, V2 <1, 2, 3, X>
5673   //
5674   if (!isShuffleMaskConsecutive(SVOp,
5675             0,                   // Mask Start Index
5676             NumElems-NumZeros,   // Mask End Index(exclusive)
5677             NumZeros,            // Where to start looking in the src vector
5678             NumElems,            // Number of elements in vector
5679             OpSrc))              // Which source operand ?
5680     return false;
5681
5682   isLeft = false;
5683   ShAmt = NumZeros;
5684   ShVal = SVOp->getOperand(OpSrc);
5685   return true;
5686 }
5687
5688 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5689 /// logical left shift of a vector.
5690 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5691                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5692   unsigned NumElems =
5693     SVOp->getSimpleValueType(0).getVectorNumElements();
5694   unsigned NumZeros = getNumOfConsecutiveZeros(
5695       SVOp, NumElems, true /* check zeros from left */, DAG,
5696       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5697   unsigned OpSrc;
5698
5699   if (!NumZeros)
5700     return false;
5701
5702   // Considering the elements in the mask that are not consecutive zeros,
5703   // check if they consecutively come from only one of the source vectors.
5704   //
5705   //                           0    { A, B, X, X } = V2
5706   //                          / \    /  /
5707   //   vector_shuffle V1, V2 <X, X, 4, 5>
5708   //
5709   if (!isShuffleMaskConsecutive(SVOp,
5710             NumZeros,     // Mask Start Index
5711             NumElems,     // Mask End Index(exclusive)
5712             0,            // Where to start looking in the src vector
5713             NumElems,     // Number of elements in vector
5714             OpSrc))       // Which source operand ?
5715     return false;
5716
5717   isLeft = true;
5718   ShAmt = NumZeros;
5719   ShVal = SVOp->getOperand(OpSrc);
5720   return true;
5721 }
5722
5723 /// isVectorShift - Returns true if the shuffle can be implemented as a
5724 /// logical left or right shift of a vector.
5725 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5726                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5727   // Although the logic below support any bitwidth size, there are no
5728   // shift instructions which handle more than 128-bit vectors.
5729   if (!SVOp->getSimpleValueType(0).is128BitVector())
5730     return false;
5731
5732   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5733       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5734     return true;
5735
5736   return false;
5737 }
5738
5739 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5740 ///
5741 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5742                                        unsigned NumNonZero, unsigned NumZero,
5743                                        SelectionDAG &DAG,
5744                                        const X86Subtarget* Subtarget,
5745                                        const TargetLowering &TLI) {
5746   if (NumNonZero > 8)
5747     return SDValue();
5748
5749   SDLoc dl(Op);
5750   SDValue V;
5751   bool First = true;
5752   for (unsigned i = 0; i < 16; ++i) {
5753     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5754     if (ThisIsNonZero && First) {
5755       if (NumZero)
5756         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5757       else
5758         V = DAG.getUNDEF(MVT::v8i16);
5759       First = false;
5760     }
5761
5762     if ((i & 1) != 0) {
5763       SDValue ThisElt, LastElt;
5764       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5765       if (LastIsNonZero) {
5766         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5767                               MVT::i16, Op.getOperand(i-1));
5768       }
5769       if (ThisIsNonZero) {
5770         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5771         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5772                               ThisElt, DAG.getConstant(8, MVT::i8));
5773         if (LastIsNonZero)
5774           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5775       } else
5776         ThisElt = LastElt;
5777
5778       if (ThisElt.getNode())
5779         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5780                         DAG.getIntPtrConstant(i/2));
5781     }
5782   }
5783
5784   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5785 }
5786
5787 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5788 ///
5789 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5790                                      unsigned NumNonZero, unsigned NumZero,
5791                                      SelectionDAG &DAG,
5792                                      const X86Subtarget* Subtarget,
5793                                      const TargetLowering &TLI) {
5794   if (NumNonZero > 4)
5795     return SDValue();
5796
5797   SDLoc dl(Op);
5798   SDValue V;
5799   bool First = true;
5800   for (unsigned i = 0; i < 8; ++i) {
5801     bool isNonZero = (NonZeros & (1 << i)) != 0;
5802     if (isNonZero) {
5803       if (First) {
5804         if (NumZero)
5805           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5806         else
5807           V = DAG.getUNDEF(MVT::v8i16);
5808         First = false;
5809       }
5810       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5811                       MVT::v8i16, V, Op.getOperand(i),
5812                       DAG.getIntPtrConstant(i));
5813     }
5814   }
5815
5816   return V;
5817 }
5818
5819 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5820 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5821                                      const X86Subtarget *Subtarget,
5822                                      const TargetLowering &TLI) {
5823   // Find all zeroable elements.
5824   bool Zeroable[4];
5825   for (int i=0; i < 4; ++i) {
5826     SDValue Elt = Op->getOperand(i);
5827     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5828   }
5829   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5830                        [](bool M) { return !M; }) > 1 &&
5831          "We expect at least two non-zero elements!");
5832
5833   // We only know how to deal with build_vector nodes where elements are either
5834   // zeroable or extract_vector_elt with constant index.
5835   SDValue FirstNonZero;
5836   unsigned FirstNonZeroIdx;
5837   for (unsigned i=0; i < 4; ++i) {
5838     if (Zeroable[i])
5839       continue;
5840     SDValue Elt = Op->getOperand(i);
5841     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5842         !isa<ConstantSDNode>(Elt.getOperand(1)))
5843       return SDValue();
5844     // Make sure that this node is extracting from a 128-bit vector.
5845     MVT VT = Elt.getOperand(0).getSimpleValueType();
5846     if (!VT.is128BitVector())
5847       return SDValue();
5848     if (!FirstNonZero.getNode()) {
5849       FirstNonZero = Elt;
5850       FirstNonZeroIdx = i;
5851     }
5852   }
5853
5854   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5855   SDValue V1 = FirstNonZero.getOperand(0);
5856   MVT VT = V1.getSimpleValueType();
5857
5858   // See if this build_vector can be lowered as a blend with zero.
5859   SDValue Elt;
5860   unsigned EltMaskIdx, EltIdx;
5861   int Mask[4];
5862   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5863     if (Zeroable[EltIdx]) {
5864       // The zero vector will be on the right hand side.
5865       Mask[EltIdx] = EltIdx+4;
5866       continue;
5867     }
5868
5869     Elt = Op->getOperand(EltIdx);
5870     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5871     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5872     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5873       break;
5874     Mask[EltIdx] = EltIdx;
5875   }
5876
5877   if (EltIdx == 4) {
5878     // Let the shuffle legalizer deal with blend operations.
5879     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5880     if (V1.getSimpleValueType() != VT)
5881       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5882     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5883   }
5884
5885   // See if we can lower this build_vector to a INSERTPS.
5886   if (!Subtarget->hasSSE41())
5887     return SDValue();
5888
5889   SDValue V2 = Elt.getOperand(0);
5890   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5891     V1 = SDValue();
5892
5893   bool CanFold = true;
5894   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5895     if (Zeroable[i])
5896       continue;
5897
5898     SDValue Current = Op->getOperand(i);
5899     SDValue SrcVector = Current->getOperand(0);
5900     if (!V1.getNode())
5901       V1 = SrcVector;
5902     CanFold = SrcVector == V1 &&
5903       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5904   }
5905
5906   if (!CanFold)
5907     return SDValue();
5908
5909   assert(V1.getNode() && "Expected at least two non-zero elements!");
5910   if (V1.getSimpleValueType() != MVT::v4f32)
5911     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5912   if (V2.getSimpleValueType() != MVT::v4f32)
5913     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5914
5915   // Ok, we can emit an INSERTPS instruction.
5916   unsigned ZMask = 0;
5917   for (int i = 0; i < 4; ++i)
5918     if (Zeroable[i])
5919       ZMask |= 1 << i;
5920
5921   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5922   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5923   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5924                                DAG.getIntPtrConstant(InsertPSMask));
5925   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5926 }
5927
5928 /// Return a vector logical shift node.
5929 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5930                          unsigned NumBits, SelectionDAG &DAG,
5931                          const TargetLowering &TLI, SDLoc dl) {
5932   assert(VT.is128BitVector() && "Unknown type for VShift");
5933   MVT ShVT = MVT::v2i64;
5934   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5935   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5936   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5937   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5938   return DAG.getNode(ISD::BITCAST, dl, VT,
5939                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5940 }
5941
5942 static SDValue
5943 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5944
5945   // Check if the scalar load can be widened into a vector load. And if
5946   // the address is "base + cst" see if the cst can be "absorbed" into
5947   // the shuffle mask.
5948   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5949     SDValue Ptr = LD->getBasePtr();
5950     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5951       return SDValue();
5952     EVT PVT = LD->getValueType(0);
5953     if (PVT != MVT::i32 && PVT != MVT::f32)
5954       return SDValue();
5955
5956     int FI = -1;
5957     int64_t Offset = 0;
5958     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5959       FI = FINode->getIndex();
5960       Offset = 0;
5961     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5962                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5963       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5964       Offset = Ptr.getConstantOperandVal(1);
5965       Ptr = Ptr.getOperand(0);
5966     } else {
5967       return SDValue();
5968     }
5969
5970     // FIXME: 256-bit vector instructions don't require a strict alignment,
5971     // improve this code to support it better.
5972     unsigned RequiredAlign = VT.getSizeInBits()/8;
5973     SDValue Chain = LD->getChain();
5974     // Make sure the stack object alignment is at least 16 or 32.
5975     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5976     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5977       if (MFI->isFixedObjectIndex(FI)) {
5978         // Can't change the alignment. FIXME: It's possible to compute
5979         // the exact stack offset and reference FI + adjust offset instead.
5980         // If someone *really* cares about this. That's the way to implement it.
5981         return SDValue();
5982       } else {
5983         MFI->setObjectAlignment(FI, RequiredAlign);
5984       }
5985     }
5986
5987     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5988     // Ptr + (Offset & ~15).
5989     if (Offset < 0)
5990       return SDValue();
5991     if ((Offset % RequiredAlign) & 3)
5992       return SDValue();
5993     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5994     if (StartOffset)
5995       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5996                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5997
5998     int EltNo = (Offset - StartOffset) >> 2;
5999     unsigned NumElems = VT.getVectorNumElements();
6000
6001     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6002     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6003                              LD->getPointerInfo().getWithOffset(StartOffset),
6004                              false, false, false, 0);
6005
6006     SmallVector<int, 8> Mask;
6007     for (unsigned i = 0; i != NumElems; ++i)
6008       Mask.push_back(EltNo);
6009
6010     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6011   }
6012
6013   return SDValue();
6014 }
6015
6016 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6017 /// elements can be replaced by a single large load which has the same value as
6018 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6019 ///
6020 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6021 ///
6022 /// FIXME: we'd also like to handle the case where the last elements are zero
6023 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6024 /// There's even a handy isZeroNode for that purpose.
6025 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6026                                         SDLoc &DL, SelectionDAG &DAG,
6027                                         bool isAfterLegalize) {
6028   unsigned NumElems = Elts.size();
6029
6030   LoadSDNode *LDBase = nullptr;
6031   unsigned LastLoadedElt = -1U;
6032
6033   // For each element in the initializer, see if we've found a load or an undef.
6034   // If we don't find an initial load element, or later load elements are
6035   // non-consecutive, bail out.
6036   for (unsigned i = 0; i < NumElems; ++i) {
6037     SDValue Elt = Elts[i];
6038     // Look through a bitcast.
6039     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6040       Elt = Elt.getOperand(0);
6041     if (!Elt.getNode() ||
6042         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6043       return SDValue();
6044     if (!LDBase) {
6045       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6046         return SDValue();
6047       LDBase = cast<LoadSDNode>(Elt.getNode());
6048       LastLoadedElt = i;
6049       continue;
6050     }
6051     if (Elt.getOpcode() == ISD::UNDEF)
6052       continue;
6053
6054     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6055     EVT LdVT = Elt.getValueType();
6056     // Each loaded element must be the correct fractional portion of the
6057     // requested vector load.
6058     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6059       return SDValue();
6060     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6061       return SDValue();
6062     LastLoadedElt = i;
6063   }
6064
6065   // If we have found an entire vector of loads and undefs, then return a large
6066   // load of the entire vector width starting at the base pointer.  If we found
6067   // consecutive loads for the low half, generate a vzext_load node.
6068   if (LastLoadedElt == NumElems - 1) {
6069     assert(LDBase && "Did not find base load for merging consecutive loads");
6070     EVT EltVT = LDBase->getValueType(0);
6071     // Ensure that the input vector size for the merged loads matches the
6072     // cumulative size of the input elements.
6073     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6074       return SDValue();
6075
6076     if (isAfterLegalize &&
6077         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6078       return SDValue();
6079
6080     SDValue NewLd = SDValue();
6081
6082     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6083                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6084                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6085                         LDBase->getAlignment());
6086
6087     if (LDBase->hasAnyUseOfValue(1)) {
6088       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6089                                      SDValue(LDBase, 1),
6090                                      SDValue(NewLd.getNode(), 1));
6091       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6092       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6093                              SDValue(NewLd.getNode(), 1));
6094     }
6095
6096     return NewLd;
6097   }
6098
6099   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6100   //of a v4i32 / v4f32. It's probably worth generalizing.
6101   EVT EltVT = VT.getVectorElementType();
6102   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6103       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6104     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6105     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6106     SDValue ResNode =
6107         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6108                                 LDBase->getPointerInfo(),
6109                                 LDBase->getAlignment(),
6110                                 false/*isVolatile*/, true/*ReadMem*/,
6111                                 false/*WriteMem*/);
6112
6113     // Make sure the newly-created LOAD is in the same position as LDBase in
6114     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6115     // update uses of LDBase's output chain to use the TokenFactor.
6116     if (LDBase->hasAnyUseOfValue(1)) {
6117       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6118                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6119       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6120       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6121                              SDValue(ResNode.getNode(), 1));
6122     }
6123
6124     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6125   }
6126   return SDValue();
6127 }
6128
6129 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6130 /// to generate a splat value for the following cases:
6131 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6132 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6133 /// a scalar load, or a constant.
6134 /// The VBROADCAST node is returned when a pattern is found,
6135 /// or SDValue() otherwise.
6136 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6137                                     SelectionDAG &DAG) {
6138   // VBROADCAST requires AVX.
6139   // TODO: Splats could be generated for non-AVX CPUs using SSE
6140   // instructions, but there's less potential gain for only 128-bit vectors.
6141   if (!Subtarget->hasAVX())
6142     return SDValue();
6143
6144   MVT VT = Op.getSimpleValueType();
6145   SDLoc dl(Op);
6146
6147   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6148          "Unsupported vector type for broadcast.");
6149
6150   SDValue Ld;
6151   bool ConstSplatVal;
6152
6153   switch (Op.getOpcode()) {
6154     default:
6155       // Unknown pattern found.
6156       return SDValue();
6157
6158     case ISD::BUILD_VECTOR: {
6159       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6160       BitVector UndefElements;
6161       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6162
6163       // We need a splat of a single value to use broadcast, and it doesn't
6164       // make any sense if the value is only in one element of the vector.
6165       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6166         return SDValue();
6167
6168       Ld = Splat;
6169       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6170                        Ld.getOpcode() == ISD::ConstantFP);
6171
6172       // Make sure that all of the users of a non-constant load are from the
6173       // BUILD_VECTOR node.
6174       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6175         return SDValue();
6176       break;
6177     }
6178
6179     case ISD::VECTOR_SHUFFLE: {
6180       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6181
6182       // Shuffles must have a splat mask where the first element is
6183       // broadcasted.
6184       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6185         return SDValue();
6186
6187       SDValue Sc = Op.getOperand(0);
6188       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6189           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6190
6191         if (!Subtarget->hasInt256())
6192           return SDValue();
6193
6194         // Use the register form of the broadcast instruction available on AVX2.
6195         if (VT.getSizeInBits() >= 256)
6196           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6197         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6198       }
6199
6200       Ld = Sc.getOperand(0);
6201       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6202                        Ld.getOpcode() == ISD::ConstantFP);
6203
6204       // The scalar_to_vector node and the suspected
6205       // load node must have exactly one user.
6206       // Constants may have multiple users.
6207
6208       // AVX-512 has register version of the broadcast
6209       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6210         Ld.getValueType().getSizeInBits() >= 32;
6211       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6212           !hasRegVer))
6213         return SDValue();
6214       break;
6215     }
6216   }
6217
6218   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6219   bool IsGE256 = (VT.getSizeInBits() >= 256);
6220
6221   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6222   // instruction to save 8 or more bytes of constant pool data.
6223   // TODO: If multiple splats are generated to load the same constant,
6224   // it may be detrimental to overall size. There needs to be a way to detect
6225   // that condition to know if this is truly a size win.
6226   const Function *F = DAG.getMachineFunction().getFunction();
6227   bool OptForSize = F->getAttributes().
6228     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6229
6230   // Handle broadcasting a single constant scalar from the constant pool
6231   // into a vector.
6232   // On Sandybridge (no AVX2), it is still better to load a constant vector
6233   // from the constant pool and not to broadcast it from a scalar.
6234   // But override that restriction when optimizing for size.
6235   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6236   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6237     EVT CVT = Ld.getValueType();
6238     assert(!CVT.isVector() && "Must not broadcast a vector type");
6239
6240     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6241     // For size optimization, also splat v2f64 and v2i64, and for size opt
6242     // with AVX2, also splat i8 and i16.
6243     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6244     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6245         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6246       const Constant *C = nullptr;
6247       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6248         C = CI->getConstantIntValue();
6249       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6250         C = CF->getConstantFPValue();
6251
6252       assert(C && "Invalid constant type");
6253
6254       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6255       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6256       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6257       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6258                        MachinePointerInfo::getConstantPool(),
6259                        false, false, false, Alignment);
6260
6261       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6262     }
6263   }
6264
6265   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6266
6267   // Handle AVX2 in-register broadcasts.
6268   if (!IsLoad && Subtarget->hasInt256() &&
6269       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6270     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6271
6272   // The scalar source must be a normal load.
6273   if (!IsLoad)
6274     return SDValue();
6275
6276   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6277       (Subtarget->hasVLX() && ScalarSize == 64))
6278     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6279
6280   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6281   // double since there is no vbroadcastsd xmm
6282   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6283     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6284       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6285   }
6286
6287   // Unsupported broadcast.
6288   return SDValue();
6289 }
6290
6291 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6292 /// underlying vector and index.
6293 ///
6294 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6295 /// index.
6296 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6297                                          SDValue ExtIdx) {
6298   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6299   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6300     return Idx;
6301
6302   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6303   // lowered this:
6304   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6305   // to:
6306   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6307   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6308   //                           undef)
6309   //                       Constant<0>)
6310   // In this case the vector is the extract_subvector expression and the index
6311   // is 2, as specified by the shuffle.
6312   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6313   SDValue ShuffleVec = SVOp->getOperand(0);
6314   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6315   assert(ShuffleVecVT.getVectorElementType() ==
6316          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6317
6318   int ShuffleIdx = SVOp->getMaskElt(Idx);
6319   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6320     ExtractedFromVec = ShuffleVec;
6321     return ShuffleIdx;
6322   }
6323   return Idx;
6324 }
6325
6326 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6327   MVT VT = Op.getSimpleValueType();
6328
6329   // Skip if insert_vec_elt is not supported.
6330   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6331   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6332     return SDValue();
6333
6334   SDLoc DL(Op);
6335   unsigned NumElems = Op.getNumOperands();
6336
6337   SDValue VecIn1;
6338   SDValue VecIn2;
6339   SmallVector<unsigned, 4> InsertIndices;
6340   SmallVector<int, 8> Mask(NumElems, -1);
6341
6342   for (unsigned i = 0; i != NumElems; ++i) {
6343     unsigned Opc = Op.getOperand(i).getOpcode();
6344
6345     if (Opc == ISD::UNDEF)
6346       continue;
6347
6348     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6349       // Quit if more than 1 elements need inserting.
6350       if (InsertIndices.size() > 1)
6351         return SDValue();
6352
6353       InsertIndices.push_back(i);
6354       continue;
6355     }
6356
6357     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6358     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6359     // Quit if non-constant index.
6360     if (!isa<ConstantSDNode>(ExtIdx))
6361       return SDValue();
6362     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6363
6364     // Quit if extracted from vector of different type.
6365     if (ExtractedFromVec.getValueType() != VT)
6366       return SDValue();
6367
6368     if (!VecIn1.getNode())
6369       VecIn1 = ExtractedFromVec;
6370     else if (VecIn1 != ExtractedFromVec) {
6371       if (!VecIn2.getNode())
6372         VecIn2 = ExtractedFromVec;
6373       else if (VecIn2 != ExtractedFromVec)
6374         // Quit if more than 2 vectors to shuffle
6375         return SDValue();
6376     }
6377
6378     if (ExtractedFromVec == VecIn1)
6379       Mask[i] = Idx;
6380     else if (ExtractedFromVec == VecIn2)
6381       Mask[i] = Idx + NumElems;
6382   }
6383
6384   if (!VecIn1.getNode())
6385     return SDValue();
6386
6387   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6388   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6389   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6390     unsigned Idx = InsertIndices[i];
6391     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6392                      DAG.getIntPtrConstant(Idx));
6393   }
6394
6395   return NV;
6396 }
6397
6398 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6399 SDValue
6400 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6401
6402   MVT VT = Op.getSimpleValueType();
6403   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6404          "Unexpected type in LowerBUILD_VECTORvXi1!");
6405
6406   SDLoc dl(Op);
6407   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6408     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6409     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6410     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6411   }
6412
6413   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6414     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6415     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6416     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6417   }
6418
6419   bool AllContants = true;
6420   uint64_t Immediate = 0;
6421   int NonConstIdx = -1;
6422   bool IsSplat = true;
6423   unsigned NumNonConsts = 0;
6424   unsigned NumConsts = 0;
6425   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6426     SDValue In = Op.getOperand(idx);
6427     if (In.getOpcode() == ISD::UNDEF)
6428       continue;
6429     if (!isa<ConstantSDNode>(In)) {
6430       AllContants = false;
6431       NonConstIdx = idx;
6432       NumNonConsts++;
6433     } else {
6434       NumConsts++;
6435       if (cast<ConstantSDNode>(In)->getZExtValue())
6436       Immediate |= (1ULL << idx);
6437     }
6438     if (In != Op.getOperand(0))
6439       IsSplat = false;
6440   }
6441
6442   if (AllContants) {
6443     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6444       DAG.getConstant(Immediate, MVT::i16));
6445     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6446                        DAG.getIntPtrConstant(0));
6447   }
6448
6449   if (NumNonConsts == 1 && NonConstIdx != 0) {
6450     SDValue DstVec;
6451     if (NumConsts) {
6452       SDValue VecAsImm = DAG.getConstant(Immediate,
6453                                          MVT::getIntegerVT(VT.getSizeInBits()));
6454       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6455     }
6456     else
6457       DstVec = DAG.getUNDEF(VT);
6458     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6459                        Op.getOperand(NonConstIdx),
6460                        DAG.getIntPtrConstant(NonConstIdx));
6461   }
6462   if (!IsSplat && (NonConstIdx != 0))
6463     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6464   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6465   SDValue Select;
6466   if (IsSplat)
6467     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6468                           DAG.getConstant(-1, SelectVT),
6469                           DAG.getConstant(0, SelectVT));
6470   else
6471     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6472                          DAG.getConstant((Immediate | 1), SelectVT),
6473                          DAG.getConstant(Immediate, SelectVT));
6474   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6475 }
6476
6477 /// \brief Return true if \p N implements a horizontal binop and return the
6478 /// operands for the horizontal binop into V0 and V1.
6479 ///
6480 /// This is a helper function of PerformBUILD_VECTORCombine.
6481 /// This function checks that the build_vector \p N in input implements a
6482 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6483 /// operation to match.
6484 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6485 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6486 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6487 /// arithmetic sub.
6488 ///
6489 /// This function only analyzes elements of \p N whose indices are
6490 /// in range [BaseIdx, LastIdx).
6491 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6492                               SelectionDAG &DAG,
6493                               unsigned BaseIdx, unsigned LastIdx,
6494                               SDValue &V0, SDValue &V1) {
6495   EVT VT = N->getValueType(0);
6496
6497   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6498   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6499          "Invalid Vector in input!");
6500
6501   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6502   bool CanFold = true;
6503   unsigned ExpectedVExtractIdx = BaseIdx;
6504   unsigned NumElts = LastIdx - BaseIdx;
6505   V0 = DAG.getUNDEF(VT);
6506   V1 = DAG.getUNDEF(VT);
6507
6508   // Check if N implements a horizontal binop.
6509   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6510     SDValue Op = N->getOperand(i + BaseIdx);
6511
6512     // Skip UNDEFs.
6513     if (Op->getOpcode() == ISD::UNDEF) {
6514       // Update the expected vector extract index.
6515       if (i * 2 == NumElts)
6516         ExpectedVExtractIdx = BaseIdx;
6517       ExpectedVExtractIdx += 2;
6518       continue;
6519     }
6520
6521     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6522
6523     if (!CanFold)
6524       break;
6525
6526     SDValue Op0 = Op.getOperand(0);
6527     SDValue Op1 = Op.getOperand(1);
6528
6529     // Try to match the following pattern:
6530     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6531     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6532         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6533         Op0.getOperand(0) == Op1.getOperand(0) &&
6534         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6535         isa<ConstantSDNode>(Op1.getOperand(1)));
6536     if (!CanFold)
6537       break;
6538
6539     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6540     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6541
6542     if (i * 2 < NumElts) {
6543       if (V0.getOpcode() == ISD::UNDEF)
6544         V0 = Op0.getOperand(0);
6545     } else {
6546       if (V1.getOpcode() == ISD::UNDEF)
6547         V1 = Op0.getOperand(0);
6548       if (i * 2 == NumElts)
6549         ExpectedVExtractIdx = BaseIdx;
6550     }
6551
6552     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6553     if (I0 == ExpectedVExtractIdx)
6554       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6555     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6556       // Try to match the following dag sequence:
6557       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6558       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6559     } else
6560       CanFold = false;
6561
6562     ExpectedVExtractIdx += 2;
6563   }
6564
6565   return CanFold;
6566 }
6567
6568 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6569 /// a concat_vector.
6570 ///
6571 /// This is a helper function of PerformBUILD_VECTORCombine.
6572 /// This function expects two 256-bit vectors called V0 and V1.
6573 /// At first, each vector is split into two separate 128-bit vectors.
6574 /// Then, the resulting 128-bit vectors are used to implement two
6575 /// horizontal binary operations.
6576 ///
6577 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6578 ///
6579 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6580 /// the two new horizontal binop.
6581 /// When Mode is set, the first horizontal binop dag node would take as input
6582 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6583 /// horizontal binop dag node would take as input the lower 128-bit of V1
6584 /// and the upper 128-bit of V1.
6585 ///   Example:
6586 ///     HADD V0_LO, V0_HI
6587 ///     HADD V1_LO, V1_HI
6588 ///
6589 /// Otherwise, the first horizontal binop dag node takes as input the lower
6590 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6591 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6592 ///   Example:
6593 ///     HADD V0_LO, V1_LO
6594 ///     HADD V0_HI, V1_HI
6595 ///
6596 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6597 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6598 /// the upper 128-bits of the result.
6599 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6600                                      SDLoc DL, SelectionDAG &DAG,
6601                                      unsigned X86Opcode, bool Mode,
6602                                      bool isUndefLO, bool isUndefHI) {
6603   EVT VT = V0.getValueType();
6604   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6605          "Invalid nodes in input!");
6606
6607   unsigned NumElts = VT.getVectorNumElements();
6608   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6609   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6610   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6611   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6612   EVT NewVT = V0_LO.getValueType();
6613
6614   SDValue LO = DAG.getUNDEF(NewVT);
6615   SDValue HI = DAG.getUNDEF(NewVT);
6616
6617   if (Mode) {
6618     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6619     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6620       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6621     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6622       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6623   } else {
6624     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6625     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6626                        V1_LO->getOpcode() != ISD::UNDEF))
6627       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6628
6629     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6630                        V1_HI->getOpcode() != ISD::UNDEF))
6631       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6632   }
6633
6634   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6635 }
6636
6637 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6638 /// sequence of 'vadd + vsub + blendi'.
6639 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6640                            const X86Subtarget *Subtarget) {
6641   SDLoc DL(BV);
6642   EVT VT = BV->getValueType(0);
6643   unsigned NumElts = VT.getVectorNumElements();
6644   SDValue InVec0 = DAG.getUNDEF(VT);
6645   SDValue InVec1 = DAG.getUNDEF(VT);
6646
6647   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6648           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6649
6650   // Odd-numbered elements in the input build vector are obtained from
6651   // adding two integer/float elements.
6652   // Even-numbered elements in the input build vector are obtained from
6653   // subtracting two integer/float elements.
6654   unsigned ExpectedOpcode = ISD::FSUB;
6655   unsigned NextExpectedOpcode = ISD::FADD;
6656   bool AddFound = false;
6657   bool SubFound = false;
6658
6659   for (unsigned i = 0, e = NumElts; i != e; i++) {
6660     SDValue Op = BV->getOperand(i);
6661
6662     // Skip 'undef' values.
6663     unsigned Opcode = Op.getOpcode();
6664     if (Opcode == ISD::UNDEF) {
6665       std::swap(ExpectedOpcode, NextExpectedOpcode);
6666       continue;
6667     }
6668
6669     // Early exit if we found an unexpected opcode.
6670     if (Opcode != ExpectedOpcode)
6671       return SDValue();
6672
6673     SDValue Op0 = Op.getOperand(0);
6674     SDValue Op1 = Op.getOperand(1);
6675
6676     // Try to match the following pattern:
6677     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6678     // Early exit if we cannot match that sequence.
6679     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6680         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6681         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6682         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6683         Op0.getOperand(1) != Op1.getOperand(1))
6684       return SDValue();
6685
6686     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6687     if (I0 != i)
6688       return SDValue();
6689
6690     // We found a valid add/sub node. Update the information accordingly.
6691     if (i & 1)
6692       AddFound = true;
6693     else
6694       SubFound = true;
6695
6696     // Update InVec0 and InVec1.
6697     if (InVec0.getOpcode() == ISD::UNDEF)
6698       InVec0 = Op0.getOperand(0);
6699     if (InVec1.getOpcode() == ISD::UNDEF)
6700       InVec1 = Op1.getOperand(0);
6701
6702     // Make sure that operands in input to each add/sub node always
6703     // come from a same pair of vectors.
6704     if (InVec0 != Op0.getOperand(0)) {
6705       if (ExpectedOpcode == ISD::FSUB)
6706         return SDValue();
6707
6708       // FADD is commutable. Try to commute the operands
6709       // and then test again.
6710       std::swap(Op0, Op1);
6711       if (InVec0 != Op0.getOperand(0))
6712         return SDValue();
6713     }
6714
6715     if (InVec1 != Op1.getOperand(0))
6716       return SDValue();
6717
6718     // Update the pair of expected opcodes.
6719     std::swap(ExpectedOpcode, NextExpectedOpcode);
6720   }
6721
6722   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6723   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6724       InVec1.getOpcode() != ISD::UNDEF)
6725     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6726
6727   return SDValue();
6728 }
6729
6730 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6731                                           const X86Subtarget *Subtarget) {
6732   SDLoc DL(N);
6733   EVT VT = N->getValueType(0);
6734   unsigned NumElts = VT.getVectorNumElements();
6735   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6736   SDValue InVec0, InVec1;
6737
6738   // Try to match an ADDSUB.
6739   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6740       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6741     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6742     if (Value.getNode())
6743       return Value;
6744   }
6745
6746   // Try to match horizontal ADD/SUB.
6747   unsigned NumUndefsLO = 0;
6748   unsigned NumUndefsHI = 0;
6749   unsigned Half = NumElts/2;
6750
6751   // Count the number of UNDEF operands in the build_vector in input.
6752   for (unsigned i = 0, e = Half; i != e; ++i)
6753     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6754       NumUndefsLO++;
6755
6756   for (unsigned i = Half, e = NumElts; i != e; ++i)
6757     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6758       NumUndefsHI++;
6759
6760   // Early exit if this is either a build_vector of all UNDEFs or all the
6761   // operands but one are UNDEF.
6762   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6763     return SDValue();
6764
6765   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6766     // Try to match an SSE3 float HADD/HSUB.
6767     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6768       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6769
6770     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6771       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6772   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6773     // Try to match an SSSE3 integer HADD/HSUB.
6774     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6775       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6776
6777     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6778       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6779   }
6780
6781   if (!Subtarget->hasAVX())
6782     return SDValue();
6783
6784   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6785     // Try to match an AVX horizontal add/sub of packed single/double
6786     // precision floating point values from 256-bit vectors.
6787     SDValue InVec2, InVec3;
6788     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6789         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6790         ((InVec0.getOpcode() == ISD::UNDEF ||
6791           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6792         ((InVec1.getOpcode() == ISD::UNDEF ||
6793           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6794       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6795
6796     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6797         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6798         ((InVec0.getOpcode() == ISD::UNDEF ||
6799           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6800         ((InVec1.getOpcode() == ISD::UNDEF ||
6801           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6802       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6803   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6804     // Try to match an AVX2 horizontal add/sub of signed integers.
6805     SDValue InVec2, InVec3;
6806     unsigned X86Opcode;
6807     bool CanFold = true;
6808
6809     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6810         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6811         ((InVec0.getOpcode() == ISD::UNDEF ||
6812           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6813         ((InVec1.getOpcode() == ISD::UNDEF ||
6814           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6815       X86Opcode = X86ISD::HADD;
6816     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6817         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6818         ((InVec0.getOpcode() == ISD::UNDEF ||
6819           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6820         ((InVec1.getOpcode() == ISD::UNDEF ||
6821           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6822       X86Opcode = X86ISD::HSUB;
6823     else
6824       CanFold = false;
6825
6826     if (CanFold) {
6827       // Fold this build_vector into a single horizontal add/sub.
6828       // Do this only if the target has AVX2.
6829       if (Subtarget->hasAVX2())
6830         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6831
6832       // Do not try to expand this build_vector into a pair of horizontal
6833       // add/sub if we can emit a pair of scalar add/sub.
6834       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6835         return SDValue();
6836
6837       // Convert this build_vector into a pair of horizontal binop followed by
6838       // a concat vector.
6839       bool isUndefLO = NumUndefsLO == Half;
6840       bool isUndefHI = NumUndefsHI == Half;
6841       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6842                                    isUndefLO, isUndefHI);
6843     }
6844   }
6845
6846   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6847        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6848     unsigned X86Opcode;
6849     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6850       X86Opcode = X86ISD::HADD;
6851     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6852       X86Opcode = X86ISD::HSUB;
6853     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6854       X86Opcode = X86ISD::FHADD;
6855     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6856       X86Opcode = X86ISD::FHSUB;
6857     else
6858       return SDValue();
6859
6860     // Don't try to expand this build_vector into a pair of horizontal add/sub
6861     // if we can simply emit a pair of scalar add/sub.
6862     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6863       return SDValue();
6864
6865     // Convert this build_vector into two horizontal add/sub followed by
6866     // a concat vector.
6867     bool isUndefLO = NumUndefsLO == Half;
6868     bool isUndefHI = NumUndefsHI == Half;
6869     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6870                                  isUndefLO, isUndefHI);
6871   }
6872
6873   return SDValue();
6874 }
6875
6876 SDValue
6877 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6878   SDLoc dl(Op);
6879
6880   MVT VT = Op.getSimpleValueType();
6881   MVT ExtVT = VT.getVectorElementType();
6882   unsigned NumElems = Op.getNumOperands();
6883
6884   // Generate vectors for predicate vectors.
6885   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6886     return LowerBUILD_VECTORvXi1(Op, DAG);
6887
6888   // Vectors containing all zeros can be matched by pxor and xorps later
6889   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6890     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6891     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6892     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6893       return Op;
6894
6895     return getZeroVector(VT, Subtarget, DAG, dl);
6896   }
6897
6898   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6899   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6900   // vpcmpeqd on 256-bit vectors.
6901   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6902     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6903       return Op;
6904
6905     if (!VT.is512BitVector())
6906       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6907   }
6908
6909   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6910   if (Broadcast.getNode())
6911     return Broadcast;
6912
6913   unsigned EVTBits = ExtVT.getSizeInBits();
6914
6915   unsigned NumZero  = 0;
6916   unsigned NumNonZero = 0;
6917   unsigned NonZeros = 0;
6918   bool IsAllConstants = true;
6919   SmallSet<SDValue, 8> Values;
6920   for (unsigned i = 0; i < NumElems; ++i) {
6921     SDValue Elt = Op.getOperand(i);
6922     if (Elt.getOpcode() == ISD::UNDEF)
6923       continue;
6924     Values.insert(Elt);
6925     if (Elt.getOpcode() != ISD::Constant &&
6926         Elt.getOpcode() != ISD::ConstantFP)
6927       IsAllConstants = false;
6928     if (X86::isZeroNode(Elt))
6929       NumZero++;
6930     else {
6931       NonZeros |= (1 << i);
6932       NumNonZero++;
6933     }
6934   }
6935
6936   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6937   if (NumNonZero == 0)
6938     return DAG.getUNDEF(VT);
6939
6940   // Special case for single non-zero, non-undef, element.
6941   if (NumNonZero == 1) {
6942     unsigned Idx = countTrailingZeros(NonZeros);
6943     SDValue Item = Op.getOperand(Idx);
6944
6945     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6946     // the value are obviously zero, truncate the value to i32 and do the
6947     // insertion that way.  Only do this if the value is non-constant or if the
6948     // value is a constant being inserted into element 0.  It is cheaper to do
6949     // a constant pool load than it is to do a movd + shuffle.
6950     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6951         (!IsAllConstants || Idx == 0)) {
6952       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6953         // Handle SSE only.
6954         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6955         EVT VecVT = MVT::v4i32;
6956         unsigned VecElts = 4;
6957
6958         // Truncate the value (which may itself be a constant) to i32, and
6959         // convert it to a vector with movd (S2V+shuffle to zero extend).
6960         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6961         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6962
6963         // If using the new shuffle lowering, just directly insert this.
6964         if (ExperimentalVectorShuffleLowering)
6965           return DAG.getNode(
6966               ISD::BITCAST, dl, VT,
6967               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6968
6969         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6970
6971         // Now we have our 32-bit value zero extended in the low element of
6972         // a vector.  If Idx != 0, swizzle it into place.
6973         if (Idx != 0) {
6974           SmallVector<int, 4> Mask;
6975           Mask.push_back(Idx);
6976           for (unsigned i = 1; i != VecElts; ++i)
6977             Mask.push_back(i);
6978           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6979                                       &Mask[0]);
6980         }
6981         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6982       }
6983     }
6984
6985     // If we have a constant or non-constant insertion into the low element of
6986     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6987     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6988     // depending on what the source datatype is.
6989     if (Idx == 0) {
6990       if (NumZero == 0)
6991         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6992
6993       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6994           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6995         if (VT.is256BitVector() || VT.is512BitVector()) {
6996           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6997           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6998                              Item, DAG.getIntPtrConstant(0));
6999         }
7000         assert(VT.is128BitVector() && "Expected an SSE value type!");
7001         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7002         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7003         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7004       }
7005
7006       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7007         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7008         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7009         if (VT.is256BitVector()) {
7010           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7011           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7012         } else {
7013           assert(VT.is128BitVector() && "Expected an SSE value type!");
7014           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7015         }
7016         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7017       }
7018     }
7019
7020     // Is it a vector logical left shift?
7021     if (NumElems == 2 && Idx == 1 &&
7022         X86::isZeroNode(Op.getOperand(0)) &&
7023         !X86::isZeroNode(Op.getOperand(1))) {
7024       unsigned NumBits = VT.getSizeInBits();
7025       return getVShift(true, VT,
7026                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7027                                    VT, Op.getOperand(1)),
7028                        NumBits/2, DAG, *this, dl);
7029     }
7030
7031     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7032       return SDValue();
7033
7034     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7035     // is a non-constant being inserted into an element other than the low one,
7036     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7037     // movd/movss) to move this into the low element, then shuffle it into
7038     // place.
7039     if (EVTBits == 32) {
7040       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7041
7042       // If using the new shuffle lowering, just directly insert this.
7043       if (ExperimentalVectorShuffleLowering)
7044         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7045
7046       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7047       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7048       SmallVector<int, 8> MaskVec;
7049       for (unsigned i = 0; i != NumElems; ++i)
7050         MaskVec.push_back(i == Idx ? 0 : 1);
7051       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7052     }
7053   }
7054
7055   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7056   if (Values.size() == 1) {
7057     if (EVTBits == 32) {
7058       // Instead of a shuffle like this:
7059       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7060       // Check if it's possible to issue this instead.
7061       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7062       unsigned Idx = countTrailingZeros(NonZeros);
7063       SDValue Item = Op.getOperand(Idx);
7064       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7065         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7066     }
7067     return SDValue();
7068   }
7069
7070   // A vector full of immediates; various special cases are already
7071   // handled, so this is best done with a single constant-pool load.
7072   if (IsAllConstants)
7073     return SDValue();
7074
7075   // For AVX-length vectors, see if we can use a vector load to get all of the
7076   // elements, otherwise build the individual 128-bit pieces and use
7077   // shuffles to put them in place.
7078   if (VT.is256BitVector() || VT.is512BitVector()) {
7079     SmallVector<SDValue, 64> V;
7080     for (unsigned i = 0; i != NumElems; ++i)
7081       V.push_back(Op.getOperand(i));
7082
7083     // Check for a build vector of consecutive loads.
7084     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7085       return LD;
7086
7087     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7088
7089     // Build both the lower and upper subvector.
7090     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7091                                 makeArrayRef(&V[0], NumElems/2));
7092     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7093                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7094
7095     // Recreate the wider vector with the lower and upper part.
7096     if (VT.is256BitVector())
7097       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7098     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7099   }
7100
7101   // Let legalizer expand 2-wide build_vectors.
7102   if (EVTBits == 64) {
7103     if (NumNonZero == 1) {
7104       // One half is zero or undef.
7105       unsigned Idx = countTrailingZeros(NonZeros);
7106       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7107                                  Op.getOperand(Idx));
7108       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7109     }
7110     return SDValue();
7111   }
7112
7113   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7114   if (EVTBits == 8 && NumElems == 16) {
7115     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7116                                         Subtarget, *this);
7117     if (V.getNode()) return V;
7118   }
7119
7120   if (EVTBits == 16 && NumElems == 8) {
7121     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7122                                       Subtarget, *this);
7123     if (V.getNode()) return V;
7124   }
7125
7126   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7127   if (EVTBits == 32 && NumElems == 4) {
7128     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7129     if (V.getNode())
7130       return V;
7131   }
7132
7133   // If element VT is == 32 bits, turn it into a number of shuffles.
7134   SmallVector<SDValue, 8> V(NumElems);
7135   if (NumElems == 4 && NumZero > 0) {
7136     for (unsigned i = 0; i < 4; ++i) {
7137       bool isZero = !(NonZeros & (1 << i));
7138       if (isZero)
7139         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7140       else
7141         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7142     }
7143
7144     for (unsigned i = 0; i < 2; ++i) {
7145       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7146         default: break;
7147         case 0:
7148           V[i] = V[i*2];  // Must be a zero vector.
7149           break;
7150         case 1:
7151           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7152           break;
7153         case 2:
7154           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7155           break;
7156         case 3:
7157           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7158           break;
7159       }
7160     }
7161
7162     bool Reverse1 = (NonZeros & 0x3) == 2;
7163     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7164     int MaskVec[] = {
7165       Reverse1 ? 1 : 0,
7166       Reverse1 ? 0 : 1,
7167       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7168       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7169     };
7170     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7171   }
7172
7173   if (Values.size() > 1 && VT.is128BitVector()) {
7174     // Check for a build vector of consecutive loads.
7175     for (unsigned i = 0; i < NumElems; ++i)
7176       V[i] = Op.getOperand(i);
7177
7178     // Check for elements which are consecutive loads.
7179     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7180     if (LD.getNode())
7181       return LD;
7182
7183     // Check for a build vector from mostly shuffle plus few inserting.
7184     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7185     if (Sh.getNode())
7186       return Sh;
7187
7188     // For SSE 4.1, use insertps to put the high elements into the low element.
7189     if (Subtarget->hasSSE41()) {
7190       SDValue Result;
7191       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7192         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7193       else
7194         Result = DAG.getUNDEF(VT);
7195
7196       for (unsigned i = 1; i < NumElems; ++i) {
7197         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7198         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7199                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7200       }
7201       return Result;
7202     }
7203
7204     // Otherwise, expand into a number of unpckl*, start by extending each of
7205     // our (non-undef) elements to the full vector width with the element in the
7206     // bottom slot of the vector (which generates no code for SSE).
7207     for (unsigned i = 0; i < NumElems; ++i) {
7208       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7209         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7210       else
7211         V[i] = DAG.getUNDEF(VT);
7212     }
7213
7214     // Next, we iteratively mix elements, e.g. for v4f32:
7215     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7216     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7217     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7218     unsigned EltStride = NumElems >> 1;
7219     while (EltStride != 0) {
7220       for (unsigned i = 0; i < EltStride; ++i) {
7221         // If V[i+EltStride] is undef and this is the first round of mixing,
7222         // then it is safe to just drop this shuffle: V[i] is already in the
7223         // right place, the one element (since it's the first round) being
7224         // inserted as undef can be dropped.  This isn't safe for successive
7225         // rounds because they will permute elements within both vectors.
7226         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7227             EltStride == NumElems/2)
7228           continue;
7229
7230         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7231       }
7232       EltStride >>= 1;
7233     }
7234     return V[0];
7235   }
7236   return SDValue();
7237 }
7238
7239 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7240 // to create 256-bit vectors from two other 128-bit ones.
7241 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7242   SDLoc dl(Op);
7243   MVT ResVT = Op.getSimpleValueType();
7244
7245   assert((ResVT.is256BitVector() ||
7246           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7247
7248   SDValue V1 = Op.getOperand(0);
7249   SDValue V2 = Op.getOperand(1);
7250   unsigned NumElems = ResVT.getVectorNumElements();
7251   if(ResVT.is256BitVector())
7252     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7253
7254   if (Op.getNumOperands() == 4) {
7255     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7256                                 ResVT.getVectorNumElements()/2);
7257     SDValue V3 = Op.getOperand(2);
7258     SDValue V4 = Op.getOperand(3);
7259     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7260       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7261   }
7262   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7263 }
7264
7265 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7266   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7267   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7268          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7269           Op.getNumOperands() == 4)));
7270
7271   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7272   // from two other 128-bit ones.
7273
7274   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7275   return LowerAVXCONCAT_VECTORS(Op, DAG);
7276 }
7277
7278
7279 //===----------------------------------------------------------------------===//
7280 // Vector shuffle lowering
7281 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns while staying within
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
7288 //===----------------------------------------------------------------------===//
7289
7290 /// \brief Tiny helper function to identify a no-op mask.
7291 ///
7292 /// This is a somewhat boring predicate function. It checks whether the mask
7293 /// array input, which is assumed to be a single-input shuffle mask of the kind
7294 /// used by the X86 shuffle instructions (not a fully general
7295 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7296 /// in-place shuffle are 'no-op's.
7297 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7298   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7299     if (Mask[i] != -1 && Mask[i] != i)
7300       return false;
7301   return true;
7302 }
7303
7304 /// \brief Helper function to classify a mask as a single-input mask.
7305 ///
7306 /// This isn't a generic single-input test because in the vector shuffle
7307 /// lowering we canonicalize single inputs to be the first input operand. This
7308 /// means we can more quickly test for a single input by only checking whether
7309 /// an input from the second operand exists. We also assume that the size of
7310 /// mask corresponds to the size of the input vectors which isn't true in the
7311 /// fully general case.
7312 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7313   for (int M : Mask)
7314     if (M >= (int)Mask.size())
7315       return false;
7316   return true;
7317 }
7318
7319 /// \brief Test whether there are elements crossing 128-bit lanes in this
7320 /// shuffle mask.
7321 ///
7322 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7323 /// and we routinely test for these.
7324 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7325   int LaneSize = 128 / VT.getScalarSizeInBits();
7326   int Size = Mask.size();
7327   for (int i = 0; i < Size; ++i)
7328     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7329       return true;
7330   return false;
7331 }
7332
7333 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7334 ///
7335 /// This checks a shuffle mask to see if it is performing the same
7336 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7337 /// that it is also not lane-crossing. It may however involve a blend from the
7338 /// same lane of a second vector.
7339 ///
7340 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7341 /// non-trivial to compute in the face of undef lanes. The representation is
7342 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7343 /// entries from both V1 and V2 inputs to the wider mask.
7344 static bool
7345 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7346                                 SmallVectorImpl<int> &RepeatedMask) {
7347   int LaneSize = 128 / VT.getScalarSizeInBits();
7348   RepeatedMask.resize(LaneSize, -1);
7349   int Size = Mask.size();
7350   for (int i = 0; i < Size; ++i) {
7351     if (Mask[i] < 0)
7352       continue;
7353     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7354       // This entry crosses lanes, so there is no way to model this shuffle.
7355       return false;
7356
7357     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7358     if (RepeatedMask[i % LaneSize] == -1)
7359       // This is the first non-undef entry in this slot of a 128-bit lane.
7360       RepeatedMask[i % LaneSize] =
7361           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7362     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7363       // Found a mismatch with the repeated mask.
7364       return false;
7365   }
7366   return true;
7367 }
7368
7369 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7370 // 2013 will allow us to use it as a non-type template parameter.
7371 namespace {
7372
7373 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7374 ///
7375 /// See its documentation for details.
7376 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7377   if (Mask.size() != Args.size())
7378     return false;
7379   for (int i = 0, e = Mask.size(); i < e; ++i) {
7380     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7381     if (Mask[i] != -1 && Mask[i] != *Args[i])
7382       return false;
7383   }
7384   return true;
7385 }
7386
7387 } // namespace
7388
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
///
/// The matching itself is done by \c isShuffleEquivalentImpl, which this
/// functor is instantiated with: the mask is the single fixed argument and the
/// trailing \c int values form the expected pattern.
static const VariadicFunction1<
    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7401
7402 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7403 ///
7404 /// This helper function produces an 8-bit shuffle immediate corresponding to
7405 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7406 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7407 /// example.
7408 ///
7409 /// NB: We rely heavily on "undef" masks preserving the input lane.
7410 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7411                                           SelectionDAG &DAG) {
7412   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7413   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7414   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7415   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7416   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7417
7418   unsigned Imm = 0;
7419   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7420   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7421   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7422   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7423   return DAG.getConstant(Imm, MVT::i8);
7424 }
7425
7426 /// \brief Try to emit a blend instruction for a shuffle.
7427 ///
7428 /// This doesn't do any checks for the availability of instructions for blending
7429 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7430 /// be matched in the backend with the type given. What it does check for is
7431 /// that the shuffle mask is in fact a blend.
7432 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7433                                          SDValue V2, ArrayRef<int> Mask,
7434                                          const X86Subtarget *Subtarget,
7435                                          SelectionDAG &DAG) {
7436
7437   unsigned BlendMask = 0;
7438   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7439     if (Mask[i] >= Size) {
7440       if (Mask[i] != i + Size)
7441         return SDValue(); // Shuffled V2 input!
7442       BlendMask |= 1u << i;
7443       continue;
7444     }
7445     if (Mask[i] >= 0 && Mask[i] != i)
7446       return SDValue(); // Shuffled V1 input!
7447   }
7448   switch (VT.SimpleTy) {
7449   case MVT::v2f64:
7450   case MVT::v4f32:
7451   case MVT::v4f64:
7452   case MVT::v8f32:
7453     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7454                        DAG.getConstant(BlendMask, MVT::i8));
7455
7456   case MVT::v4i64:
7457   case MVT::v8i32:
7458     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7459     // FALLTHROUGH
7460   case MVT::v2i64:
7461   case MVT::v4i32:
7462     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7463     // that instruction.
7464     if (Subtarget->hasAVX2()) {
7465       // Scale the blend by the number of 32-bit dwords per element.
7466       int Scale =  VT.getScalarSizeInBits() / 32;
7467       BlendMask = 0;
7468       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7469         if (Mask[i] >= Size)
7470           for (int j = 0; j < Scale; ++j)
7471             BlendMask |= 1u << (i * Scale + j);
7472
7473       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7474       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7475       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7476       return DAG.getNode(ISD::BITCAST, DL, VT,
7477                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7478                                      DAG.getConstant(BlendMask, MVT::i8)));
7479     }
7480     // FALLTHROUGH
7481   case MVT::v8i16: {
7482     // For integer shuffles we need to expand the mask and cast the inputs to
7483     // v8i16s prior to blending.
7484     int Scale = 8 / VT.getVectorNumElements();
7485     BlendMask = 0;
7486     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7487       if (Mask[i] >= Size)
7488         for (int j = 0; j < Scale; ++j)
7489           BlendMask |= 1u << (i * Scale + j);
7490
7491     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7492     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7493     return DAG.getNode(ISD::BITCAST, DL, VT,
7494                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7495                                    DAG.getConstant(BlendMask, MVT::i8)));
7496   }
7497
7498   case MVT::v16i16: {
7499     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7500     SmallVector<int, 8> RepeatedMask;
7501     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7502       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7503       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7504       BlendMask = 0;
7505       for (int i = 0; i < 8; ++i)
7506         if (RepeatedMask[i] >= 16)
7507           BlendMask |= 1u << i;
7508       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7509                          DAG.getConstant(BlendMask, MVT::i8));
7510     }
7511   }
7512     // FALLTHROUGH
7513   case MVT::v32i8: {
7514     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7515     // Scale the blend by the number of bytes per element.
7516     int Scale =  VT.getScalarSizeInBits() / 8;
7517     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7518
7519     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7520     // mix of LLVM's code generator and the x86 backend. We tell the code
7521     // generator that boolean values in the elements of an x86 vector register
7522     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7523     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7524     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7525     // of the element (the remaining are ignored) and 0 in that high bit would
7526     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7527     // the LLVM model for boolean values in vector elements gets the relevant
7528     // bit set, it is set backwards and over constrained relative to x86's
7529     // actual model.
7530     SDValue VSELECTMask[32];
7531     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7532       for (int j = 0; j < Scale; ++j)
7533         VSELECTMask[Scale * i + j] =
7534             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7535                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7536
7537     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7538     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7539     return DAG.getNode(
7540         ISD::BITCAST, DL, VT,
7541         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7542                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7543                     V1, V2));
7544   }
7545
7546   default:
7547     llvm_unreachable("Not a supported integer vector type!");
7548   }
7549 }
7550
7551 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7552 /// unblended shuffles followed by an unshuffled blend.
7553 ///
7554 /// This matches the extremely common pattern for handling combined
7555 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7556 /// operations.
7557 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7558                                                           SDValue V1,
7559                                                           SDValue V2,
7560                                                           ArrayRef<int> Mask,
7561                                                           SelectionDAG &DAG) {
7562   // Shuffle the input elements into the desired positions in V1 and V2 and
7563   // blend them together.
7564   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7565   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7566   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7567   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7568     if (Mask[i] >= 0 && Mask[i] < Size) {
7569       V1Mask[i] = Mask[i];
7570       BlendMask[i] = i;
7571     } else if (Mask[i] >= Size) {
7572       V2Mask[i] = Mask[i] - Size;
7573       BlendMask[i] = i + Size;
7574     }
7575
7576   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7577   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7578   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7579 }
7580
7581 /// \brief Try to lower a vector shuffle as a byte rotation.
7582 ///
7583 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7584 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7585 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7586 /// try to generically lower a vector shuffle through such an pattern. It
7587 /// does not check for the profitability of lowering either as PALIGNR or
7588 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7589 /// This matches shuffle vectors that look like:
7590 ///
7591 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7592 ///
7593 /// Essentially it concatenates V1 and V2, shifts right by some number of
7594 /// elements, and takes the low elements as the result. Note that while this is
7595 /// specified as a *right shift* because x86 is little-endian, it is a *left
7596 /// rotate* of the vector lanes.
7597 ///
7598 /// Note that this only handles 128-bit vector widths currently.
7599 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7600                                               SDValue V2,
7601                                               ArrayRef<int> Mask,
7602                                               const X86Subtarget *Subtarget,
7603                                               SelectionDAG &DAG) {
7604   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7605
7606   // We need to detect various ways of spelling a rotation:
7607   //   [11, 12, 13, 14, 15,  0,  1,  2]
7608   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7609   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7610   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7611   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7612   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7613   int Rotation = 0;
7614   SDValue Lo, Hi;
7615   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7616     if (Mask[i] == -1)
7617       continue;
7618     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7619
7620     // Based on the mod-Size value of this mask element determine where
7621     // a rotated vector would have started.
7622     int StartIdx = i - (Mask[i] % Size);
7623     if (StartIdx == 0)
7624       // The identity rotation isn't interesting, stop.
7625       return SDValue();
7626
7627     // If we found the tail of a vector the rotation must be the missing
7628     // front. If we found the head of a vector, it must be how much of the head.
7629     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7630
7631     if (Rotation == 0)
7632       Rotation = CandidateRotation;
7633     else if (Rotation != CandidateRotation)
7634       // The rotations don't match, so we can't match this mask.
7635       return SDValue();
7636
7637     // Compute which value this mask is pointing at.
7638     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7639
7640     // Compute which of the two target values this index should be assigned to.
7641     // This reflects whether the high elements are remaining or the low elements
7642     // are remaining.
7643     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7644
7645     // Either set up this value if we've not encountered it before, or check
7646     // that it remains consistent.
7647     if (!TargetV)
7648       TargetV = MaskV;
7649     else if (TargetV != MaskV)
7650       // This may be a rotation, but it pulls from the inputs in some
7651       // unsupported interleaving.
7652       return SDValue();
7653   }
7654
7655   // Check that we successfully analyzed the mask, and normalize the results.
7656   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7657   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7658   if (!Lo)
7659     Lo = Hi;
7660   else if (!Hi)
7661     Hi = Lo;
7662
7663   assert(VT.getSizeInBits() == 128 &&
7664          "Rotate-based lowering only supports 128-bit lowering!");
7665   assert(Mask.size() <= 16 &&
7666          "Can shuffle at most 16 bytes in a 128-bit vector!");
7667
7668   // The actual rotate instruction rotates bytes, so we need to scale the
7669   // rotation based on how many bytes are in the vector.
7670   int Scale = 16 / Mask.size();
7671
7672   // SSSE3 targets can use the palignr instruction
7673   if (Subtarget->hasSSSE3()) {
7674     // Cast the inputs to v16i8 to match PALIGNR.
7675     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7676     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7677
7678     return DAG.getNode(ISD::BITCAST, DL, VT,
7679                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7680                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7681   }
7682
7683   // Default SSE2 implementation
7684   int LoByteShift = 16 - Rotation * Scale;
7685   int HiByteShift = Rotation * Scale;
7686
7687   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7688   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7689   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7690
7691   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7692                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7693   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7694                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7695   return DAG.getNode(ISD::BITCAST, DL, VT,
7696                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7697 }
7698
7699 /// \brief Compute whether each element of a shuffle is zeroable.
7700 ///
7701 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7702 /// Either it is an undef element in the shuffle mask, the element of the input
7703 /// referenced is undef, or the element of the input referenced is known to be
7704 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7705 /// as many lanes with this technique as possible to simplify the remaining
7706 /// shuffle.
7707 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7708                                                      SDValue V1, SDValue V2) {
7709   SmallBitVector Zeroable(Mask.size(), false);
7710
7711   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7712   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7713
7714   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7715     int M = Mask[i];
7716     // Handle the easy cases.
7717     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7718       Zeroable[i] = true;
7719       continue;
7720     }
7721
7722     // If this is an index into a build_vector node, dig out the input value and
7723     // use it.
7724     SDValue V = M < Size ? V1 : V2;
7725     if (V.getOpcode() != ISD::BUILD_VECTOR)
7726       continue;
7727
7728     SDValue Input = V.getOperand(M % Size);
7729     // The UNDEF opcode check really should be dead code here, but not quite
7730     // worth asserting on (it isn't invalid, just unexpected).
7731     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7732       Zeroable[i] = true;
7733   }
7734
7735   return Zeroable;
7736 }
7737
7738 /// \brief Try to emit a bitmask instruction for a shuffle.
7739 ///
7740 /// This handles cases where we can model a blend exactly as a bitmask due to
7741 /// one of the inputs being zeroable.
7742 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7743                                            SDValue V2, ArrayRef<int> Mask,
7744                                            SelectionDAG &DAG) {
7745   MVT EltVT = VT.getScalarType();
7746   int NumEltBits = EltVT.getSizeInBits();
7747   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7748   SDValue Zero = DAG.getConstant(0, IntEltVT);
7749   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7750   if (EltVT.isFloatingPoint()) {
7751     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7752     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7753   }
7754   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7755   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7756   SDValue V;
7757   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7758     if (Zeroable[i])
7759       continue;
7760     if (Mask[i] % Size != i)
7761       return SDValue(); // Not a blend.
7762     if (!V)
7763       V = Mask[i] < Size ? V1 : V2;
7764     else if (V != (Mask[i] < Size ? V1 : V2))
7765       return SDValue(); // Can only let one input through the mask.
7766
7767     VMaskOps[i] = AllOnes;
7768   }
7769   if (!V)
7770     return SDValue(); // No non-zeroable elements!
7771
7772   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7773   V = DAG.getNode(VT.isFloatingPoint()
7774                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7775                   DL, VT, V, VMask);
7776   return V;
7777 }
7778
7779 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7780 ///
7781 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7782 /// byte-shift instructions. The mask must consist of a shifted sequential
7783 /// shuffle from one of the input vectors and zeroable elements for the
7784 /// remaining 'shifted in' elements.
7785 ///
7786 /// Note that this only handles 128-bit vector widths currently.
7787 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7788                                              SDValue V2, ArrayRef<int> Mask,
7789                                              SelectionDAG &DAG) {
7790   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7791
7792   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7793
7794   int Size = Mask.size();
7795   int Scale = 16 / Size;
7796
7797   for (int Shift = 1; Shift < Size; Shift++) {
7798     int ByteShift = Shift * Scale;
7799
7800     // PSRLDQ : (little-endian) right byte shift
7801     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7802     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7803     // [  1, 2, -1, -1, -1, -1, zz, zz]
7804     bool ZeroableRight = true;
7805     for (int i = Size - Shift; i < Size; i++) {
7806       ZeroableRight &= Zeroable[i];
7807     }
7808
7809     if (ZeroableRight) {
7810       bool ValidShiftRight1 =
7811           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7812       bool ValidShiftRight2 =
7813           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7814
7815       if (ValidShiftRight1 || ValidShiftRight2) {
7816         // Cast the inputs to v2i64 to match PSRLDQ.
7817         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7818         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7819         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7820                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7821         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7822       }
7823     }
7824
7825     // PSLLDQ : (little-endian) left byte shift
7826     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7827     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7828     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7829     bool ZeroableLeft = true;
7830     for (int i = 0; i < Shift; i++) {
7831       ZeroableLeft &= Zeroable[i];
7832     }
7833
7834     if (ZeroableLeft) {
7835       bool ValidShiftLeft1 =
7836           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7837       bool ValidShiftLeft2 =
7838           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7839
7840       if (ValidShiftLeft1 || ValidShiftLeft2) {
7841         // Cast the inputs to v2i64 to match PSLLDQ.
7842         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7843         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7844         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7845                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7846         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7847       }
7848     }
7849   }
7850
7851   return SDValue();
7852 }
7853
7854 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7855 ///
7856 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7857 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7858 /// elements from one of the input vectors shuffled to the left or right
7859 /// with zeroable elements 'shifted in'.
7860 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7861                                             SDValue V2, ArrayRef<int> Mask,
7862                                             SelectionDAG &DAG) {
7863   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7864
7865   int Size = Mask.size();
7866   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7867
7868   // PSRL : (little-endian) right bit shift.
7869   // [  1, zz,  3, zz]
7870   // [ -1, -1,  7, zz]
7871   // PSHL : (little-endian) left bit shift.
7872   // [ zz, 0, zz,  2 ]
7873   // [ -1, 4, zz, -1 ]
7874   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7875     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7876     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7877     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7878            "Illegal integer vector type");
7879
7880     bool MatchLeft = true, MatchRight = true;
7881     for (int i = 0; i != Size; i += Scale) {
7882       for (int j = 0; j != Shift; j++) {
7883         MatchLeft &= Zeroable[i + j];
7884       }
7885       for (int j = Scale - Shift; j != Scale; j++) {
7886         MatchRight &= Zeroable[i + j];
7887       }
7888     }
7889     if (!(MatchLeft || MatchRight))
7890       return SDValue();
7891
7892     bool MatchV1 = true, MatchV2 = true;
7893     for (int i = 0; i != Size; i += Scale) {
7894       unsigned Pos = MatchLeft ? i + Shift : i;
7895       unsigned Low = MatchLeft ? i : i + Shift;
7896       unsigned Len = Scale - Shift;
7897       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7898       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7899     }
7900     if (!(MatchV1 || MatchV2))
7901       return SDValue();
7902
7903     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7904     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7905     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7906     SDValue V = MatchV1 ? V1 : V2;
7907     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7908     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7909     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7910   };
7911
7912   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7913   // keep doubling the size of the integer elements up to that. We can
7914   // then shift the elements of the integer vector by whole multiples of
7915   // their width within the elements of the larger integer vector. Test each
7916   // multiple to see if we can find a match with the moved element indices
7917   // and that the shifted in elements are all zeroable.
7918   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7919     for (int Shift = 1; Shift != Scale; Shift++)
7920       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7921         return BitShift;
7922
7923   // no match
7924   return SDValue();
7925 }
7926
7927 /// \brief Lower a vector shuffle as a zero or any extension.
7928 ///
7929 /// Given a specific number of elements, element bit width, and extension
7930 /// stride, produce either a zero or any extension based on the available
7931 /// features of the subtarget.
7932 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7933     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7934     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7935   assert(Scale > 1 && "Need a scale to extend.");
7936   int NumElements = VT.getVectorNumElements();
7937   int EltBits = VT.getScalarSizeInBits();
7938   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7939          "Only 8, 16, and 32 bit elements can be extended.");
7940   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7941
7942   // Found a valid zext mask! Try various lowering strategies based on the
7943   // input type and available ISA extensions.
7944   if (Subtarget->hasSSE41()) {
7945     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7946                                  NumElements / Scale);
7947     return DAG.getNode(ISD::BITCAST, DL, VT,
7948                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7949   }
7950
7951   // For any extends we can cheat for larger element sizes and use shuffle
7952   // instructions that can fold with a load and/or copy.
7953   if (AnyExt && EltBits == 32) {
7954     int PSHUFDMask[4] = {0, -1, 1, -1};
7955     return DAG.getNode(
7956         ISD::BITCAST, DL, VT,
7957         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7958                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7959                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7960   }
7961   if (AnyExt && EltBits == 16 && Scale > 2) {
7962     int PSHUFDMask[4] = {0, -1, 0, -1};
7963     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7964                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7965                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7966     int PSHUFHWMask[4] = {1, -1, -1, -1};
7967     return DAG.getNode(
7968         ISD::BITCAST, DL, VT,
7969         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7970                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7971                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7972   }
7973
7974   // If this would require more than 2 unpack instructions to expand, use
7975   // pshufb when available. We can only use more than 2 unpack instructions
7976   // when zero extending i8 elements which also makes it easier to use pshufb.
7977   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7978     assert(NumElements == 16 && "Unexpected byte vector width!");
7979     SDValue PSHUFBMask[16];
7980     for (int i = 0; i < 16; ++i)
7981       PSHUFBMask[i] =
7982           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7983     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7984     return DAG.getNode(ISD::BITCAST, DL, VT,
7985                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7986                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7987                                                MVT::v16i8, PSHUFBMask)));
7988   }
7989
7990   // Otherwise emit a sequence of unpacks.
7991   do {
7992     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7993     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7994                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7995     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7996     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7997     Scale /= 2;
7998     EltBits *= 2;
7999     NumElements /= 2;
8000   } while (Scale > 1);
8001   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8002 }
8003
8004 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8005 ///
8006 /// This routine will try to do everything in its power to cleverly lower
8007 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8008 /// check for the profitability of this lowering,  it tries to aggressively
8009 /// match this pattern. It will use all of the micro-architectural details it
8010 /// can to emit an efficient lowering. It handles both blends with all-zero
8011 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8012 /// masking out later).
8013 ///
8014 /// The reason we have dedicated lowering for zext-style shuffles is that they
8015 /// are both incredibly common and often quite performance sensitive.
8016 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8017     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8018     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8019   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8020
8021   int Bits = VT.getSizeInBits();
8022   int NumElements = VT.getVectorNumElements();
8023   assert(VT.getScalarSizeInBits() <= 32 &&
8024          "Exceeds 32-bit integer zero extension limit");
8025   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8026
8027   // Define a helper function to check a particular ext-scale and lower to it if
8028   // valid.
8029   auto Lower = [&](int Scale) -> SDValue {
8030     SDValue InputV;
8031     bool AnyExt = true;
8032     for (int i = 0; i < NumElements; ++i) {
8033       if (Mask[i] == -1)
8034         continue; // Valid anywhere but doesn't tell us anything.
8035       if (i % Scale != 0) {
8036         // Each of the extended elements need to be zeroable.
8037         if (!Zeroable[i])
8038           return SDValue();
8039
8040         // We no longer are in the anyext case.
8041         AnyExt = false;
8042         continue;
8043       }
8044
8045       // Each of the base elements needs to be consecutive indices into the
8046       // same input vector.
8047       SDValue V = Mask[i] < NumElements ? V1 : V2;
8048       if (!InputV)
8049         InputV = V;
8050       else if (InputV != V)
8051         return SDValue(); // Flip-flopping inputs.
8052
8053       if (Mask[i] % NumElements != i / Scale)
8054         return SDValue(); // Non-consecutive strided elements.
8055     }
8056
8057     // If we fail to find an input, we have a zero-shuffle which should always
8058     // have already been handled.
8059     // FIXME: Maybe handle this here in case during blending we end up with one?
8060     if (!InputV)
8061       return SDValue();
8062
8063     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8064         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8065   };
8066
8067   // The widest scale possible for extending is to a 64-bit integer.
8068   assert(Bits % 64 == 0 &&
8069          "The number of bits in a vector must be divisible by 64 on x86!");
8070   int NumExtElements = Bits / 64;
8071
8072   // Each iteration, try extending the elements half as much, but into twice as
8073   // many elements.
8074   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8075     assert(NumElements % NumExtElements == 0 &&
8076            "The input vector size must be divisible by the extended size.");
8077     if (SDValue V = Lower(NumElements / NumExtElements))
8078       return V;
8079   }
8080
8081   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8082   if (Bits != 128)
8083     return SDValue();
8084
8085   // Returns one of the source operands if the shuffle can be reduced to a
8086   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8087   auto CanZExtLowHalf = [&]() {
8088     for (int i = NumElements / 2; i != NumElements; i++)
8089       if (!Zeroable[i])
8090         return SDValue();
8091     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8092       return V1;
8093     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8094       return V2;
8095     return SDValue();
8096   };
8097
8098   if (SDValue V = CanZExtLowHalf()) {
8099     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8100     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8101     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8102   }
8103
8104   // No viable ext lowering found.
8105   return SDValue();
8106 }
8107
8108 /// \brief Try to get a scalar value for a specific element of a vector.
8109 ///
8110 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8111 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8112                                               SelectionDAG &DAG) {
8113   MVT VT = V.getSimpleValueType();
8114   MVT EltVT = VT.getVectorElementType();
8115   while (V.getOpcode() == ISD::BITCAST)
8116     V = V.getOperand(0);
8117   // If the bitcasts shift the element size, we can't extract an equivalent
8118   // element from it.
8119   MVT NewVT = V.getSimpleValueType();
8120   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8121     return SDValue();
8122
8123   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8124       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8125     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8126
8127   return SDValue();
8128 }
8129
8130 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8131 ///
8132 /// This is particularly important because the set of instructions varies
8133 /// significantly based on whether the operand is a load or not.
8134 static bool isShuffleFoldableLoad(SDValue V) {
8135   while (V.getOpcode() == ISD::BITCAST)
8136     V = V.getOperand(0);
8137
8138   return ISD::isNON_EXTLoad(V.getNode());
8139 }
8140
8141 /// \brief Try to lower insertion of a single element into a zero vector.
8142 ///
8143 /// This is a common pattern that we have especially efficient patterns to lower
8144 /// across all subtarget feature sets.
8145 static SDValue lowerVectorShuffleAsElementInsertion(
8146     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8147     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8148   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8149   MVT ExtVT = VT;
8150   MVT EltVT = VT.getVectorElementType();
8151
8152   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8153                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8154                 Mask.begin();
8155   bool IsV1Zeroable = true;
8156   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8157     if (i != V2Index && !Zeroable[i]) {
8158       IsV1Zeroable = false;
8159       break;
8160     }
8161
8162   // Check for a single input from a SCALAR_TO_VECTOR node.
8163   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8164   // all the smarts here sunk into that routine. However, the current
8165   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8166   // vector shuffle lowering is dead.
8167   if (SDValue V2S = getScalarValueForVectorElement(
8168           V2, Mask[V2Index] - Mask.size(), DAG)) {
8169     // We need to zext the scalar if it is smaller than an i32.
8170     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8171     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8172       // Using zext to expand a narrow element won't work for non-zero
8173       // insertions.
8174       if (!IsV1Zeroable)
8175         return SDValue();
8176
8177       // Zero-extend directly to i32.
8178       ExtVT = MVT::v4i32;
8179       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8180     }
8181     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8182   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8183              EltVT == MVT::i16) {
8184     // Either not inserting from the low element of the input or the input
8185     // element size is too small to use VZEXT_MOVL to clear the high bits.
8186     return SDValue();
8187   }
8188
8189   if (!IsV1Zeroable) {
8190     // If V1 can't be treated as a zero vector we have fewer options to lower
8191     // this. We can't support integer vectors or non-zero targets cheaply, and
8192     // the V1 elements can't be permuted in any way.
8193     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8194     if (!VT.isFloatingPoint() || V2Index != 0)
8195       return SDValue();
8196     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8197     V1Mask[V2Index] = -1;
8198     if (!isNoopShuffleMask(V1Mask))
8199       return SDValue();
8200     // This is essentially a special case blend operation, but if we have
8201     // general purpose blend operations, they are always faster. Bail and let
8202     // the rest of the lowering handle these as blends.
8203     if (Subtarget->hasSSE41())
8204       return SDValue();
8205
8206     // Otherwise, use MOVSD or MOVSS.
8207     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8208            "Only two types of floating point element types to handle!");
8209     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8210                        ExtVT, V1, V2);
8211   }
8212
8213   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8214   if (ExtVT != VT)
8215     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8216
8217   if (V2Index != 0) {
8218     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8219     // the desired position. Otherwise it is more efficient to do a vector
8220     // shift left. We know that we can do a vector shift left because all
8221     // the inputs are zero.
8222     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8223       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8224       V2Shuffle[V2Index] = 0;
8225       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8226     } else {
8227       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8228       V2 = DAG.getNode(
8229           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8230           DAG.getConstant(
8231               V2Index * EltVT.getSizeInBits(),
8232               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8233       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8234     }
8235   }
8236   return V2;
8237 }
8238
8239 /// \brief Try to lower broadcast of a single element.
8240 ///
8241 /// For convenience, this code also bundles all of the subtarget feature set
8242 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8243 /// a convenient way to factor it out.
8244 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8245                                              ArrayRef<int> Mask,
8246                                              const X86Subtarget *Subtarget,
8247                                              SelectionDAG &DAG) {
8248   if (!Subtarget->hasAVX())
8249     return SDValue();
8250   if (VT.isInteger() && !Subtarget->hasAVX2())
8251     return SDValue();
8252
8253   // Check that the mask is a broadcast.
8254   int BroadcastIdx = -1;
8255   for (int M : Mask)
8256     if (M >= 0 && BroadcastIdx == -1)
8257       BroadcastIdx = M;
8258     else if (M >= 0 && M != BroadcastIdx)
8259       return SDValue();
8260
8261   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8262                                             "a sorted mask where the broadcast "
8263                                             "comes from V1.");
8264
8265   // Go up the chain of (vector) values to try and find a scalar load that
8266   // we can combine with the broadcast.
8267   for (;;) {
8268     switch (V.getOpcode()) {
8269     case ISD::CONCAT_VECTORS: {
8270       int OperandSize = Mask.size() / V.getNumOperands();
8271       V = V.getOperand(BroadcastIdx / OperandSize);
8272       BroadcastIdx %= OperandSize;
8273       continue;
8274     }
8275
8276     case ISD::INSERT_SUBVECTOR: {
8277       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8278       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8279       if (!ConstantIdx)
8280         break;
8281
8282       int BeginIdx = (int)ConstantIdx->getZExtValue();
8283       int EndIdx =
8284           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8285       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8286         BroadcastIdx -= BeginIdx;
8287         V = VInner;
8288       } else {
8289         V = VOuter;
8290       }
8291       continue;
8292     }
8293     }
8294     break;
8295   }
8296
8297   // Check if this is a broadcast of a scalar. We special case lowering
8298   // for scalars so that we can more effectively fold with loads.
8299   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8300       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8301     V = V.getOperand(BroadcastIdx);
8302
8303     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8304     // AVX2.
8305     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8306       return SDValue();
8307   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8308     // We can't broadcast from a vector register w/o AVX2, and we can only
8309     // broadcast from the zero-element of a vector register.
8310     return SDValue();
8311   }
8312
8313   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8314 }
8315
8316 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8317 // INSERTPS when the V1 elements are already in the correct locations
8318 // because otherwise we can just always use two SHUFPS instructions which
8319 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8320 // perform INSERTPS if a single V1 element is out of place and all V2
8321 // elements are zeroable.
8322 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8323                                             ArrayRef<int> Mask,
8324                                             SelectionDAG &DAG) {
8325   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8326   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8327   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8328   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8329
8330   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8331
8332   unsigned ZMask = 0;
8333   int V1DstIndex = -1;
8334   int V2DstIndex = -1;
8335   bool V1UsedInPlace = false;
8336
8337   for (int i = 0; i < 4; i++) {
8338     // Synthesize a zero mask from the zeroable elements (includes undefs).
8339     if (Zeroable[i]) {
8340       ZMask |= 1 << i;
8341       continue;
8342     }
8343
8344     // Flag if we use any V1 inputs in place.
8345     if (i == Mask[i]) {
8346       V1UsedInPlace = true;
8347       continue;
8348     }
8349
8350     // We can only insert a single non-zeroable element.
8351     if (V1DstIndex != -1 || V2DstIndex != -1)
8352       return SDValue();
8353
8354     if (Mask[i] < 4) {
8355       // V1 input out of place for insertion.
8356       V1DstIndex = i;
8357     } else {
8358       // V2 input for insertion.
8359       V2DstIndex = i;
8360     }
8361   }
8362
8363   // Don't bother if we have no (non-zeroable) element for insertion.
8364   if (V1DstIndex == -1 && V2DstIndex == -1)
8365     return SDValue();
8366
8367   // Determine element insertion src/dst indices. The src index is from the
8368   // start of the inserted vector, not the start of the concatenated vector.
8369   unsigned V2SrcIndex = 0;
8370   if (V1DstIndex != -1) {
8371     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8372     // and don't use the original V2 at all.
8373     V2SrcIndex = Mask[V1DstIndex];
8374     V2DstIndex = V1DstIndex;
8375     V2 = V1;
8376   } else {
8377     V2SrcIndex = Mask[V2DstIndex] - 4;
8378   }
8379
8380   // If no V1 inputs are used in place, then the result is created only from
8381   // the zero mask and the V2 insertion - so remove V1 dependency.
8382   if (!V1UsedInPlace)
8383     V1 = DAG.getUNDEF(MVT::v4f32);
8384
8385   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8386   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8387
8388   // Insert the V2 element into the desired position.
8389   SDLoc DL(Op);
8390   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8391                      DAG.getConstant(InsertPSMask, MVT::i8));
8392 }
8393
8394 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8395 ///
8396 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8397 /// support for floating point shuffles but not integer shuffles. These
8398 /// instructions will incur a domain crossing penalty on some chips though so
8399 /// it is better to avoid lowering through this for integer vectors where
8400 /// possible.
8401 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8402                                        const X86Subtarget *Subtarget,
8403                                        SelectionDAG &DAG) {
8404   SDLoc DL(Op);
8405   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8406   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8407   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8408   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8409   ArrayRef<int> Mask = SVOp->getMask();
8410   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8411
8412   if (isSingleInputShuffleMask(Mask)) {
8413     // Use low duplicate instructions for masks that match their pattern.
8414     if (Subtarget->hasSSE3())
8415       if (isShuffleEquivalent(Mask, 0, 0))
8416         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8417
8418     // Straight shuffle of a single input vector. Simulate this by using the
8419     // single input as both of the "inputs" to this instruction..
8420     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8421
8422     if (Subtarget->hasAVX()) {
8423       // If we have AVX, we can use VPERMILPS which will allow folding a load
8424       // into the shuffle.
8425       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8426                          DAG.getConstant(SHUFPDMask, MVT::i8));
8427     }
8428
8429     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8430                        DAG.getConstant(SHUFPDMask, MVT::i8));
8431   }
8432   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8433   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8434
8435   // Use dedicated unpack instructions for masks that match their pattern.
8436   if (isShuffleEquivalent(Mask, 0, 2))
8437     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8438   if (isShuffleEquivalent(Mask, 1, 3))
8439     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8440
8441   // If we have a single input, insert that into V1 if we can do so cheaply.
8442   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8443     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8444             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8445       return Insertion;
8446     // Try inverting the insertion since for v2 masks it is easy to do and we
8447     // can't reliably sort the mask one way or the other.
8448     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8449                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8450     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8451             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8452       return Insertion;
8453   }
8454
8455   // Try to use one of the special instruction patterns to handle two common
8456   // blend patterns if a zero-blend above didn't work.
8457   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8458     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8459       // We can either use a special instruction to load over the low double or
8460       // to move just the low double.
8461       return DAG.getNode(
8462           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8463           DL, MVT::v2f64, V2,
8464           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8465
8466   if (Subtarget->hasSSE41())
8467     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8468                                                   Subtarget, DAG))
8469       return Blend;
8470
8471   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8472   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8473                      DAG.getConstant(SHUFPDMask, MVT::i8));
8474 }
8475
8476 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8477 ///
8478 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8479 /// the integer unit to minimize domain crossing penalties. However, for blends
8480 /// it falls back to the floating point shuffle operation with appropriate bit
8481 /// casting.
8482 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8483                                        const X86Subtarget *Subtarget,
8484                                        SelectionDAG &DAG) {
8485   SDLoc DL(Op);
8486   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8487   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8488   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8489   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8490   ArrayRef<int> Mask = SVOp->getMask();
8491   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8492
8493   if (isSingleInputShuffleMask(Mask)) {
8494     // Check for being able to broadcast a single element.
8495     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8496                                                           Mask, Subtarget, DAG))
8497       return Broadcast;
8498
8499     // Straight shuffle of a single input vector. For everything from SSE2
8500     // onward this has a single fast instruction with no scary immediates.
8501     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8502     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8503     int WidenedMask[4] = {
8504         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8505         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8506     return DAG.getNode(
8507         ISD::BITCAST, DL, MVT::v2i64,
8508         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8509                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8510   }
8511
8512   // Try to use byte shift instructions.
8513   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8514           DL, MVT::v2i64, V1, V2, Mask, DAG))
8515     return Shift;
8516
8517   // If we have a single input from V2 insert that into V1 if we can do so
8518   // cheaply.
8519   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8520     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8521             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8522       return Insertion;
8523     // Try inverting the insertion since for v2 masks it is easy to do and we
8524     // can't reliably sort the mask one way or the other.
8525     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8526                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8527     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8528             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8529       return Insertion;
8530   }
8531
8532   // Use dedicated unpack instructions for masks that match their pattern.
8533   if (isShuffleEquivalent(Mask, 0, 2))
8534     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8535   if (isShuffleEquivalent(Mask, 1, 3))
8536     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8537
8538   if (Subtarget->hasSSE41())
8539     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8540                                                   Subtarget, DAG))
8541       return Blend;
8542
8543   // Try to use byte rotation instructions.
8544   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8545   if (Subtarget->hasSSSE3())
8546     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8547             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8548       return Rotate;
8549
8550   // We implement this with SHUFPD which is pretty lame because it will likely
8551   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8552   // However, all the alternatives are still more cycles and newer chips don't
8553   // have this problem. It would be really nice if x86 had better shuffles here.
8554   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8555   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8556   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8557                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8558 }
8559
8560 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8561 ///
8562 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8563 /// It makes no assumptions about whether this is the *best* lowering, it simply
8564 /// uses it.
8565 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8566                                             ArrayRef<int> Mask, SDValue V1,
8567                                             SDValue V2, SelectionDAG &DAG) {
8568   SDValue LowV = V1, HighV = V2;
8569   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8570
8571   int NumV2Elements =
8572       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8573
8574   if (NumV2Elements == 1) {
8575     int V2Index =
8576         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8577         Mask.begin();
8578
8579     // Compute the index adjacent to V2Index and in the same half by toggling
8580     // the low bit.
8581     int V2AdjIndex = V2Index ^ 1;
8582
8583     if (Mask[V2AdjIndex] == -1) {
8584       // Handles all the cases where we have a single V2 element and an undef.
8585       // This will only ever happen in the high lanes because we commute the
8586       // vector otherwise.
8587       if (V2Index < 2)
8588         std::swap(LowV, HighV);
8589       NewMask[V2Index] -= 4;
8590     } else {
8591       // Handle the case where the V2 element ends up adjacent to a V1 element.
8592       // To make this work, blend them together as the first step.
8593       int V1Index = V2AdjIndex;
8594       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8595       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8596                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8597
8598       // Now proceed to reconstruct the final blend as we have the necessary
8599       // high or low half formed.
8600       if (V2Index < 2) {
8601         LowV = V2;
8602         HighV = V1;
8603       } else {
8604         HighV = V2;
8605       }
8606       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8607       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8608     }
8609   } else if (NumV2Elements == 2) {
8610     if (Mask[0] < 4 && Mask[1] < 4) {
8611       // Handle the easy case where we have V1 in the low lanes and V2 in the
8612       // high lanes.
8613       NewMask[2] -= 4;
8614       NewMask[3] -= 4;
8615     } else if (Mask[2] < 4 && Mask[3] < 4) {
8616       // We also handle the reversed case because this utility may get called
8617       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8618       // arrange things in the right direction.
8619       NewMask[0] -= 4;
8620       NewMask[1] -= 4;
8621       HighV = V1;
8622       LowV = V2;
8623     } else {
8624       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8625       // trying to place elements directly, just blend them and set up the final
8626       // shuffle to place them.
8627
8628       // The first two blend mask elements are for V1, the second two are for
8629       // V2.
8630       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8631                           Mask[2] < 4 ? Mask[2] : Mask[3],
8632                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8633                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8634       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8635                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8636
8637       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8638       // a blend.
8639       LowV = HighV = V1;
8640       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8641       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8642       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8643       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8644     }
8645   }
8646   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8647                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8648 }
8649
8650 /// \brief Lower 4-lane 32-bit floating point shuffles.
8651 ///
8652 /// Uses instructions exclusively from the floating point unit to minimize
8653 /// domain crossing penalties, as these are sufficient to implement all v4f32
8654 /// shuffles.
8655 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8656                                        const X86Subtarget *Subtarget,
8657                                        SelectionDAG &DAG) {
8658   SDLoc DL(Op);
8659   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8660   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8661   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8662   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8663   ArrayRef<int> Mask = SVOp->getMask();
8664   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8665
8666   int NumV2Elements =
8667       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8668
8669   if (NumV2Elements == 0) {
8670     // Check for being able to broadcast a single element.
8671     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8672                                                           Mask, Subtarget, DAG))
8673       return Broadcast;
8674
8675     // Use even/odd duplicate instructions for masks that match their pattern.
8676     if (Subtarget->hasSSE3()) {
8677       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8678         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8679       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8680         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8681     }
8682
8683     if (Subtarget->hasAVX()) {
8684       // If we have AVX, we can use VPERMILPS which will allow folding a load
8685       // into the shuffle.
8686       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8687                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8688     }
8689
8690     // Otherwise, use a straight shuffle of a single input vector. We pass the
8691     // input vector to both operands to simulate this with a SHUFPS.
8692     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8693                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8694   }
8695
8696   // Use dedicated unpack instructions for masks that match their pattern.
8697   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8698     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8699   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8700     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8701
8702   // There are special ways we can lower some single-element blends. However, we
8703   // have custom ways we can lower more complex single-element blends below that
8704   // we defer to if both this and BLENDPS fail to match, so restrict this to
8705   // when the V2 input is targeting element 0 of the mask -- that is the fast
8706   // case here.
8707   if (NumV2Elements == 1 && Mask[0] >= 4)
8708     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8709                                                          Mask, Subtarget, DAG))
8710       return V;
8711
8712   if (Subtarget->hasSSE41()) {
8713     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8714                                                   Subtarget, DAG))
8715       return Blend;
8716
8717     // Use INSERTPS if we can complete the shuffle efficiently.
8718     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8719       return V;
8720   }
8721
8722   // Otherwise fall back to a SHUFPS lowering strategy.
8723   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8724 }
8725
8726 /// \brief Lower 4-lane i32 vector shuffles.
8727 ///
8728 /// We try to handle these with integer-domain shuffles where we can, but for
8729 /// blends we use the floating point domain blend instructions.
8730 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8731                                        const X86Subtarget *Subtarget,
8732                                        SelectionDAG &DAG) {
8733   SDLoc DL(Op);
8734   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8735   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8736   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8737   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8738   ArrayRef<int> Mask = SVOp->getMask();
8739   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8740
8741   // Whenever we can lower this as a zext, that instruction is strictly faster
8742   // than any alternative. It also allows us to fold memory operands into the
8743   // shuffle in many cases.
8744   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8745                                                          Mask, Subtarget, DAG))
8746     return ZExt;
8747
8748   int NumV2Elements =
8749       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8750
8751   if (NumV2Elements == 0) {
8752     // Check for being able to broadcast a single element.
8753     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8754                                                           Mask, Subtarget, DAG))
8755       return Broadcast;
8756
8757     // Straight shuffle of a single input vector. For everything from SSE2
8758     // onward this has a single fast instruction with no scary immediates.
8759     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8760     // but we aren't actually going to use the UNPCK instruction because doing
8761     // so prevents folding a load into this instruction or making a copy.
8762     const int UnpackLoMask[] = {0, 0, 1, 1};
8763     const int UnpackHiMask[] = {2, 2, 3, 3};
8764     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8765       Mask = UnpackLoMask;
8766     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8767       Mask = UnpackHiMask;
8768
8769     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8770                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8771   }
8772
8773   // Try to use bit shift instructions.
8774   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8775           DL, MVT::v4i32, V1, V2, Mask, DAG))
8776     return Shift;
8777
8778   // Try to use byte shift instructions.
8779   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8780           DL, MVT::v4i32, V1, V2, Mask, DAG))
8781     return Shift;
8782
8783   // There are special ways we can lower some single-element blends.
8784   if (NumV2Elements == 1)
8785     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8786                                                          Mask, Subtarget, DAG))
8787       return V;
8788
8789   if (Subtarget->hasSSE41())
8790     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8791                                                   Subtarget, DAG))
8792       return Blend;
8793
8794   if (SDValue Masked =
8795           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8796     return Masked;
8797
8798   // Use dedicated unpack instructions for masks that match their pattern.
8799   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8800     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8801   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8802     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8803
8804   // Try to use byte rotation instructions.
8805   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8806   if (Subtarget->hasSSSE3())
8807     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8808             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8809       return Rotate;
8810
8811   // We implement this with SHUFPS because it can blend from two vectors.
8812   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8813   // up the inputs, bypassing domain shift penalties that we would encur if we
8814   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8815   // relevant.
8816   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8817                      DAG.getVectorShuffle(
8818                          MVT::v4f32, DL,
8819                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8820                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8821 }
8822
8823 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8824 /// shuffle lowering, and the most complex part.
8825 ///
8826 /// The lowering strategy is to try to form pairs of input lanes which are
8827 /// targeted at the same half of the final vector, and then use a dword shuffle
8828 /// to place them onto the right half, and finally unpack the paired lanes into
8829 /// their final position.
8830 ///
8831 /// The exact breakdown of how to form these dword pairs and align them on the
8832 /// correct sides is really tricky. See the comments within the function for
8833 /// more of the details.
8834 static SDValue lowerV8I16SingleInputVectorShuffle(
8835     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8836     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8837   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8838   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8839   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8840
8841   SmallVector<int, 4> LoInputs;
8842   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8843                [](int M) { return M >= 0; });
8844   std::sort(LoInputs.begin(), LoInputs.end());
8845   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8846   SmallVector<int, 4> HiInputs;
8847   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8848                [](int M) { return M >= 0; });
8849   std::sort(HiInputs.begin(), HiInputs.end());
8850   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8851   int NumLToL =
8852       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8853   int NumHToL = LoInputs.size() - NumLToL;
8854   int NumLToH =
8855       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8856   int NumHToH = HiInputs.size() - NumLToH;
8857   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8858   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8859   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8860   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8861
8862   // Check for being able to broadcast a single element.
8863   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8864                                                         Mask, Subtarget, DAG))
8865     return Broadcast;
8866
8867   // Try to use bit shift instructions.
8868   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8869           DL, MVT::v8i16, V, V, Mask, DAG))
8870     return Shift;
8871
8872   // Try to use byte shift instructions.
8873   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8874           DL, MVT::v8i16, V, V, Mask, DAG))
8875     return Shift;
8876
8877   // Use dedicated unpack instructions for masks that match their pattern.
8878   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8879     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8880   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8881     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8882
8883   // Try to use byte rotation instructions.
8884   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8885           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8886     return Rotate;
8887
8888   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8889   // such inputs we can swap two of the dwords across the half mark and end up
8890   // with <=2 inputs to each half in each half. Once there, we can fall through
8891   // to the generic code below. For example:
8892   //
8893   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8894   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8895   //
8896   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8897   // and an existing 2-into-2 on the other half. In this case we may have to
8898   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8899   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8900   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8901   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8902   // half than the one we target for fixing) will be fixed when we re-enter this
8903   // path. We will also combine away any sequence of PSHUFD instructions that
8904   // result into a single instruction. Here is an example of the tricky case:
8905   //
8906   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8907   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8908   //
8909   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8910   //
8911   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8912   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8913   //
8914   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8915   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8916   //
8917   // The result is fine to be handled by the generic logic.
8918   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8919                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8920                           int AOffset, int BOffset) {
8921     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8922            "Must call this with A having 3 or 1 inputs from the A half.");
8923     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8924            "Must call this with B having 1 or 3 inputs from the B half.");
8925     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8926            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8927
8928     // Compute the index of dword with only one word among the three inputs in
8929     // a half by taking the sum of the half with three inputs and subtracting
8930     // the sum of the actual three inputs. The difference is the remaining
8931     // slot.
8932     int ADWord, BDWord;
8933     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8934     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8935     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8936     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8937     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8938     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8939     int TripleNonInputIdx =
8940         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8941     TripleDWord = TripleNonInputIdx / 2;
8942
8943     // We use xor with one to compute the adjacent DWord to whichever one the
8944     // OneInput is in.
8945     OneInputDWord = (OneInput / 2) ^ 1;
8946
8947     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8948     // and BToA inputs. If there is also such a problem with the BToB and AToB
8949     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8950     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8951     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8952     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8953       // Compute how many inputs will be flipped by swapping these DWords.
8954       // We need to balance this to ensure that the swap does not itself
8955       // form a 3-1 shuffle in the other half, which is the oscillation
8956       // hazard described above.
8957       int NumFlippedAToBInputs =
8958           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8959           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8960       int NumFlippedBToBInputs =
8961           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8962           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8963       if ((NumFlippedAToBInputs == 1 &&
8964            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8965           (NumFlippedBToBInputs == 1 &&
8966            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8967         // We choose whether to fix the A half or B half based on whether that
8968         // half has zero flipped inputs. At zero, we may not be able to fix it
8969         // with that half. We also bias towards fixing the B half because that
8970         // will more commonly be the high half, and we have to bias one way.
8971         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8972                                                        ArrayRef<int> Inputs) {
8973           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8974           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8975                                          PinnedIdx ^ 1) != Inputs.end();
8976           // Determine whether the free index is in the flipped dword or the
8977           // unflipped dword based on where the pinned index is. We use this bit
8978           // in an xor to conditionally select the adjacent dword.
8979           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8980           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8981                                              FixFreeIdx) != Inputs.end();
8982           if (IsFixIdxInput == IsFixFreeIdxInput)
8983             FixFreeIdx += 1;
8984           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8985                                         FixFreeIdx) != Inputs.end();
8986           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8987                  "We need to be changing the number of flipped inputs!");
8988           int PSHUFHalfMask[] = {0, 1, 2, 3};
8989           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8990           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8991                           MVT::v8i16, V,
8992                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8993
8994           for (int &M : Mask)
8995             if (M != -1 && M == FixIdx)
8996               M = FixFreeIdx;
8997             else if (M != -1 && M == FixFreeIdx)
8998               M = FixIdx;
8999         };
9000         if (NumFlippedBToBInputs != 0) {
9001           int BPinnedIdx =
9002               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9003           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9004         } else {
9005           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9006           int APinnedIdx =
9007               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9008           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9009         }
9010       }
9011     }
9012
9013     int PSHUFDMask[] = {0, 1, 2, 3};
9014     PSHUFDMask[ADWord] = BDWord;
9015     PSHUFDMask[BDWord] = ADWord;
9016     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9017                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9018                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9019                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9020
9021     // Adjust the mask to match the new locations of A and B.
9022     for (int &M : Mask)
9023       if (M != -1 && M/2 == ADWord)
9024         M = 2 * BDWord + M % 2;
9025       else if (M != -1 && M/2 == BDWord)
9026         M = 2 * ADWord + M % 2;
9027
9028     // Recurse back into this routine to re-compute state now that this isn't
9029     // a 3 and 1 problem.
9030     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9031                                 Mask);
9032   };
9033   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9034     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9035   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9036     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9037
9038   // At this point there are at most two inputs to the low and high halves from
9039   // each half. That means the inputs can always be grouped into dwords and
9040   // those dwords can then be moved to the correct half with a dword shuffle.
9041   // We use at most one low and one high word shuffle to collect these paired
9042   // inputs into dwords, and finally a dword shuffle to place them.
9043   int PSHUFLMask[4] = {-1, -1, -1, -1};
9044   int PSHUFHMask[4] = {-1, -1, -1, -1};
9045   int PSHUFDMask[4] = {-1, -1, -1, -1};
9046
9047   // First fix the masks for all the inputs that are staying in their
9048   // original halves. This will then dictate the targets of the cross-half
9049   // shuffles.
9050   auto fixInPlaceInputs =
9051       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9052                     MutableArrayRef<int> SourceHalfMask,
9053                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9054     if (InPlaceInputs.empty())
9055       return;
9056     if (InPlaceInputs.size() == 1) {
9057       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9058           InPlaceInputs[0] - HalfOffset;
9059       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9060       return;
9061     }
9062     if (IncomingInputs.empty()) {
9063       // Just fix all of the in place inputs.
9064       for (int Input : InPlaceInputs) {
9065         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9066         PSHUFDMask[Input / 2] = Input / 2;
9067       }
9068       return;
9069     }
9070
9071     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9072     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9073         InPlaceInputs[0] - HalfOffset;
9074     // Put the second input next to the first so that they are packed into
9075     // a dword. We find the adjacent index by toggling the low bit.
9076     int AdjIndex = InPlaceInputs[0] ^ 1;
9077     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9078     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9079     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9080   };
9081   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9082   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9083
9084   // Now gather the cross-half inputs and place them into a free dword of
9085   // their target half.
9086   // FIXME: This operation could almost certainly be simplified dramatically to
9087   // look more like the 3-1 fixing operation.
9088   auto moveInputsToRightHalf = [&PSHUFDMask](
9089       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9090       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9091       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9092       int DestOffset) {
9093     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9094       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9095     };
9096     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9097                                                int Word) {
9098       int LowWord = Word & ~1;
9099       int HighWord = Word | 1;
9100       return isWordClobbered(SourceHalfMask, LowWord) ||
9101              isWordClobbered(SourceHalfMask, HighWord);
9102     };
9103
9104     if (IncomingInputs.empty())
9105       return;
9106
9107     if (ExistingInputs.empty()) {
9108       // Map any dwords with inputs from them into the right half.
9109       for (int Input : IncomingInputs) {
9110         // If the source half mask maps over the inputs, turn those into
9111         // swaps and use the swapped lane.
9112         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9113           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9114             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9115                 Input - SourceOffset;
9116             // We have to swap the uses in our half mask in one sweep.
9117             for (int &M : HalfMask)
9118               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9119                 M = Input;
9120               else if (M == Input)
9121                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9122           } else {
9123             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9124                        Input - SourceOffset &&
9125                    "Previous placement doesn't match!");
9126           }
9127           // Note that this correctly re-maps both when we do a swap and when
9128           // we observe the other side of the swap above. We rely on that to
9129           // avoid swapping the members of the input list directly.
9130           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9131         }
9132
9133         // Map the input's dword into the correct half.
9134         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9135           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9136         else
9137           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9138                      Input / 2 &&
9139                  "Previous placement doesn't match!");
9140       }
9141
9142       // And just directly shift any other-half mask elements to be same-half
9143       // as we will have mirrored the dword containing the element into the
9144       // same position within that half.
9145       for (int &M : HalfMask)
9146         if (M >= SourceOffset && M < SourceOffset + 4) {
9147           M = M - SourceOffset + DestOffset;
9148           assert(M >= 0 && "This should never wrap below zero!");
9149         }
9150       return;
9151     }
9152
9153     // Ensure we have the input in a viable dword of its current half. This
9154     // is particularly tricky because the original position may be clobbered
9155     // by inputs being moved and *staying* in that half.
9156     if (IncomingInputs.size() == 1) {
9157       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9158         int InputFixed = std::find(std::begin(SourceHalfMask),
9159                                    std::end(SourceHalfMask), -1) -
9160                          std::begin(SourceHalfMask) + SourceOffset;
9161         SourceHalfMask[InputFixed - SourceOffset] =
9162             IncomingInputs[0] - SourceOffset;
9163         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9164                      InputFixed);
9165         IncomingInputs[0] = InputFixed;
9166       }
9167     } else if (IncomingInputs.size() == 2) {
9168       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9169           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9170         // We have two non-adjacent or clobbered inputs we need to extract from
9171         // the source half. To do this, we need to map them into some adjacent
9172         // dword slot in the source mask.
9173         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9174                               IncomingInputs[1] - SourceOffset};
9175
9176         // If there is a free slot in the source half mask adjacent to one of
9177         // the inputs, place the other input in it. We use (Index XOR 1) to
9178         // compute an adjacent index.
9179         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9180             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9181           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9182           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9183           InputsFixed[1] = InputsFixed[0] ^ 1;
9184         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9185                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9186           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9187           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9188           InputsFixed[0] = InputsFixed[1] ^ 1;
9189         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9190                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9191           // The two inputs are in the same DWord but it is clobbered and the
9192           // adjacent DWord isn't used at all. Move both inputs to the free
9193           // slot.
9194           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9195           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9196           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9197           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9198         } else {
9199           // The only way we hit this point is if there is no clobbering
9200           // (because there are no off-half inputs to this half) and there is no
9201           // free slot adjacent to one of the inputs. In this case, we have to
9202           // swap an input with a non-input.
9203           for (int i = 0; i < 4; ++i)
9204             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9205                    "We can't handle any clobbers here!");
9206           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9207                  "Cannot have adjacent inputs here!");
9208
9209           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9210           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9211
9212           // We also have to update the final source mask in this case because
9213           // it may need to undo the above swap.
9214           for (int &M : FinalSourceHalfMask)
9215             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9216               M = InputsFixed[1] + SourceOffset;
9217             else if (M == InputsFixed[1] + SourceOffset)
9218               M = (InputsFixed[0] ^ 1) + SourceOffset;
9219
9220           InputsFixed[1] = InputsFixed[0] ^ 1;
9221         }
9222
9223         // Point everything at the fixed inputs.
9224         for (int &M : HalfMask)
9225           if (M == IncomingInputs[0])
9226             M = InputsFixed[0] + SourceOffset;
9227           else if (M == IncomingInputs[1])
9228             M = InputsFixed[1] + SourceOffset;
9229
9230         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9231         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9232       }
9233     } else {
9234       llvm_unreachable("Unhandled input size!");
9235     }
9236
9237     // Now hoist the DWord down to the right half.
9238     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9239     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9240     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9241     for (int &M : HalfMask)
9242       for (int Input : IncomingInputs)
9243         if (M == Input)
9244           M = FreeDWord * 2 + Input % 2;
9245   };
9246   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9247                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9248   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9249                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9250
9251   // Now enact all the shuffles we've computed to move the inputs into their
9252   // target half.
9253   if (!isNoopShuffleMask(PSHUFLMask))
9254     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9255                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9256   if (!isNoopShuffleMask(PSHUFHMask))
9257     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9258                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9259   if (!isNoopShuffleMask(PSHUFDMask))
9260     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9261                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9262                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9263                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9264
9265   // At this point, each half should contain all its inputs, and we can then
9266   // just shuffle them into their final position.
9267   assert(std::count_if(LoMask.begin(), LoMask.end(),
9268                        [](int M) { return M >= 4; }) == 0 &&
9269          "Failed to lift all the high half inputs to the low mask!");
9270   assert(std::count_if(HiMask.begin(), HiMask.end(),
9271                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9272          "Failed to lift all the low half inputs to the high mask!");
9273
9274   // Do a half shuffle for the low mask.
9275   if (!isNoopShuffleMask(LoMask))
9276     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9277                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9278
9279   // Do a half shuffle with the high mask after shifting its values down.
9280   for (int &M : HiMask)
9281     if (M >= 0)
9282       M -= 4;
9283   if (!isNoopShuffleMask(HiMask))
9284     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9285                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9286
9287   return V;
9288 }
9289
9290 /// \brief Detect whether the mask pattern should be lowered through
9291 /// interleaving.
9292 ///
9293 /// This essentially tests whether viewing the mask as an interleaving of two
9294 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9295 /// lowering it through interleaving is a significantly better strategy.
9296 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9297   int NumEvenInputs[2] = {0, 0};
9298   int NumOddInputs[2] = {0, 0};
9299   int NumLoInputs[2] = {0, 0};
9300   int NumHiInputs[2] = {0, 0};
9301   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9302     if (Mask[i] < 0)
9303       continue;
9304
9305     int InputIdx = Mask[i] >= Size;
9306
9307     if (i < Size / 2)
9308       ++NumLoInputs[InputIdx];
9309     else
9310       ++NumHiInputs[InputIdx];
9311
9312     if ((i % 2) == 0)
9313       ++NumEvenInputs[InputIdx];
9314     else
9315       ++NumOddInputs[InputIdx];
9316   }
9317
9318   // The minimum number of cross-input results for both the interleaved and
9319   // split cases. If interleaving results in fewer cross-input results, return
9320   // true.
9321   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9322                                     NumEvenInputs[0] + NumOddInputs[1]);
9323   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9324                               NumLoInputs[0] + NumHiInputs[1]);
9325   return InterleavedCrosses < SplitCrosses;
9326 }
9327
9328 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9329 ///
9330 /// This strategy only works when the inputs from each vector fit into a single
9331 /// half of that vector, and generally there are not so many inputs as to leave
9332 /// the in-place shuffles required highly constrained (and thus expensive). It
9333 /// shifts all the inputs into a single side of both input vectors and then
9334 /// uses an unpack to interleave these inputs in a single vector. At that
9335 /// point, we will fall back on the generic single input shuffle lowering.
9336 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9337                                                  SDValue V2,
9338                                                  MutableArrayRef<int> Mask,
9339                                                  const X86Subtarget *Subtarget,
9340                                                  SelectionDAG &DAG) {
9341   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9342   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9343   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9344   for (int i = 0; i < 8; ++i)
9345     if (Mask[i] >= 0 && Mask[i] < 4)
9346       LoV1Inputs.push_back(i);
9347     else if (Mask[i] >= 4 && Mask[i] < 8)
9348       HiV1Inputs.push_back(i);
9349     else if (Mask[i] >= 8 && Mask[i] < 12)
9350       LoV2Inputs.push_back(i);
9351     else if (Mask[i] >= 12)
9352       HiV2Inputs.push_back(i);
9353
9354   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9355   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9356   (void)NumV1Inputs;
9357   (void)NumV2Inputs;
9358   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9359   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9360   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9361
9362   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9363                      HiV1Inputs.size() + HiV2Inputs.size();
9364
9365   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9366                               ArrayRef<int> HiInputs, bool MoveToLo,
9367                               int MaskOffset) {
9368     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9369     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9370     if (BadInputs.empty())
9371       return V;
9372
9373     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9374     int MoveOffset = MoveToLo ? 0 : 4;
9375
9376     if (GoodInputs.empty()) {
9377       for (int BadInput : BadInputs) {
9378         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9379         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9380       }
9381     } else {
9382       if (GoodInputs.size() == 2) {
9383         // If the low inputs are spread across two dwords, pack them into
9384         // a single dword.
9385         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9386         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9387         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9388         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9389       } else {
9390         // Otherwise pin the good inputs.
9391         for (int GoodInput : GoodInputs)
9392           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9393       }
9394
9395       if (BadInputs.size() == 2) {
9396         // If we have two bad inputs then there may be either one or two good
9397         // inputs fixed in place. Find a fixed input, and then find the *other*
9398         // two adjacent indices by using modular arithmetic.
9399         int GoodMaskIdx =
9400             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9401                          [](int M) { return M >= 0; }) -
9402             std::begin(MoveMask);
9403         int MoveMaskIdx =
9404             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9405         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9406         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9407         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9408         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9409         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9410         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9411       } else {
9412         assert(BadInputs.size() == 1 && "All sizes handled");
9413         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9414                                     std::end(MoveMask), -1) -
9415                           std::begin(MoveMask);
9416         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9417         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9418       }
9419     }
9420
9421     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9422                                 MoveMask);
9423   };
9424   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9425                         /*MaskOffset*/ 0);
9426   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9427                         /*MaskOffset*/ 8);
9428
9429   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9430   // cross-half traffic in the final shuffle.
9431
9432   // Munge the mask to be a single-input mask after the unpack merges the
9433   // results.
9434   for (int &M : Mask)
9435     if (M != -1)
9436       M = 2 * (M % 4) + (M / 8);
9437
9438   return DAG.getVectorShuffle(
9439       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9440                                   DL, MVT::v8i16, V1, V2),
9441       DAG.getUNDEF(MVT::v8i16), Mask);
9442 }
9443
9444 /// \brief Generic lowering of 8-lane i16 shuffles.
9445 ///
9446 /// This handles both single-input shuffles and combined shuffle/blends with
9447 /// two inputs. The single input shuffles are immediately delegated to
9448 /// a dedicated lowering routine.
9449 ///
9450 /// The blends are lowered in one of three fundamental ways. If there are few
9451 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9452 /// of the input is significantly cheaper when lowered as an interleaving of
9453 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9454 /// halves of the inputs separately (making them have relatively few inputs)
9455 /// and then concatenate them.
9456 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9457                                        const X86Subtarget *Subtarget,
9458                                        SelectionDAG &DAG) {
9459   SDLoc DL(Op);
9460   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9461   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9462   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9463   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9464   ArrayRef<int> OrigMask = SVOp->getMask();
9465   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9466                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9467   MutableArrayRef<int> Mask(MaskStorage);
9468
9469   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9470
9471   // Whenever we can lower this as a zext, that instruction is strictly faster
9472   // than any alternative.
9473   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9474           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9475     return ZExt;
9476
9477   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9478   auto isV2 = [](int M) { return M >= 8; };
9479
9480   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9481   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9482
9483   if (NumV2Inputs == 0)
9484     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9485
9486   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9487                             "to be V1-input shuffles.");
9488
9489   // Try to use bit shift instructions.
9490   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9491           DL, MVT::v8i16, V1, V2, Mask, DAG))
9492     return Shift;
9493
9494   // Try to use byte shift instructions.
9495   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9496           DL, MVT::v8i16, V1, V2, Mask, DAG))
9497     return Shift;
9498
9499   // There are special ways we can lower some single-element blends.
9500   if (NumV2Inputs == 1)
9501     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9502                                                          Mask, Subtarget, DAG))
9503       return V;
9504
9505   if (Subtarget->hasSSE41())
9506     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9507                                                   Subtarget, DAG))
9508       return Blend;
9509
9510   if (SDValue Masked =
9511           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9512     return Masked;
9513
9514   // Use dedicated unpack instructions for masks that match their pattern.
9515   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9516     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9517   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9518     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9519
9520   // Try to use byte rotation instructions.
9521   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9522           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9523     return Rotate;
9524
9525   if (NumV1Inputs + NumV2Inputs <= 4)
9526     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9527
9528   // Check whether an interleaving lowering is likely to be more efficient.
9529   // This isn't perfect but it is a strong heuristic that tends to work well on
9530   // the kinds of shuffles that show up in practice.
9531   //
9532   // FIXME: Handle 1x, 2x, and 4x interleaving.
9533   if (shouldLowerAsInterleaving(Mask)) {
9534     // FIXME: Figure out whether we should pack these into the low or high
9535     // halves.
9536
9537     int EMask[8], OMask[8];
9538     for (int i = 0; i < 4; ++i) {
9539       EMask[i] = Mask[2*i];
9540       OMask[i] = Mask[2*i + 1];
9541       EMask[i + 4] = -1;
9542       OMask[i + 4] = -1;
9543     }
9544
9545     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9546     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9547
9548     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9549   }
9550
9551   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9552   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9553
9554   for (int i = 0; i < 4; ++i) {
9555     LoBlendMask[i] = Mask[i];
9556     HiBlendMask[i] = Mask[i + 4];
9557   }
9558
9559   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9560   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9561   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9562   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9563
9564   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9565                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9566 }
9567
9568 /// \brief Check whether a compaction lowering can be done by dropping even
9569 /// elements and compute how many times even elements must be dropped.
9570 ///
9571 /// This handles shuffles which take every Nth element where N is a power of
9572 /// two. Example shuffle masks:
9573 ///
9574 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9575 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9576 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9577 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9578 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9579 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9580 ///
9581 /// Any of these lanes can of course be undef.
9582 ///
9583 /// This routine only supports N <= 3.
9584 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9585 /// for larger N.
9586 ///
9587 /// \returns N above, or the number of times even elements must be dropped if
9588 /// there is such a number. Otherwise returns zero.
9589 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9590   // Figure out whether we're looping over two inputs or just one.
9591   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9592
9593   // The modulus for the shuffle vector entries is based on whether this is
9594   // a single input or not.
9595   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9596   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9597          "We should only be called with masks with a power-of-2 size!");
9598
9599   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9600
9601   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9602   // and 2^3 simultaneously. This is because we may have ambiguity with
9603   // partially undef inputs.
9604   bool ViableForN[3] = {true, true, true};
9605
9606   for (int i = 0, e = Mask.size(); i < e; ++i) {
9607     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9608     // want.
9609     if (Mask[i] == -1)
9610       continue;
9611
9612     bool IsAnyViable = false;
9613     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9614       if (ViableForN[j]) {
9615         uint64_t N = j + 1;
9616
9617         // The shuffle mask must be equal to (i * 2^N) % M.
9618         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9619           IsAnyViable = true;
9620         else
9621           ViableForN[j] = false;
9622       }
9623     // Early exit if we exhaust the possible powers of two.
9624     if (!IsAnyViable)
9625       break;
9626   }
9627
9628   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9629     if (ViableForN[j])
9630       return j + 1;
9631
9632   // Return 0 as there is no viable power of two.
9633   return 0;
9634 }
9635
9636 /// \brief Generic lowering of v16i8 shuffles.
9637 ///
9638 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9639 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9640 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9641 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9642 /// back together.
9643 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9644                                        const X86Subtarget *Subtarget,
9645                                        SelectionDAG &DAG) {
9646   SDLoc DL(Op);
9647   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9648   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9649   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9650   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9651   ArrayRef<int> OrigMask = SVOp->getMask();
9652   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9653
9654   // Try to use bit shift instructions.
9655   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9656           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9657     return Shift;
9658
9659   // Try to use byte shift instructions.
9660   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9661           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9662     return Shift;
9663
9664   // Try to use byte rotation instructions.
9665   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9666           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9667     return Rotate;
9668
9669   // Try to use a zext lowering.
9670   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9671           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9672     return ZExt;
9673
9674   int MaskStorage[16] = {
9675       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9676       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9677       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9678       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9679   MutableArrayRef<int> Mask(MaskStorage);
9680   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9681   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9682
9683   int NumV2Elements =
9684       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9685
9686   // For single-input shuffles, there are some nicer lowering tricks we can use.
9687   if (NumV2Elements == 0) {
9688     // Check for being able to broadcast a single element.
9689     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9690                                                           Mask, Subtarget, DAG))
9691       return Broadcast;
9692
9693     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9694     // Notably, this handles splat and partial-splat shuffles more efficiently.
9695     // However, it only makes sense if the pre-duplication shuffle simplifies
9696     // things significantly. Currently, this means we need to be able to
9697     // express the pre-duplication shuffle as an i16 shuffle.
9698     //
9699     // FIXME: We should check for other patterns which can be widened into an
9700     // i16 shuffle as well.
9701     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9702       for (int i = 0; i < 16; i += 2)
9703         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9704           return false;
9705
9706       return true;
9707     };
9708     auto tryToWidenViaDuplication = [&]() -> SDValue {
9709       if (!canWidenViaDuplication(Mask))
9710         return SDValue();
9711       SmallVector<int, 4> LoInputs;
9712       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9713                    [](int M) { return M >= 0 && M < 8; });
9714       std::sort(LoInputs.begin(), LoInputs.end());
9715       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9716                      LoInputs.end());
9717       SmallVector<int, 4> HiInputs;
9718       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9719                    [](int M) { return M >= 8; });
9720       std::sort(HiInputs.begin(), HiInputs.end());
9721       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9722                      HiInputs.end());
9723
9724       bool TargetLo = LoInputs.size() >= HiInputs.size();
9725       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9726       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9727
9728       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9729       SmallDenseMap<int, int, 8> LaneMap;
9730       for (int I : InPlaceInputs) {
9731         PreDupI16Shuffle[I/2] = I/2;
9732         LaneMap[I] = I;
9733       }
9734       int j = TargetLo ? 0 : 4, je = j + 4;
9735       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9736         // Check if j is already a shuffle of this input. This happens when
9737         // there are two adjacent bytes after we move the low one.
9738         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9739           // If we haven't yet mapped the input, search for a slot into which
9740           // we can map it.
9741           while (j < je && PreDupI16Shuffle[j] != -1)
9742             ++j;
9743
9744           if (j == je)
9745             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9746             return SDValue();
9747
9748           // Map this input with the i16 shuffle.
9749           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9750         }
9751
9752         // Update the lane map based on the mapping we ended up with.
9753         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9754       }
9755       V1 = DAG.getNode(
9756           ISD::BITCAST, DL, MVT::v16i8,
9757           DAG.getVectorShuffle(MVT::v8i16, DL,
9758                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9759                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9760
9761       // Unpack the bytes to form the i16s that will be shuffled into place.
9762       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9763                        MVT::v16i8, V1, V1);
9764
9765       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9766       for (int i = 0; i < 16; ++i)
9767         if (Mask[i] != -1) {
9768           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9769           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9770           if (PostDupI16Shuffle[i / 2] == -1)
9771             PostDupI16Shuffle[i / 2] = MappedMask;
9772           else
9773             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9774                    "Conflicting entrties in the original shuffle!");
9775         }
9776       return DAG.getNode(
9777           ISD::BITCAST, DL, MVT::v16i8,
9778           DAG.getVectorShuffle(MVT::v8i16, DL,
9779                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9780                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9781     };
9782     if (SDValue V = tryToWidenViaDuplication())
9783       return V;
9784   }
9785
9786   // Check whether an interleaving lowering is likely to be more efficient.
9787   // This isn't perfect but it is a strong heuristic that tends to work well on
9788   // the kinds of shuffles that show up in practice.
9789   //
9790   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9791   if (shouldLowerAsInterleaving(Mask)) {
9792     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9793       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9794     });
9795     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9796       return (M >= 8 && M < 16) || M >= 24;
9797     });
9798     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9799                      -1, -1, -1, -1, -1, -1, -1, -1};
9800     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9801                      -1, -1, -1, -1, -1, -1, -1, -1};
9802     bool UnpackLo = NumLoHalf >= NumHiHalf;
9803     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9804     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9805     for (int i = 0; i < 8; ++i) {
9806       TargetEMask[i] = Mask[2 * i];
9807       TargetOMask[i] = Mask[2 * i + 1];
9808     }
9809
9810     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9811     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9812
9813     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9814                        MVT::v16i8, Evens, Odds);
9815   }
9816
9817   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9818   // with PSHUFB. It is important to do this before we attempt to generate any
9819   // blends but after all of the single-input lowerings. If the single input
9820   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9821   // want to preserve that and we can DAG combine any longer sequences into
9822   // a PSHUFB in the end. But once we start blending from multiple inputs,
9823   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9824   // and there are *very* few patterns that would actually be faster than the
9825   // PSHUFB approach because of its ability to zero lanes.
9826   //
9827   // FIXME: The only exceptions to the above are blends which are exact
9828   // interleavings with direct instructions supporting them. We currently don't
9829   // handle those well here.
9830   if (Subtarget->hasSSSE3()) {
9831     SDValue V1Mask[16];
9832     SDValue V2Mask[16];
9833     bool V1InUse = false;
9834     bool V2InUse = false;
9835     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9836
9837     for (int i = 0; i < 16; ++i) {
9838       if (Mask[i] == -1) {
9839         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9840       } else {
9841         const int ZeroMask = 0x80;
9842         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9843         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9844         if (Zeroable[i])
9845           V1Idx = V2Idx = ZeroMask;
9846         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9847         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9848         V1InUse |= (ZeroMask != V1Idx);
9849         V2InUse |= (ZeroMask != V2Idx);
9850       }
9851     }
9852
9853     if (V1InUse)
9854       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9855                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9856     if (V2InUse)
9857       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9858                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9859
9860     // If we need shuffled inputs from both, blend the two.
9861     if (V1InUse && V2InUse)
9862       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9863     if (V1InUse)
9864       return V1; // Single inputs are easy.
9865     if (V2InUse)
9866       return V2; // Single inputs are easy.
9867     // Shuffling to a zeroable vector.
9868     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9869   }
9870
9871   // There are special ways we can lower some single-element blends.
9872   if (NumV2Elements == 1)
9873     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9874                                                          Mask, Subtarget, DAG))
9875       return V;
9876
9877   // Check whether a compaction lowering can be done. This handles shuffles
9878   // which take every Nth element for some even N. See the helper function for
9879   // details.
9880   //
9881   // We special case these as they can be particularly efficiently handled with
9882   // the PACKUSB instruction on x86 and they show up in common patterns of
9883   // rearranging bytes to truncate wide elements.
9884   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9885     // NumEvenDrops is the power of two stride of the elements. Another way of
9886     // thinking about it is that we need to drop the even elements this many
9887     // times to get the original input.
9888     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9889
9890     // First we need to zero all the dropped bytes.
9891     assert(NumEvenDrops <= 3 &&
9892            "No support for dropping even elements more than 3 times.");
9893     // We use the mask type to pick which bytes are preserved based on how many
9894     // elements are dropped.
9895     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9896     SDValue ByteClearMask =
9897         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9898                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9899     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9900     if (!IsSingleInput)
9901       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9902
9903     // Now pack things back together.
9904     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9905     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9906     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9907     for (int i = 1; i < NumEvenDrops; ++i) {
9908       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9909       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9910     }
9911
9912     return Result;
9913   }
9914
9915   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9916   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9917   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9918   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9919
9920   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9921                             MutableArrayRef<int> V1HalfBlendMask,
9922                             MutableArrayRef<int> V2HalfBlendMask) {
9923     for (int i = 0; i < 8; ++i)
9924       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9925         V1HalfBlendMask[i] = HalfMask[i];
9926         HalfMask[i] = i;
9927       } else if (HalfMask[i] >= 16) {
9928         V2HalfBlendMask[i] = HalfMask[i] - 16;
9929         HalfMask[i] = i + 8;
9930       }
9931   };
9932   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9933   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9934
9935   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9936
9937   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9938                              MutableArrayRef<int> HiBlendMask) {
9939     SDValue V1, V2;
9940     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9941     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9942     // i16s.
9943     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9944                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9945         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9946                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9947       // Use a mask to drop the high bytes.
9948       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9949       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9950                        DAG.getConstant(0x00FF, MVT::v8i16));
9951
9952       // This will be a single vector shuffle instead of a blend so nuke V2.
9953       V2 = DAG.getUNDEF(MVT::v8i16);
9954
9955       // Squash the masks to point directly into V1.
9956       for (int &M : LoBlendMask)
9957         if (M >= 0)
9958           M /= 2;
9959       for (int &M : HiBlendMask)
9960         if (M >= 0)
9961           M /= 2;
9962     } else {
9963       // Otherwise just unpack the low half of V into V1 and the high half into
9964       // V2 so that we can blend them as i16s.
9965       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9966                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9967       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9968                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9969     }
9970
9971     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9972     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9973     return std::make_pair(BlendedLo, BlendedHi);
9974   };
9975   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9976   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9977   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9978
9979   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9980   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9981
9982   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9983 }
9984
9985 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9986 ///
9987 /// This routine breaks down the specific type of 128-bit shuffle and
9988 /// dispatches to the lowering routines accordingly.
9989 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9990                                         MVT VT, const X86Subtarget *Subtarget,
9991                                         SelectionDAG &DAG) {
9992   switch (VT.SimpleTy) {
9993   case MVT::v2i64:
9994     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9995   case MVT::v2f64:
9996     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9997   case MVT::v4i32:
9998     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9999   case MVT::v4f32:
10000     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10001   case MVT::v8i16:
10002     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10003   case MVT::v16i8:
10004     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10005
10006   default:
10007     llvm_unreachable("Unimplemented!");
10008   }
10009 }
10010
10011 /// \brief Helper function to test whether a shuffle mask could be
10012 /// simplified by widening the elements being shuffled.
10013 ///
10014 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10015 /// leaves it in an unspecified state.
10016 ///
10017 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10018 /// shuffle masks. The latter have the special property of a '-2' representing
10019 /// a zero-ed lane of a vector.
10020 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10021                                     SmallVectorImpl<int> &WidenedMask) {
10022   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10023     // If both elements are undef, its trivial.
10024     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10025       WidenedMask.push_back(SM_SentinelUndef);
10026       continue;
10027     }
10028
10029     // Check for an undef mask and a mask value properly aligned to fit with
10030     // a pair of values. If we find such a case, use the non-undef mask's value.
10031     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10032       WidenedMask.push_back(Mask[i + 1] / 2);
10033       continue;
10034     }
10035     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10036       WidenedMask.push_back(Mask[i] / 2);
10037       continue;
10038     }
10039
10040     // When zeroing, we need to spread the zeroing across both lanes to widen.
10041     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10042       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10043           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10044         WidenedMask.push_back(SM_SentinelZero);
10045         continue;
10046       }
10047       return false;
10048     }
10049
10050     // Finally check if the two mask values are adjacent and aligned with
10051     // a pair.
10052     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10053       WidenedMask.push_back(Mask[i] / 2);
10054       continue;
10055     }
10056
10057     // Otherwise we can't safely widen the elements used in this shuffle.
10058     return false;
10059   }
10060   assert(WidenedMask.size() == Mask.size() / 2 &&
10061          "Incorrect size of mask after widening the elements!");
10062
10063   return true;
10064 }
10065
10066 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
10067 ///
10068 /// This routine just extracts two subvectors, shuffles them independently, and
10069 /// then concatenates them back together. This should work effectively with all
10070 /// AVX vector shuffle types.
10071 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10072                                           SDValue V2, ArrayRef<int> Mask,
10073                                           SelectionDAG &DAG) {
10074   assert(VT.getSizeInBits() >= 256 &&
10075          "Only for 256-bit or wider vector shuffles!");
10076   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10077   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10078
10079   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10080   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10081
10082   int NumElements = VT.getVectorNumElements();
10083   int SplitNumElements = NumElements / 2;
10084   MVT ScalarVT = VT.getScalarType();
10085   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10086
10087   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10088                              DAG.getIntPtrConstant(0));
10089   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10090                              DAG.getIntPtrConstant(SplitNumElements));
10091   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10092                              DAG.getIntPtrConstant(0));
10093   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10094                              DAG.getIntPtrConstant(SplitNumElements));
10095
10096   // Now create two 4-way blends of these half-width vectors.
10097   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10098     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10099     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10100     for (int i = 0; i < SplitNumElements; ++i) {
10101       int M = HalfMask[i];
10102       if (M >= NumElements) {
10103         if (M >= NumElements + SplitNumElements)
10104           UseHiV2 = true;
10105         else
10106           UseLoV2 = true;
10107         V2BlendMask.push_back(M - NumElements);
10108         V1BlendMask.push_back(-1);
10109         BlendMask.push_back(SplitNumElements + i);
10110       } else if (M >= 0) {
10111         if (M >= SplitNumElements)
10112           UseHiV1 = true;
10113         else
10114           UseLoV1 = true;
10115         V2BlendMask.push_back(-1);
10116         V1BlendMask.push_back(M);
10117         BlendMask.push_back(i);
10118       } else {
10119         V2BlendMask.push_back(-1);
10120         V1BlendMask.push_back(-1);
10121         BlendMask.push_back(-1);
10122       }
10123     }
10124
10125     // Because the lowering happens after all combining takes place, we need to
10126     // manually combine these blend masks as much as possible so that we create
10127     // a minimal number of high-level vector shuffle nodes.
10128
10129     // First try just blending the halves of V1 or V2.
10130     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10131       return DAG.getUNDEF(SplitVT);
10132     if (!UseLoV2 && !UseHiV2)
10133       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10134     if (!UseLoV1 && !UseHiV1)
10135       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10136
10137     SDValue V1Blend, V2Blend;
10138     if (UseLoV1 && UseHiV1) {
10139       V1Blend =
10140         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10141     } else {
10142       // We only use half of V1 so map the usage down into the final blend mask.
10143       V1Blend = UseLoV1 ? LoV1 : HiV1;
10144       for (int i = 0; i < SplitNumElements; ++i)
10145         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10146           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10147     }
10148     if (UseLoV2 && UseHiV2) {
10149       V2Blend =
10150         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10151     } else {
10152       // We only use half of V2 so map the usage down into the final blend mask.
10153       V2Blend = UseLoV2 ? LoV2 : HiV2;
10154       for (int i = 0; i < SplitNumElements; ++i)
10155         if (BlendMask[i] >= SplitNumElements)
10156           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10157     }
10158     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10159   };
10160   SDValue Lo = HalfBlend(LoMask);
10161   SDValue Hi = HalfBlend(HiMask);
10162   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10163 }
10164
10165 /// \brief Either split a vector in halves or decompose the shuffles and the
10166 /// blend.
10167 ///
10168 /// This is provided as a good fallback for many lowerings of non-single-input
10169 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10170 /// between splitting the shuffle into 128-bit components and stitching those
10171 /// back together vs. extracting the single-input shuffles and blending those
10172 /// results.
10173 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10174                                                 SDValue V2, ArrayRef<int> Mask,
10175                                                 SelectionDAG &DAG) {
10176   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10177                                             "lower single-input shuffles as it "
10178                                             "could then recurse on itself.");
10179   int Size = Mask.size();
10180
10181   // If this can be modeled as a broadcast of two elements followed by a blend,
10182   // prefer that lowering. This is especially important because broadcasts can
10183   // often fold with memory operands.
10184   auto DoBothBroadcast = [&] {
10185     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10186     for (int M : Mask)
10187       if (M >= Size) {
10188         if (V2BroadcastIdx == -1)
10189           V2BroadcastIdx = M - Size;
10190         else if (M - Size != V2BroadcastIdx)
10191           return false;
10192       } else if (M >= 0) {
10193         if (V1BroadcastIdx == -1)
10194           V1BroadcastIdx = M;
10195         else if (M != V1BroadcastIdx)
10196           return false;
10197       }
10198     return true;
10199   };
10200   if (DoBothBroadcast())
10201     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10202                                                       DAG);
10203
10204   // If the inputs all stem from a single 128-bit lane of each input, then we
10205   // split them rather than blending because the split will decompose to
10206   // unusually few instructions.
10207   int LaneCount = VT.getSizeInBits() / 128;
10208   int LaneSize = Size / LaneCount;
10209   SmallBitVector LaneInputs[2];
10210   LaneInputs[0].resize(LaneCount, false);
10211   LaneInputs[1].resize(LaneCount, false);
10212   for (int i = 0; i < Size; ++i)
10213     if (Mask[i] >= 0)
10214       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10215   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10216     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10217
10218   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10219   // that the decomposed single-input shuffles don't end up here.
10220   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10221 }
10222
10223 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10224 /// a permutation and blend of those lanes.
10225 ///
10226 /// This essentially blends the out-of-lane inputs to each lane into the lane
10227 /// from a permuted copy of the vector. This lowering strategy results in four
10228 /// instructions in the worst case for a single-input cross lane shuffle which
10229 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10230 /// of. Special cases for each particular shuffle pattern should be handled
10231 /// prior to trying this lowering.
10232 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10233                                                        SDValue V1, SDValue V2,
10234                                                        ArrayRef<int> Mask,
10235                                                        SelectionDAG &DAG) {
10236   // FIXME: This should probably be generalized for 512-bit vectors as well.
10237   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10238   int LaneSize = Mask.size() / 2;
10239
10240   // If there are only inputs from one 128-bit lane, splitting will in fact be
10241   // less expensive. The flags track wether the given lane contains an element
10242   // that crosses to another lane.
10243   bool LaneCrossing[2] = {false, false};
10244   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10245     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10246       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10247   if (!LaneCrossing[0] || !LaneCrossing[1])
10248     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10249
10250   if (isSingleInputShuffleMask(Mask)) {
10251     SmallVector<int, 32> FlippedBlendMask;
10252     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10253       FlippedBlendMask.push_back(
10254           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10255                                   ? Mask[i]
10256                                   : Mask[i] % LaneSize +
10257                                         (i / LaneSize) * LaneSize + Size));
10258
10259     // Flip the vector, and blend the results which should now be in-lane. The
10260     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10261     // 5 for the high source. The value 3 selects the high half of source 2 and
10262     // the value 2 selects the low half of source 2. We only use source 2 to
10263     // allow folding it into a memory operand.
10264     unsigned PERMMask = 3 | 2 << 4;
10265     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10266                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10267     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10268   }
10269
10270   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10271   // will be handled by the above logic and a blend of the results, much like
10272   // other patterns in AVX.
10273   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10274 }
10275
10276 /// \brief Handle lowering 2-lane 128-bit shuffles.
10277 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10278                                         SDValue V2, ArrayRef<int> Mask,
10279                                         const X86Subtarget *Subtarget,
10280                                         SelectionDAG &DAG) {
10281   // Blends are faster and handle all the non-lane-crossing cases.
10282   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10283                                                 Subtarget, DAG))
10284     return Blend;
10285
10286   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10287                                VT.getVectorNumElements() / 2);
10288   // Check for patterns which can be matched with a single insert of a 128-bit
10289   // subvector.
10290   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10291       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10292     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10293                               DAG.getIntPtrConstant(0));
10294     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10295                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10296     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10297   }
10298   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10299     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10300                               DAG.getIntPtrConstant(0));
10301     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10302                               DAG.getIntPtrConstant(2));
10303     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10304   }
10305
10306   // Otherwise form a 128-bit permutation.
10307   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10308   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10309   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10310                      DAG.getConstant(PermMask, MVT::i8));
10311 }
10312
10313 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10314 /// shuffling each lane.
10315 ///
10316 /// This will only succeed when the result of fixing the 128-bit lanes results
10317 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10318 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10319 /// the lane crosses early and then use simpler shuffles within each lane.
10320 ///
10321 /// FIXME: It might be worthwhile at some point to support this without
10322 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10323 /// in x86 only floating point has interesting non-repeating shuffles, and even
10324 /// those are still *marginally* more expensive.
10325 static SDValue lowerVectorShuffleByMerging128BitLanes(
10326     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10327     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10328   assert(!isSingleInputShuffleMask(Mask) &&
10329          "This is only useful with multiple inputs.");
10330
10331   int Size = Mask.size();
10332   int LaneSize = 128 / VT.getScalarSizeInBits();
10333   int NumLanes = Size / LaneSize;
10334   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10335
10336   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10337   // check whether the in-128-bit lane shuffles share a repeating pattern.
10338   SmallVector<int, 4> Lanes;
10339   Lanes.resize(NumLanes, -1);
10340   SmallVector<int, 4> InLaneMask;
10341   InLaneMask.resize(LaneSize, -1);
10342   for (int i = 0; i < Size; ++i) {
10343     if (Mask[i] < 0)
10344       continue;
10345
10346     int j = i / LaneSize;
10347
10348     if (Lanes[j] < 0) {
10349       // First entry we've seen for this lane.
10350       Lanes[j] = Mask[i] / LaneSize;
10351     } else if (Lanes[j] != Mask[i] / LaneSize) {
10352       // This doesn't match the lane selected previously!
10353       return SDValue();
10354     }
10355
10356     // Check that within each lane we have a consistent shuffle mask.
10357     int k = i % LaneSize;
10358     if (InLaneMask[k] < 0) {
10359       InLaneMask[k] = Mask[i] % LaneSize;
10360     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10361       // This doesn't fit a repeating in-lane mask.
10362       return SDValue();
10363     }
10364   }
10365
10366   // First shuffle the lanes into place.
10367   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10368                                 VT.getSizeInBits() / 64);
10369   SmallVector<int, 8> LaneMask;
10370   LaneMask.resize(NumLanes * 2, -1);
10371   for (int i = 0; i < NumLanes; ++i)
10372     if (Lanes[i] >= 0) {
10373       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10374       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10375     }
10376
10377   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10378   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10379   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10380
10381   // Cast it back to the type we actually want.
10382   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10383
10384   // Now do a simple shuffle that isn't lane crossing.
10385   SmallVector<int, 8> NewMask;
10386   NewMask.resize(Size, -1);
10387   for (int i = 0; i < Size; ++i)
10388     if (Mask[i] >= 0)
10389       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10390   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10391          "Must not introduce lane crosses at this point!");
10392
10393   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10394 }
10395
10396 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10397 /// given mask.
10398 ///
10399 /// This returns true if the elements from a particular input are already in the
10400 /// slot required by the given mask and require no permutation.
10401 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10402   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10403   int Size = Mask.size();
10404   for (int i = 0; i < Size; ++i)
10405     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10406       return false;
10407
10408   return true;
10409 }
10410
10411 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10412 ///
10413 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10414 /// isn't available.
10415 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10416                                        const X86Subtarget *Subtarget,
10417                                        SelectionDAG &DAG) {
10418   SDLoc DL(Op);
10419   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10420   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10421   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10422   ArrayRef<int> Mask = SVOp->getMask();
10423   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10424
10425   SmallVector<int, 4> WidenedMask;
10426   if (canWidenShuffleElements(Mask, WidenedMask))
10427     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10428                                     DAG);
10429
10430   if (isSingleInputShuffleMask(Mask)) {
10431     // Check for being able to broadcast a single element.
10432     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10433                                                           Mask, Subtarget, DAG))
10434       return Broadcast;
10435
10436     // Use low duplicate instructions for masks that match their pattern.
10437     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10438       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10439
10440     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10441       // Non-half-crossing single input shuffles can be lowerid with an
10442       // interleaved permutation.
10443       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10444                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10445       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10446                          DAG.getConstant(VPERMILPMask, MVT::i8));
10447     }
10448
10449     // With AVX2 we have direct support for this permutation.
10450     if (Subtarget->hasAVX2())
10451       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10452                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10453
10454     // Otherwise, fall back.
10455     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10456                                                    DAG);
10457   }
10458
10459   // X86 has dedicated unpack instructions that can handle specific blend
10460   // operations: UNPCKH and UNPCKL.
10461   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10462     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10463   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10464     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10465
10466   // If we have a single input to the zero element, insert that into V1 if we
10467   // can do so cheaply.
10468   int NumV2Elements =
10469       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10470   if (NumV2Elements == 1 && Mask[0] >= 4)
10471     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10472             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10473       return Insertion;
10474
10475   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10476                                                 Subtarget, DAG))
10477     return Blend;
10478
10479   // Check if the blend happens to exactly fit that of SHUFPD.
10480   if ((Mask[0] == -1 || Mask[0] < 2) &&
10481       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10482       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10483       (Mask[3] == -1 || Mask[3] >= 6)) {
10484     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10485                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10486     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10487                        DAG.getConstant(SHUFPDMask, MVT::i8));
10488   }
10489   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10490       (Mask[1] == -1 || Mask[1] < 2) &&
10491       (Mask[2] == -1 || Mask[2] >= 6) &&
10492       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10493     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10494                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10495     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10496                        DAG.getConstant(SHUFPDMask, MVT::i8));
10497   }
10498
10499   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10500   // shuffle. However, if we have AVX2 and either inputs are already in place,
10501   // we will be able to shuffle even across lanes the other input in a single
10502   // instruction so skip this pattern.
10503   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10504                                  isShuffleMaskInputInPlace(1, Mask))))
10505     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10506             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10507       return Result;
10508
10509   // If we have AVX2 then we always want to lower with a blend because an v4 we
10510   // can fully permute the elements.
10511   if (Subtarget->hasAVX2())
10512     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10513                                                       Mask, DAG);
10514
10515   // Otherwise fall back on generic lowering.
10516   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10517 }
10518
10519 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10520 ///
10521 /// This routine is only called when we have AVX2 and thus a reasonable
10522 /// instruction set for v4i64 shuffling..
10523 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10524                                        const X86Subtarget *Subtarget,
10525                                        SelectionDAG &DAG) {
10526   SDLoc DL(Op);
10527   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10528   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10529   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10530   ArrayRef<int> Mask = SVOp->getMask();
10531   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10532   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10533
10534   SmallVector<int, 4> WidenedMask;
10535   if (canWidenShuffleElements(Mask, WidenedMask))
10536     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10537                                     DAG);
10538
10539   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10540                                                 Subtarget, DAG))
10541     return Blend;
10542
10543   // Check for being able to broadcast a single element.
10544   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10545                                                         Mask, Subtarget, DAG))
10546     return Broadcast;
10547
10548   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10549   // use lower latency instructions that will operate on both 128-bit lanes.
10550   SmallVector<int, 2> RepeatedMask;
10551   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10552     if (isSingleInputShuffleMask(Mask)) {
10553       int PSHUFDMask[] = {-1, -1, -1, -1};
10554       for (int i = 0; i < 2; ++i)
10555         if (RepeatedMask[i] >= 0) {
10556           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10557           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10558         }
10559       return DAG.getNode(
10560           ISD::BITCAST, DL, MVT::v4i64,
10561           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10562                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10563                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10564     }
10565
10566     // Use dedicated unpack instructions for masks that match their pattern.
10567     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10568       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10569     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10570       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10571   }
10572
10573   // AVX2 provides a direct instruction for permuting a single input across
10574   // lanes.
10575   if (isSingleInputShuffleMask(Mask))
10576     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10577                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10578
10579   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10580   // shuffle. However, if we have AVX2 and either inputs are already in place,
10581   // we will be able to shuffle even across lanes the other input in a single
10582   // instruction so skip this pattern.
10583   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10584                                  isShuffleMaskInputInPlace(1, Mask))))
10585     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10586             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10587       return Result;
10588
10589   // Otherwise fall back on generic blend lowering.
10590   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10591                                                     Mask, DAG);
10592 }
10593
10594 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10595 ///
10596 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10597 /// isn't available.
10598 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10599                                        const X86Subtarget *Subtarget,
10600                                        SelectionDAG &DAG) {
10601   SDLoc DL(Op);
10602   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10603   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10604   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10605   ArrayRef<int> Mask = SVOp->getMask();
10606   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10607
10608   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10609                                                 Subtarget, DAG))
10610     return Blend;
10611
10612   // Check for being able to broadcast a single element.
10613   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10614                                                         Mask, Subtarget, DAG))
10615     return Broadcast;
10616
10617   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10618   // options to efficiently lower the shuffle.
10619   SmallVector<int, 4> RepeatedMask;
10620   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10621     assert(RepeatedMask.size() == 4 &&
10622            "Repeated masks must be half the mask width!");
10623
10624     // Use even/odd duplicate instructions for masks that match their pattern.
10625     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10626       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10627     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10628       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10629
10630     if (isSingleInputShuffleMask(Mask))
10631       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10632                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10633
10634     // Use dedicated unpack instructions for masks that match their pattern.
10635     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10636       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10637     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10638       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10639
10640     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10641     // have already handled any direct blends. We also need to squash the
10642     // repeated mask into a simulated v4f32 mask.
10643     for (int i = 0; i < 4; ++i)
10644       if (RepeatedMask[i] >= 8)
10645         RepeatedMask[i] -= 4;
10646     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10647   }
10648
10649   // If we have a single input shuffle with different shuffle patterns in the
10650   // two 128-bit lanes use the variable mask to VPERMILPS.
10651   if (isSingleInputShuffleMask(Mask)) {
10652     SDValue VPermMask[8];
10653     for (int i = 0; i < 8; ++i)
10654       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10655                                  : DAG.getConstant(Mask[i], MVT::i32);
10656     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10657       return DAG.getNode(
10658           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10659           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10660
10661     if (Subtarget->hasAVX2())
10662       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10663                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10664                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10665                                                  MVT::v8i32, VPermMask)),
10666                          V1);
10667
10668     // Otherwise, fall back.
10669     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10670                                                    DAG);
10671   }
10672
10673   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10674   // shuffle.
10675   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10676           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10677     return Result;
10678
10679   // If we have AVX2 then we always want to lower with a blend because at v8 we
10680   // can fully permute the elements.
10681   if (Subtarget->hasAVX2())
10682     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10683                                                       Mask, DAG);
10684
10685   // Otherwise fall back on generic lowering.
10686   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10687 }
10688
10689 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10690 ///
10691 /// This routine is only called when we have AVX2 and thus a reasonable
10692 /// instruction set for v8i32 shuffling..
10693 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10694                                        const X86Subtarget *Subtarget,
10695                                        SelectionDAG &DAG) {
10696   SDLoc DL(Op);
10697   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10698   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10699   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10700   ArrayRef<int> Mask = SVOp->getMask();
10701   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10702   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10703
10704   // Whenever we can lower this as a zext, that instruction is strictly faster
10705   // than any alternative. It also allows us to fold memory operands into the
10706   // shuffle in many cases.
10707   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10708                                                          Mask, Subtarget, DAG))
10709     return ZExt;
10710
10711   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10712                                                 Subtarget, DAG))
10713     return Blend;
10714
10715   // Check for being able to broadcast a single element.
10716   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10717                                                         Mask, Subtarget, DAG))
10718     return Broadcast;
10719
10720   // If the shuffle mask is repeated in each 128-bit lane we can use more
10721   // efficient instructions that mirror the shuffles across the two 128-bit
10722   // lanes.
10723   SmallVector<int, 4> RepeatedMask;
10724   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10725     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10726     if (isSingleInputShuffleMask(Mask))
10727       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10728                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10729
10730     // Use dedicated unpack instructions for masks that match their pattern.
10731     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10732       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10733     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10734       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10735   }
10736
10737   // If the shuffle patterns aren't repeated but it is a single input, directly
10738   // generate a cross-lane VPERMD instruction.
10739   if (isSingleInputShuffleMask(Mask)) {
10740     SDValue VPermMask[8];
10741     for (int i = 0; i < 8; ++i)
10742       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10743                                  : DAG.getConstant(Mask[i], MVT::i32);
10744     return DAG.getNode(
10745         X86ISD::VPERMV, DL, MVT::v8i32,
10746         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10747   }
10748
10749   // Try to use bit shift instructions.
10750   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10751           DL, MVT::v8i32, V1, V2, Mask, DAG))
10752     return Shift;
10753
10754   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10755   // shuffle.
10756   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10757           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10758     return Result;
10759
10760   // Otherwise fall back on generic blend lowering.
10761   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10762                                                     Mask, DAG);
10763 }
10764
10765 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10766 ///
10767 /// This routine is only called when we have AVX2 and thus a reasonable
10768 /// instruction set for v16i16 shuffling..
10769 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10770                                         const X86Subtarget *Subtarget,
10771                                         SelectionDAG &DAG) {
10772   SDLoc DL(Op);
10773   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10774   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10775   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10776   ArrayRef<int> Mask = SVOp->getMask();
10777   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10778   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10779
10780   // Whenever we can lower this as a zext, that instruction is strictly faster
10781   // than any alternative. It also allows us to fold memory operands into the
10782   // shuffle in many cases.
10783   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10784                                                          Mask, Subtarget, DAG))
10785     return ZExt;
10786
10787   // Check for being able to broadcast a single element.
10788   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10789                                                         Mask, Subtarget, DAG))
10790     return Broadcast;
10791
10792   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10793                                                 Subtarget, DAG))
10794     return Blend;
10795
10796   // Use dedicated unpack instructions for masks that match their pattern.
10797   if (isShuffleEquivalent(Mask,
10798                           // First 128-bit lane:
10799                           0, 16, 1, 17, 2, 18, 3, 19,
10800                           // Second 128-bit lane:
10801                           8, 24, 9, 25, 10, 26, 11, 27))
10802     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10803   if (isShuffleEquivalent(Mask,
10804                           // First 128-bit lane:
10805                           4, 20, 5, 21, 6, 22, 7, 23,
10806                           // Second 128-bit lane:
10807                           12, 28, 13, 29, 14, 30, 15, 31))
10808     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10809
10810   if (isSingleInputShuffleMask(Mask)) {
10811     // There are no generalized cross-lane shuffle operations available on i16
10812     // element types.
10813     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10814       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10815                                                      Mask, DAG);
10816
10817     SDValue PSHUFBMask[32];
10818     for (int i = 0; i < 16; ++i) {
10819       if (Mask[i] == -1) {
10820         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10821         continue;
10822       }
10823
10824       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10825       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10826       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10827       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10828     }
10829     return DAG.getNode(
10830         ISD::BITCAST, DL, MVT::v16i16,
10831         DAG.getNode(
10832             X86ISD::PSHUFB, DL, MVT::v32i8,
10833             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10834             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10835   }
10836
10837   // Try to use bit shift instructions.
10838   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10839           DL, MVT::v16i16, V1, V2, Mask, DAG))
10840     return Shift;
10841
10842   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10843   // shuffle.
10844   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10845           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10846     return Result;
10847
10848   // Otherwise fall back on generic lowering.
10849   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10850 }
10851
10852 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10853 ///
10854 /// This routine is only called when we have AVX2 and thus a reasonable
10855 /// instruction set for v32i8 shuffling..
10856 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10857                                        const X86Subtarget *Subtarget,
10858                                        SelectionDAG &DAG) {
10859   SDLoc DL(Op);
10860   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10861   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10862   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10863   ArrayRef<int> Mask = SVOp->getMask();
10864   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10865   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10866
10867   // Whenever we can lower this as a zext, that instruction is strictly faster
10868   // than any alternative. It also allows us to fold memory operands into the
10869   // shuffle in many cases.
10870   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10871                                                          Mask, Subtarget, DAG))
10872     return ZExt;
10873
10874   // Check for being able to broadcast a single element.
10875   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10876                                                         Mask, Subtarget, DAG))
10877     return Broadcast;
10878
10879   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10880                                                 Subtarget, DAG))
10881     return Blend;
10882
10883   // Use dedicated unpack instructions for masks that match their pattern.
10884   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10885   // 256-bit lanes.
10886   if (isShuffleEquivalent(
10887           Mask,
10888           // First 128-bit lane:
10889           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10890           // Second 128-bit lane:
10891           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10892     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10893   if (isShuffleEquivalent(
10894           Mask,
10895           // First 128-bit lane:
10896           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10897           // Second 128-bit lane:
10898           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10899     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10900
10901   if (isSingleInputShuffleMask(Mask)) {
10902     // There are no generalized cross-lane shuffle operations available on i8
10903     // element types.
10904     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10905       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10906                                                      Mask, DAG);
10907
10908     SDValue PSHUFBMask[32];
10909     for (int i = 0; i < 32; ++i)
10910       PSHUFBMask[i] =
10911           Mask[i] < 0
10912               ? DAG.getUNDEF(MVT::i8)
10913               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10914
10915     return DAG.getNode(
10916         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10917         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10918   }
10919
10920   // Try to use bit shift instructions.
10921   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10922           DL, MVT::v32i8, V1, V2, Mask, DAG))
10923     return Shift;
10924
10925   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10926   // shuffle.
10927   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10928           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10929     return Result;
10930
10931   // Otherwise fall back on generic lowering.
10932   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10933 }
10934
10935 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10936 ///
10937 /// This routine either breaks down the specific type of a 256-bit x86 vector
10938 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10939 /// together based on the available instructions.
10940 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10941                                         MVT VT, const X86Subtarget *Subtarget,
10942                                         SelectionDAG &DAG) {
10943   SDLoc DL(Op);
10944   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10945   ArrayRef<int> Mask = SVOp->getMask();
10946
10947   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10948   // check for those subtargets here and avoid much of the subtarget querying in
10949   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10950   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10951   // floating point types there eventually, just immediately cast everything to
10952   // a float and operate entirely in that domain.
10953   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10954     int ElementBits = VT.getScalarSizeInBits();
10955     if (ElementBits < 32)
10956       // No floating point type available, decompose into 128-bit vectors.
10957       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10958
10959     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10960                                 VT.getVectorNumElements());
10961     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10962     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10963     return DAG.getNode(ISD::BITCAST, DL, VT,
10964                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10965   }
10966
10967   switch (VT.SimpleTy) {
10968   case MVT::v4f64:
10969     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10970   case MVT::v4i64:
10971     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10972   case MVT::v8f32:
10973     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10974   case MVT::v8i32:
10975     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10976   case MVT::v16i16:
10977     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10978   case MVT::v32i8:
10979     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10980
10981   default:
10982     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10983   }
10984 }
10985
10986 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10987 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10988                                        const X86Subtarget *Subtarget,
10989                                        SelectionDAG &DAG) {
10990   SDLoc DL(Op);
10991   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10992   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10993   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10994   ArrayRef<int> Mask = SVOp->getMask();
10995   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10996
10997   // X86 has dedicated unpack instructions that can handle specific blend
10998   // operations: UNPCKH and UNPCKL.
10999   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11000     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11001   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11002     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11003
11004   // FIXME: Implement direct support for this type!
11005   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11006 }
11007
11008 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11009 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11010                                        const X86Subtarget *Subtarget,
11011                                        SelectionDAG &DAG) {
11012   SDLoc DL(Op);
11013   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11014   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11015   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11016   ArrayRef<int> Mask = SVOp->getMask();
11017   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11018
11019   // Use dedicated unpack instructions for masks that match their pattern.
11020   if (isShuffleEquivalent(Mask,
11021                           0, 16, 1, 17, 4, 20, 5, 21,
11022                           8, 24, 9, 25, 12, 28, 13, 29))
11023     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11024   if (isShuffleEquivalent(Mask,
11025                           2, 18, 3, 19, 6, 22, 7, 23,
11026                           10, 26, 11, 27, 14, 30, 15, 31))
11027     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11028
11029   // FIXME: Implement direct support for this type!
11030   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11031 }
11032
11033 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11034 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11035                                        const X86Subtarget *Subtarget,
11036                                        SelectionDAG &DAG) {
11037   SDLoc DL(Op);
11038   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11039   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11040   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11041   ArrayRef<int> Mask = SVOp->getMask();
11042   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11043
11044   // X86 has dedicated unpack instructions that can handle specific blend
11045   // operations: UNPCKH and UNPCKL.
11046   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11047     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11048   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11049     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11050
11051   // FIXME: Implement direct support for this type!
11052   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11053 }
11054
11055 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11056 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11057                                        const X86Subtarget *Subtarget,
11058                                        SelectionDAG &DAG) {
11059   SDLoc DL(Op);
11060   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11061   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11062   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11063   ArrayRef<int> Mask = SVOp->getMask();
11064   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11065
11066   // Use dedicated unpack instructions for masks that match their pattern.
11067   if (isShuffleEquivalent(Mask,
11068                           0, 16, 1, 17, 4, 20, 5, 21,
11069                           8, 24, 9, 25, 12, 28, 13, 29))
11070     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11071   if (isShuffleEquivalent(Mask,
11072                           2, 18, 3, 19, 6, 22, 7, 23,
11073                           10, 26, 11, 27, 14, 30, 15, 31))
11074     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11075
11076   // FIXME: Implement direct support for this type!
11077   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11078 }
11079
11080 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11081 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11082                                         const X86Subtarget *Subtarget,
11083                                         SelectionDAG &DAG) {
11084   SDLoc DL(Op);
11085   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11086   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11087   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11088   ArrayRef<int> Mask = SVOp->getMask();
11089   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11090   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11091
11092   // FIXME: Implement direct support for this type!
11093   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11094 }
11095
11096 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11097 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11098                                        const X86Subtarget *Subtarget,
11099                                        SelectionDAG &DAG) {
11100   SDLoc DL(Op);
11101   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11102   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11103   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11104   ArrayRef<int> Mask = SVOp->getMask();
11105   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11106   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11107
11108   // FIXME: Implement direct support for this type!
11109   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11110 }
11111
11112 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11113 ///
11114 /// This routine either breaks down the specific type of a 512-bit x86 vector
11115 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11116 /// together based on the available instructions.
11117 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11118                                         MVT VT, const X86Subtarget *Subtarget,
11119                                         SelectionDAG &DAG) {
11120   SDLoc DL(Op);
11121   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11122   ArrayRef<int> Mask = SVOp->getMask();
11123   assert(Subtarget->hasAVX512() &&
11124          "Cannot lower 512-bit vectors w/ basic ISA!");
11125
11126   // Check for being able to broadcast a single element.
11127   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11128                                                         Mask, Subtarget, DAG))
11129     return Broadcast;
11130
11131   // Dispatch to each element type for lowering. If we don't have supprot for
11132   // specific element type shuffles at 512 bits, immediately split them and
11133   // lower them. Each lowering routine of a given type is allowed to assume that
11134   // the requisite ISA extensions for that element type are available.
11135   switch (VT.SimpleTy) {
11136   case MVT::v8f64:
11137     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11138   case MVT::v16f32:
11139     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11140   case MVT::v8i64:
11141     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11142   case MVT::v16i32:
11143     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11144   case MVT::v32i16:
11145     if (Subtarget->hasBWI())
11146       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11147     break;
11148   case MVT::v64i8:
11149     if (Subtarget->hasBWI())
11150       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11151     break;
11152
11153   default:
11154     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11155   }
11156
11157   // Otherwise fall back on splitting.
11158   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11159 }
11160
11161 /// \brief Top-level lowering for x86 vector shuffles.
11162 ///
11163 /// This handles decomposition, canonicalization, and lowering of all x86
11164 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11165 /// above in helper routines. The canonicalization attempts to widen shuffles
11166 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11167 /// s.t. only one of the two inputs needs to be tested, etc.
      ///
      /// Every canonicalization step returns a newly built shuffle (commuted,
      /// re-masked or widened) instead of lowering it here; only a shuffle that
      /// is already canonical reaches the per-width dispatch at the end.
11168 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11169                                   SelectionDAG &DAG) {
11170   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11171   ArrayRef<int> Mask = SVOp->getMask();
11172   SDValue V1 = Op.getOperand(0);
11173   SDValue V2 = Op.getOperand(1);
11174   MVT VT = Op.getSimpleValueType();
11175   int NumElements = VT.getVectorNumElements();
11176   SDLoc dl(Op);
11177
11178   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11179
      // A shuffle whose two inputs are both undef is replaced by undef.
11180   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11181   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11182   if (V1IsUndef && V2IsUndef)
11183     return DAG.getUNDEF(VT);
11184
11185   // When we create a shuffle node we put the UNDEF node to second operand,
11186   // but in some cases the first operand may be transformed to UNDEF.
11187   // In this case we should just commute the node.
11188   if (V1IsUndef)
11189     return DAG.getCommutedVectorShuffle(*SVOp);
11190
11191   // Check for non-undef masks pointing at an undef vector and make the masks
11192   // undef as well. This makes it easier to match the shuffle based solely on
11193   // the mask. Mask values >= NumElements are the ones that refer to V2; the
      // rewritten shuffle is returned as soon as the first such value is found.
11194   if (V2IsUndef)
11195     for (int M : Mask)
11196       if (M >= NumElements) {
11197         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11198         for (int &M : NewMask)
11199           if (M >= NumElements)
11200             M = -1;
11201         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11202       }
11203
11204   // Try to collapse shuffles into using a vector type with fewer elements but
11205   // wider element types. We cap this to not form integers or floating point
11206   // elements wider than 64 bits, but it might be interesting to form i128
11207   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11208   SmallVector<int, 16> WidenedMask;
11209   if (VT.getScalarSizeInBits() < 64 &&
11210       canWidenShuffleElements(Mask, WidenedMask)) {
      // The widened type has half as many elements, each twice as wide.
11211     MVT NewEltVT = VT.isFloatingPoint()
11212                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11213                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11214     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11215     // Make sure that the new vector type is legal. For example, v2f64 isn't
11216     // legal on SSE1.
11217     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11218       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11219       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11220       return DAG.getNode(ISD::BITCAST, dl, VT,
11221                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11222     }
11223   }
11224
      // Count how many result elements are undef, come from V1, or come from V2.
11225   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11226   for (int M : SVOp->getMask())
11227     if (M < 0)
11228       ++NumUndefElements;
11229     else if (M < NumElements)
11230       ++NumV1Elements;
11231     else
11232       ++NumV2Elements;
11233
11234   // Commute the shuffle as needed such that more elements come from V1 than
11235   // V2. This allows us to match the shuffle pattern strictly on how many
11236   // elements come from V1 without handling the symmetric cases.
11237   if (NumV2Elements > NumV1Elements)
11238     return DAG.getCommutedVectorShuffle(*SVOp);
11239
11240   // When the number of V1 and V2 elements are the same, try to minimize the
11241   // number of uses of V2 in the low half of the vector. When that is tied,
11242   // ensure that the sum of indices for V1 is equal to or lower than the sum
11243   // of indices for V2. When those are equal, try to ensure that the number of
11244   // odd indices for V1 is not higher than the number of odd indices for V2.
      // The indices summed and tested for oddness below are positions in the
      // result (the loop counter i), not the mask values themselves.
11245   if (NumV1Elements == NumV2Elements) {
11246     int LowV1Elements = 0, LowV2Elements = 0;
11247     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11248       if (M >= NumElements)
11249         ++LowV2Elements;
11250       else if (M >= 0)
11251         ++LowV1Elements;
11252     if (LowV2Elements > LowV1Elements) {
11253       return DAG.getCommutedVectorShuffle(*SVOp);
11254     } else if (LowV2Elements == LowV1Elements) {
11255       int SumV1Indices = 0, SumV2Indices = 0;
11256       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11257         if (SVOp->getMask()[i] >= NumElements)
11258           SumV2Indices += i;
11259         else if (SVOp->getMask()[i] >= 0)
11260           SumV1Indices += i;
11261       if (SumV2Indices < SumV1Indices) {
11262         return DAG.getCommutedVectorShuffle(*SVOp);
11263       } else if (SumV2Indices == SumV1Indices) {
11264         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11265         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11266           if (SVOp->getMask()[i] >= NumElements)
11267             NumV2OddIndices += i % 2;
11268           else if (SVOp->getMask()[i] >= 0)
11269             NumV1OddIndices += i % 2;
11270         if (NumV2OddIndices < NumV1OddIndices)
11271           return DAG.getCommutedVectorShuffle(*SVOp);
11272       }
11273     }
11274   }
11275
11276   // For each vector width, delegate to a specialized lowering routine.
11277   if (VT.getSizeInBits() == 128)
11278     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11279
11280   if (VT.getSizeInBits() == 256)
11281     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11282
11283   // Dispatch 512-bit vectors to the AVX-512 lowering routine.
11284   // FIXME: Several 512-bit element types are still lowered by splitting.
11285   if (VT.getSizeInBits() == 512)
11286     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11287
11288   llvm_unreachable("Unimplemented!");
11289 }
11290
11291
11292 //===----------------------------------------------------------------------===//
11293 // Legacy vector shuffle lowering
11294 //
11295 // This code is the legacy code handling vector shuffles until the above
11296 // replaces its functionality and performance.
11297 //===----------------------------------------------------------------------===//
11298
11299 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11300                         bool hasInt256, unsigned *MaskOut = nullptr) {
11301   MVT EltVT = VT.getVectorElementType();
11302
11303   // There is no blend with immediate in AVX-512.
11304   if (VT.is512BitVector())
11305     return false;
11306
11307   if (!hasSSE41 || EltVT == MVT::i8)
11308     return false;
11309   if (!hasInt256 && VT == MVT::v16i16)
11310     return false;
11311
11312   unsigned MaskValue = 0;
11313   unsigned NumElems = VT.getVectorNumElements();
11314   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11315   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11316   unsigned NumElemsInLane = NumElems / NumLanes;
11317
11318   // Blend for v16i16 should be symmetric for both lanes.
11319   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11320
11321     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11322     int EltIdx = MaskVals[i];
11323
11324     if ((EltIdx < 0 || EltIdx == (int)i) &&
11325         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11326       continue;
11327
11328     if (((unsigned)EltIdx == (i + NumElems)) &&
11329         (SndLaneEltIdx < 0 ||
11330          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11331       MaskValue |= (1 << i);
11332     else
11333       return false;
11334   }
11335
11336   if (MaskOut)
11337     *MaskOut = MaskValue;
11338   return true;
11339 }
11340
11341 // Try to lower a shuffle node into a simple blend instruction.
11342 // This function assumes isBlendMask returns true for this
11343 // SuffleVectorSDNode
11344 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11345                                           unsigned MaskValue,
11346                                           const X86Subtarget *Subtarget,
11347                                           SelectionDAG &DAG) {
11348   MVT VT = SVOp->getSimpleValueType(0);
11349   MVT EltVT = VT.getVectorElementType();
11350   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11351                      Subtarget->hasInt256() && "Trying to lower a "
11352                                                "VECTOR_SHUFFLE to a Blend but "
11353                                                "with the wrong mask"));
11354   SDValue V1 = SVOp->getOperand(0);
11355   SDValue V2 = SVOp->getOperand(1);
11356   SDLoc dl(SVOp);
11357   unsigned NumElems = VT.getVectorNumElements();
11358
11359   // Convert i32 vectors to floating point if it is not AVX2.
11360   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11361   MVT BlendVT = VT;
11362   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11363     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11364                                NumElems);
11365     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11366     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11367   }
11368
11369   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11370                             DAG.getConstant(MaskValue, MVT::i32));
11371   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11372 }
11373
11374 /// Return true when elements \p InputIdx and \p OutputIdx of a vector of type
11375 /// \p VT do not belong to the same 128-bit lane.
11376 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11377                                      unsigned OutputIdx) {
11378   unsigned BitsPerElt = VT.getVectorElementType().getSizeInBits();
11379   return (InputIdx * BitsPerElt) / 128 != (OutputIdx * BitsPerElt) / 128;
11380 }
11381
11382 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11383 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11384 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11385 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11386 /// zero.
11387 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11388                          SelectionDAG &DAG) {
11389   MVT VT = V1.getSimpleValueType();
11390   assert(VT.is128BitVector() || VT.is256BitVector());
11391
11392   MVT EltVT = VT.getVectorElementType();
11393   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11394   unsigned NumElts = VT.getVectorNumElements();
11395
11396   SmallVector<SDValue, 32> PshufbMask;
11397   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11398     int InputIdx = MaskVals[OutputIdx];
11399     unsigned InputByteIdx;
11400
11401     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11402       InputByteIdx = 0x80;
11403     else {
11404       // Cross lane is not allowed.
11405       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11406         return SDValue();
11407       InputByteIdx = InputIdx * EltSizeInBytes;
11408       // Index is an byte offset within the 128-bit lane.
11409       InputByteIdx &= 0xf;
11410     }
11411
11412     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11413       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11414       if (InputByteIdx != 0x80)
11415         ++InputByteIdx;
11416     }
11417   }
11418
11419   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11420   if (ShufVT != VT)
11421     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11422   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11423                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11424 }
11425
11426 // v8i16 shuffles - Prefer shuffles in the following order:
11427 // 1. [all]   pshuflw, pshufhw, optional move
11428 // 2. [ssse3] 1 x pshufb
11429 // 3. [ssse3] 2 x pshufb + 1 x por
11430 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
      //
      // A "quad" below is one 64-bit half (four words) of a 128-bit vector. Quads
      // 0 and 1 are the halves of the first shuffle operand, quads 2 and 3 the
      // halves of the second, so a mask value EltIdx lives in quad EltIdx / 4.
11431 static SDValue
11432 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11433                          SelectionDAG &DAG) {
11434   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11435   SDValue V1 = SVOp->getOperand(0);
11436   SDValue V2 = SVOp->getOperand(1);
11437   SDLoc dl(SVOp);
      // Working copy of the mask; it is rewritten when the inputs are merged.
11438   SmallVector<int, 8> MaskVals;
11439
11440   // Determine if more than 1 of the words in each of the low and high quadwords
11441   // of the result come from the same quadword of one of the two inputs.  Undef
11442   // mask values count as coming from any quadword, for better codegen.
11443   //
11444   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11445   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11446   unsigned LoQuad[] = { 0, 0, 0, 0 };
11447   unsigned HiQuad[] = { 0, 0, 0, 0 };
11448   // Indices of quads used.
11449   std::bitset<4> InputQuads;
11450   for (unsigned i = 0; i < 8; ++i) {
11451     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11452     int EltIdx = SVOp->getMaskElt(i);
11453     MaskVals.push_back(EltIdx);
11454     if (EltIdx < 0) {
11455       ++Quad[0];
11456       ++Quad[1];
11457       ++Quad[2];
11458       ++Quad[3];
11459       continue;
11460     }
11461     ++Quad[EltIdx / 4];
11462     InputQuads.set(EltIdx / 4);
11463   }
11464
      // Select the input quad feeding the most words into the low result quad.
      // MaxQuad starts at 1, so a quad is only chosen when it supplies at least
      // two words (undef words included); otherwise BestLoQuad stays -1.
11465   int BestLoQuad = -1;
11466   unsigned MaxQuad = 1;
11467   for (unsigned i = 0; i < 4; ++i) {
11468     if (LoQuad[i] > MaxQuad) {
11469       BestLoQuad = i;
11470       MaxQuad = LoQuad[i];
11471     }
11472   }
11473
      // Same selection for the high result quad.
11474   int BestHiQuad = -1;
11475   MaxQuad = 1;
11476   for (unsigned i = 0; i < 4; ++i) {
11477     if (HiQuad[i] > MaxQuad) {
11478       BestHiQuad = i;
11479       MaxQuad = HiQuad[i];
11480     }
11481   }
11482
11483   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11484   // of the two input vectors, shuffle them into one input vector so only a
11485   // single pshufb instruction is necessary. If there are more than 2 input
11486   // quads, disable the next transformation since it does not help SSSE3.
11487   bool V1Used = InputQuads[0] || InputQuads[1];
11488   bool V2Used = InputQuads[2] || InputQuads[3];
11489   if (Subtarget->hasSSSE3()) {
11490     if (InputQuads.count() == 2 && V1Used && V2Used) {
11491       BestLoQuad = InputQuads[0] ? 0 : 1;
11492       BestHiQuad = InputQuads[2] ? 2 : 3;
11493     }
11494     if (InputQuads.count() > 2) {
11495       BestLoQuad = -1;
11496       BestHiQuad = -1;
11497     }
11498   }
11499
11500   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11501   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11502   // words from all 4 input quadwords.
11503   SDValue NewV;
11504   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
      // The quad numbers 0-3 are used directly as mask values of a v2i64
      // shuffle of (V1, V2); an unset quad defaults to the matching half of V1.
11505     int MaskV[] = {
11506       BestLoQuad < 0 ? 0 : BestLoQuad,
11507       BestHiQuad < 0 ? 1 : BestHiQuad
11508     };
11509     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11510                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11511                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11512     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11513
11514     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11515     // source words for the shuffle, to aid later transformations.
      // InOrder[h] records whether half h of the original mask is the identity.
      // The loop stops at the first word missing from NewV, so InOrder is only
      // complete, and only consulted below, when AllWordsInNewV stays true.
11516     bool AllWordsInNewV = true;
11517     bool InOrder[2] = { true, true };
11518     for (unsigned i = 0; i != 8; ++i) {
11519       int idx = MaskVals[i];
11520       if (idx != (int)i)
11521         InOrder[i/4] = false;
11522       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11523         continue;
11524       AllWordsInNewV = false;
11525       break;
11526     }
11527
11528     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11529     if (AllWordsInNewV) {
      // Re-express every mask value relative to NewV: words of BestLoQuad map
      // to 0-3, all others to 4-7. A moved word in the low half rules out
      // pshufhw, a moved word in the high half rules out pshuflw.
11530       for (int i = 0; i != 8; ++i) {
11531         int idx = MaskVals[i];
11532         if (idx < 0)
11533           continue;
11534         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11535         if ((idx != i) && idx < 4)
11536           pshufhw = false;
11537         if ((idx != i) && idx > 3)
11538           pshuflw = false;
11539       }
11540       V1 = NewV;
11541       V2Used = false;
11542       BestLoQuad = 0;
11543       BestHiQuad = 1;
11544     }
11545
11546     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11547     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11548     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11549       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11550       unsigned TargetMask = 0;
11551       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11552                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
      // This SVOp shadows the function-level one and names the new shuffle.
11553       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11554       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11555                              getShufflePSHUFLWImmediate(SVOp);
11556       V1 = NewV.getOperand(0);
11557       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11558     }
11559   }
11560
11561   // Promote splats to a larger type which usually leads to more efficient code.
11562   // FIXME: Is this true if pshufb is available?
11563   if (SVOp->isSplat())
11564     return PromoteSplat(SVOp, DAG);
11565
11566   // If we have SSSE3, and all words of the result are from 1 input vector,
11567   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11568   // is present, fall back to case 4.
11569   if (Subtarget->hasSSSE3()) {
      // NOTE(review): pshufbMask is declared but never referenced in this
      // function; the byte masks are built inside getPSHUFB.
11570     SmallVector<SDValue,16> pshufbMask;
11571
11572     // If we have elements from both input vectors, set the high bit of the
11573     // shuffle mask element to zero out elements that come from V2 in the V1
11574     // mask, and elements that come from V1 in the V2 mask, so that the two
11575     // results can be OR'd together.
11576     bool TwoInputs = V1Used && V2Used;
11577     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11578     if (!TwoInputs)
11579       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11580
11581     // Calculate the shuffle mask for the second input, shuffle it, and
11582     // OR it with the first shuffled input.
11583     CommuteVectorShuffleMask(MaskVals, 8);
11584     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11585     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11586     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11587   }
11588
11589   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11590   // and update MaskVals with new element order.
      // From here on this InOrder has one bit per result word: set when the word
      // is undef or already holds its final value in NewV.
11591   std::bitset<8> InOrder;
11592   if (BestLoQuad >= 0) {
11593     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11594     for (int i = 0; i != 4; ++i) {
11595       int idx = MaskVals[i];
11596       if (idx < 0) {
11597         InOrder.set(i);
11598       } else if ((idx / 4) == BestLoQuad) {
11599         MaskV[i] = idx & 3;
11600         InOrder.set(i);
11601       }
11602     }
11603     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11604                                 &MaskV[0]);
11605
11606     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11607       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11608       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11609                                   NewV.getOperand(0),
11610                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11611     }
11612   }
11613
11614   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11615   // and update MaskVals with the new element order.
11616   if (BestHiQuad >= 0) {
11617     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11618     for (unsigned i = 4; i != 8; ++i) {
11619       int idx = MaskVals[i];
11620       if (idx < 0) {
11621         InOrder.set(i);
11622       } else if ((idx / 4) == BestHiQuad) {
11623         MaskV[i] = (idx & 3) + 4;
11624         InOrder.set(i);
11625       }
11626     }
11627     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11628                                 &MaskV[0]);
11629
11630     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11631       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11632       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11633                                   NewV.getOperand(0),
11634                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11635     }
11636   }
11637
11638   // In case BestHi & BestLo were both -1, which means each quadword has a word
11639   // from each of the four input quadwords, calculate the InOrder bitvector now
11640   // before falling through to the insert/extract cleanup.
11641   if (BestLoQuad == -1 && BestHiQuad == -1) {
11642     NewV = V1;
11643     for (int i = 0; i != 8; ++i)
11644       if (MaskVals[i] < 0 || MaskVals[i] == i)
11645         InOrder.set(i);
11646   }
11647
11648   // The other elements are put in the right place using pextrw and pinsrw.
      // Mask values below 8 are read from V1, the others from V2 at EltIdx - 8.
11649   for (unsigned i = 0; i != 8; ++i) {
11650     if (InOrder[i])
11651       continue;
11652     int EltIdx = MaskVals[i];
11653     if (EltIdx < 0)
11654       continue;
11655     SDValue ExtOp = (EltIdx < 8) ?
11656       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11657                   DAG.getIntPtrConstant(EltIdx)) :
11658       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11659                   DAG.getIntPtrConstant(EltIdx - 8));
11660     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11661                        DAG.getIntPtrConstant(i));
11662   }
11663   return NewV;
11664 }
11665
11666 /// \brief Lower a v16i16 shuffle to a single PSHUFB when possible.
11667 ///
11668 /// FIXME: Only the single-pshufb case is generated for now.  The remaining
11669 /// applicable cases from LowerVECTOR_SHUFFLEv8i16 (e.g 2 x pshufb + 1 x por)
11670 /// could be generalized to this type as well.
11671 static SDValue
11672 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11673   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(Op);
11674
11675   // Two-input shuffles are not handled here: give up unless V2 is undef.
11676   if (Shuffle->getOperand(1).getOpcode() != ISD::UNDEF)
11677     return SDValue();
11678
11679   SDLoc dl(Shuffle);
11680   ArrayRef<int> Mask = Shuffle->getMask();
11681   SmallVector<int, 16> MaskVals(Mask.begin(), Mask.end());
11682   return getPSHUFB(MaskVals, Shuffle->getOperand(0), dl, DAG);
11683 }
11684
11685 // v16i8 shuffles - Prefer shuffles in the following order:
11686 // 1. [ssse3] 1 x pshufb
11687 // 2. [ssse3] 2 x pshufb + 1 x por
11688 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11689 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11690                                         const X86Subtarget* Subtarget,
11691                                         SelectionDAG &DAG) {
11692   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11693   SDValue V1 = SVOp->getOperand(0);
11694   SDValue V2 = SVOp->getOperand(1);
11695   SDLoc dl(SVOp);
11696   ArrayRef<int> MaskVals = SVOp->getMask();
11697
11698   // Promote splats to a larger type which usually leads to more efficient code.
11699   // FIXME: Is this true if pshufb is available?
11700   if (SVOp->isSplat())
11701     return PromoteSplat(SVOp, DAG);
11702
11703   // If we have SSSE3, case 1 is generated when all result bytes come from
11704   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11705   // present, fall back to case 3.
11706
11707   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11708   if (Subtarget->hasSSSE3()) {
11709     SmallVector<SDValue,16> pshufbMask;
11710
11711     // If all result elements are from one input vector, then only translate
11712     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11713     //
11714     // Otherwise, we have elements from both input vectors, and must zero out
11715     // elements that come from V2 in the first mask, and V1 in the second mask
11716     // so that we can OR them together.
11717     for (unsigned i = 0; i != 16; ++i) {
11718       int EltIdx = MaskVals[i];
11719       if (EltIdx < 0 || EltIdx >= 16)
11720         EltIdx = 0x80;
11721       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11722     }
11723     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11724                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11725                                  MVT::v16i8, pshufbMask));
11726
11727     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11728     // the 2nd operand if it's undefined or zero.
11729     if (V2.getOpcode() == ISD::UNDEF ||
11730         ISD::isBuildVectorAllZeros(V2.getNode()))
11731       return V1;
11732
11733     // Calculate the shuffle mask for the second input, shuffle it, and
11734     // OR it with the first shuffled input.
11735     pshufbMask.clear();
11736     for (unsigned i = 0; i != 16; ++i) {
11737       int EltIdx = MaskVals[i];
11738       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11739       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11740     }
11741     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11742                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11743                                  MVT::v16i8, pshufbMask));
11744     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11745   }
11746
11747   // No SSSE3 - Calculate in place words and then fix all out of place words
11748   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11749   // the 16 different words that comprise the two doublequadword input vectors.
11750   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11751   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11752   SDValue NewV = V1;
11753   for (int i = 0; i != 8; ++i) {
11754     int Elt0 = MaskVals[i*2];
11755     int Elt1 = MaskVals[i*2+1];
11756
11757     // This word of the result is all undef, skip it.
11758     if (Elt0 < 0 && Elt1 < 0)
11759       continue;
11760
11761     // This word of the result is already in the correct place, skip it.
11762     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11763       continue;
11764
11765     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11766     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11767     SDValue InsElt;
11768
11769     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11770     // using a single extract together, load it and store it.
11771     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11772       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11773                            DAG.getIntPtrConstant(Elt1 / 2));
11774       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11775                         DAG.getIntPtrConstant(i));
11776       continue;
11777     }
11778
11779     // If Elt1 is defined, extract it from the appropriate source.  If the
11780     // source byte is not also odd, shift the extracted word left 8 bits
11781     // otherwise clear the bottom 8 bits if we need to do an or.
11782     if (Elt1 >= 0) {
11783       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11784                            DAG.getIntPtrConstant(Elt1 / 2));
11785       if ((Elt1 & 1) == 0)
11786         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11787                              DAG.getConstant(8,
11788                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11789       else if (Elt0 >= 0)
11790         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11791                              DAG.getConstant(0xFF00, MVT::i16));
11792     }
11793     // If Elt0 is defined, extract it from the appropriate source.  If the
11794     // source byte is not also even, shift the extracted word right 8 bits. If
11795     // Elt1 was also defined, OR the extracted values together before
11796     // inserting them in the result.
11797     if (Elt0 >= 0) {
11798       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11799                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11800       if ((Elt0 & 1) != 0)
11801         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11802                               DAG.getConstant(8,
11803                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11804       else if (Elt1 >= 0)
11805         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11806                              DAG.getConstant(0x00FF, MVT::i16));
11807       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11808                          : InsElt0;
11809     }
11810     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11811                        DAG.getIntPtrConstant(i));
11812   }
11813   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11814 }
11815
11816 // v32i8 shuffles - Translate to VPSHUFB if possible.
11817 static
11818 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11819                                  const X86Subtarget *Subtarget,
11820                                  SelectionDAG &DAG) {
11821   MVT VT = SVOp->getSimpleValueType(0);
11822   SDValue V1 = SVOp->getOperand(0);
11823   SDValue V2 = SVOp->getOperand(1);
11824   SDLoc dl(SVOp);
11825   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11826
11827   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11828   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11829   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11830
11831   // VPSHUFB may be generated if
11832   // (1) one of input vector is undefined or zeroinitializer.
11833   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11834   // And (2) the mask indexes don't cross the 128-bit lane.
11835   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11836       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11837     return SDValue();
11838
11839   if (V1IsAllZero && !V2IsAllZero) {
11840     CommuteVectorShuffleMask(MaskVals, 32);
11841     V1 = V2;
11842   }
11843   return getPSHUFB(MaskVals, V1, dl, DAG);
11844 }
11845
11846 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11847 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11848 /// done when every pair / quad of shuffle mask elements point to elements in
11849 /// the right sequence. e.g.
11850 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11851 static
11852 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11853                                  SelectionDAG &DAG) {
11854   MVT VT = SVOp->getSimpleValueType(0);
11855   SDLoc dl(SVOp);
11856   unsigned NumElems = VT.getVectorNumElements();
11857   MVT NewVT;
11858   unsigned Scale;
11859   switch (VT.SimpleTy) {
11860   default: llvm_unreachable("Unexpected!");
11861   case MVT::v2i64:
11862   case MVT::v2f64:
11863            return SDValue(SVOp, 0);
11864   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11865   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11866   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11867   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11868   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11869   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11870   }
11871
11872   SmallVector<int, 8> MaskVec;
11873   for (unsigned i = 0; i != NumElems; i += Scale) {
11874     int StartIdx = -1;
11875     for (unsigned j = 0; j != Scale; ++j) {
11876       int EltIdx = SVOp->getMaskElt(i+j);
11877       if (EltIdx < 0)
11878         continue;
11879       if (StartIdx < 0)
11880         StartIdx = (EltIdx / Scale);
11881       if (EltIdx != (int)(StartIdx*Scale + j))
11882         return SDValue();
11883     }
11884     MaskVec.push_back(StartIdx);
11885   }
11886
11887   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11888   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11889   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11890 }
11891
11892 /// getVZextMovL - Return a zero-extending vector move low node.
11893 ///
11894 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11895                             SDValue SrcOp, SelectionDAG &DAG,
11896                             const X86Subtarget *Subtarget, SDLoc dl) {
11897   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11898     LoadSDNode *LD = nullptr;
11899     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11900       LD = dyn_cast<LoadSDNode>(SrcOp);
11901     if (!LD) {
11902       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11903       // instead.
11904       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11905       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11906           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11907           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11908           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11909         // PR2108
11910         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11911         return DAG.getNode(ISD::BITCAST, dl, VT,
11912                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11913                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11914                                                    OpVT,
11915                                                    SrcOp.getOperand(0)
11916                                                           .getOperand(0))));
11917       }
11918     }
11919   }
11920
11921   return DAG.getNode(ISD::BITCAST, dl, VT,
11922                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11923                                  DAG.getNode(ISD::BITCAST, dl,
11924                                              OpVT, SrcOp)));
11925 }
11926
11927 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11928 /// which could not be matched by any known target specific shuffle.
      ///
      /// Compact8x32ShuffleNode is tried first. If it produces nothing, each half
      /// of the result (NumElems / 2 elements) is built on its own: either as a
      /// shuffle of at most two half-width pieces of the operands or, when more
      /// than two pieces are referenced, as a BUILD_VECTOR of individually
      /// extracted elements. The two halves are then joined with CONCAT_VECTORS.
11929 static SDValue
11930 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11931
11932   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11933   if (NewOp.getNode())
11934     return NewOp;
11935
11936   MVT VT = SVOp->getSimpleValueType(0);
11937
11938   unsigned NumElems = VT.getVectorNumElements();
11939   unsigned NumLaneElems = NumElems / 2;
11940
11941   SDLoc dl(SVOp);
11942   MVT EltVT = VT.getVectorElementType();
        // NVT is the half-width vector type used for each half of the result.
11943   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11944   SDValue Output[2];
11945
        // Mask is reused for both halves; it is cleared at the end of each
        // iteration of the outer loop.
11946   SmallVector<int, 16> Mask;
11947   for (unsigned l = 0; l < 2; ++l) {
11948     // Build a shuffle mask for the output, discovering on the fly which
11949     // input vectors to use as shuffle operands (recorded in InputUsed).
11950     // If building a suitable shuffle vector proves too hard, then bail
11951     // out with UseBuildVector set.
11952     bool UseBuildVector = false;
11953     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11954     unsigned LaneStart = l * NumLaneElems;
11955     for (unsigned i = 0; i != NumLaneElems; ++i) {
11956       // The mask element.  This indexes into the input.
11957       int Idx = SVOp->getMaskElt(i+LaneStart);
11958       if (Idx < 0) {
11959         // the mask element does not index into any input vector.
11960         Mask.push_back(-1);
11961         continue;
11962       }
11963
11964       // The input vector this mask element indexes into.
            // Inputs are numbered in half-width units: Input / 2 selects the
            // shuffle operand and Input % 2 selects which half of it (see the
            // Extract128BitVector calls below).
11965       int Input = Idx / NumLaneElems;
11966
11967       // Turn the index into an offset from the start of the input vector.
11968       Idx -= Input * NumLaneElems;
11969
11970       // Find or create a shuffle vector operand to hold this input.
11971       unsigned OpNo;
11972       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11973         if (InputUsed[OpNo] == Input)
11974           // This input vector is already an operand.
11975           break;
11976         if (InputUsed[OpNo] < 0) {
11977           // Create a new operand for this input vector.
11978           InputUsed[OpNo] = Input;
11979           break;
11980         }
11981       }
11982
11983       if (OpNo >= array_lengthof(InputUsed)) {
11984         // More than two input vectors used!  Give up on trying to create a
11985         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11986         UseBuildVector = true;
11987         break;
11988       }
11989
11990       // Add the mask index for the new shuffle vector.
11991       Mask.push_back(Idx + OpNo * NumLaneElems);
11992     }
11993
11994     if (UseBuildVector) {
            // The partially filled Mask is not used on this path; every element
            // of this half is re-derived from the original shuffle mask.
11995       SmallVector<SDValue, 16> SVOps;
11996       for (unsigned i = 0; i != NumLaneElems; ++i) {
11997         // The mask element.  This indexes into the input.
11998         int Idx = SVOp->getMaskElt(i+LaneStart);
11999         if (Idx < 0) {
12000           SVOps.push_back(DAG.getUNDEF(EltVT));
12001           continue;
12002         }
12003
12004         // The input vector this mask element indexes into.
              // Unlike above, Input here selects a whole shuffle operand
              // (the divisor is NumElems, not NumLaneElems).
12005         int Input = Idx / NumElems;
12006
12007         // Turn the index into an offset from the start of the input vector.
12008         Idx -= Input * NumElems;
12009
12010         // Extract the vector element by hand.
12011         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12012                                     SVOp->getOperand(Input),
12013                                     DAG.getIntPtrConstant(Idx)));
12014       }
12015
12016       // Construct the output using a BUILD_VECTOR.
12017       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12018     } else if (InputUsed[0] < 0) {
12019       // No input vectors were used! The result is undefined.
12020       Output[l] = DAG.getUNDEF(NVT);
12021     } else {
12022       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12023                                         (InputUsed[0] % 2) * NumLaneElems,
12024                                         DAG, dl);
12025       // If only one input was used, use an undefined vector for the other.
12026       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12027         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12028                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12029       // At least one input vector was used. Create a new shuffle vector.
12030       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12031     }
12032
12033     Mask.clear();
12034   }
12035
12036   // Concatenate the result back
12037   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12038 }
12039
12040 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12041 /// 4 elements, and match them with several different shuffle types.
      ///
      /// The strategy depends on how many mask elements select from V1 (mask
      /// values 0-3, counted in NumLo) and from V2 (mask values 4-7, counted in
      /// NumHi):
      ///  * at most two from each operand: one shuffle gathers the elements and
      ///    a second one puts them in order;
      ///  * three from one operand and one from the other: two shuffles, as
      ///    described in the body;
      ///  * otherwise: the two low and the two high result elements are
      ///    shuffled separately and a third shuffle combines them.
12042 static SDValue
12043 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12044   SDValue V1 = SVOp->getOperand(0);
12045   SDValue V2 = SVOp->getOperand(1);
12046   SDLoc dl(SVOp);
12047   MVT VT = SVOp->getSimpleValueType(0);
12048
12049   assert(VT.is128BitVector() && "Unsupported vector size");
12050
        // Locs[i] records where result element i can be found after the first
        // round of shuffling; (-1, -1) marks an undefined element.
12051   std::pair<int, int> Locs[4];
12052   int Mask1[] = { -1, -1, -1, -1 };
12053   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12054
        // Count the elements taken from each operand. Mask1 collects V1 elements
        // from slot 0 upwards and V2 elements from slot 2 upwards; a V2 element
        // that would land beyond slot 3 is only counted, not stored.
12055   unsigned NumHi = 0;
12056   unsigned NumLo = 0;
12057   for (unsigned i = 0; i != 4; ++i) {
12058     int Idx = PermMask[i];
12059     if (Idx < 0) {
12060       Locs[i] = std::make_pair(-1, -1);
12061     } else {
12062       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12063       if (Idx < 4) {
12064         Locs[i] = std::make_pair(0, NumLo);
12065         Mask1[NumLo] = Idx;
12066         NumLo++;
12067       } else {
12068         Locs[i] = std::make_pair(1, NumHi);
12069         if (2+NumHi < 4)
12070           Mask1[2+NumHi] = Idx;
12071         NumHi++;
12072       }
12073     }
12074   }
12075
12076   if (NumLo <= 2 && NumHi <= 2) {
12077     // If no more than two elements come from either vector. This can be
12078     // implemented with two shuffles. First shuffle gather the elements.
12079     // The second shuffle, which takes the first shuffle as both of its
12080     // vector operands, put the elements into the right order.
12081     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12082
12083     int Mask2[] = { -1, -1, -1, -1 };
12084
          // Result elements 0-1 index the first operand of the second shuffle
          // (base 0) and elements 2-3 index its second operand (base 4). Within
          // the gathered vector, V1 elements start at slot 0 and V2 elements at
          // slot 2, hence Locs[i].first * 2.
12085     for (unsigned i = 0; i != 4; ++i)
12086       if (Locs[i].first != -1) {
12087         unsigned Idx = (i < 2) ? 0 : 4;
12088         Idx += Locs[i].first * 2 + Locs[i].second;
12089         Mask2[i] = Idx;
12090       }
12091
12092     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12093   }
12094
12095   if (NumLo == 3 || NumHi == 3) {
12096     // Otherwise, we must have three elements from one vector, call it X, and
12097     // one element from the other, call it Y.  First, use a shufps to build an
12098     // intermediate vector with the one element from Y and the element from X
12099     // that will be in the same half in the final destination (the indexes don't
12100     // matter). Then, use a shufps to build the final vector, taking the half
12101     // containing the element from Y from the intermediate, and the other half
12102     // from X.
12103     if (NumHi == 3) {
12104       // Normalize it so the 3 elements come from V1.
12105       CommuteVectorShuffleMask(PermMask, 4);
12106       std::swap(V1, V2);
12107     }
12108
12109     // Find the element from V2.
          // The scan stops at index 3 without testing it: if none of the first
          // three elements comes from V2, HiIndex ends up as 3.
12110     unsigned HiIndex;
12111     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12112       int Val = PermMask[HiIndex];
12113       if (Val < 0)
12114         continue;
12115       if (Val >= 4)
12116         break;
12117     }
12118
          // Intermediate vector: the V2 element goes to slot 0 and its
          // neighbour in the same result half (index HiIndex ^ 1) to slot 2.
12119     Mask1[0] = PermMask[HiIndex];
12120     Mask1[1] = -1;
12121     Mask1[2] = PermMask[HiIndex^1];
12122     Mask1[3] = -1;
12123     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12124
12125     if (HiIndex >= 2) {
            // The V2 element belongs to the high half of the result: the low
            // half comes straight from V1, the high half from slots 0 and 2 of
            // the intermediate vector (mask values 4 and 6).
12126       Mask1[0] = PermMask[0];
12127       Mask1[1] = PermMask[1];
12128       Mask1[2] = HiIndex & 1 ? 6 : 4;
12129       Mask1[3] = HiIndex & 1 ? 4 : 6;
12130       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12131     }
12132
          // The V2 element belongs to the low half of the result: the operands
          // are swapped so the intermediate vector comes first, and the indices
          // that refer to V1 are rebased by 4.
12133     Mask1[0] = HiIndex & 1 ? 2 : 0;
12134     Mask1[1] = HiIndex & 1 ? 0 : 2;
12135     Mask1[2] = PermMask[2];
12136     Mask1[3] = PermMask[3];
12137     if (Mask1[2] >= 0)
12138       Mask1[2] += 4;
12139     if (Mask1[3] >= 0)
12140       Mask1[3] += 4;
12141     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12142   }
12143
12144   // Break it into (shuffle shuffle_hi, shuffle_lo).
12145   int LoMask[] = { -1, -1, -1, -1 };
12146   int HiMask[] = { -1, -1, -1, -1 };
12147
        // Result elements 0-1 are described by LoMask (MaskIdx 0) and elements
        // 2-3 by HiMask (MaskIdx 1). Inside each mask, V1 elements are placed
        // from slot 0 and V2 elements from slot 2.
12148   int *MaskPtr = LoMask;
12149   unsigned MaskIdx = 0;
12150   unsigned LoIdx = 0;
12151   unsigned HiIdx = 2;
12152   for (unsigned i = 0; i != 4; ++i) {
12153     if (i == 2) {
            // Switch to the second mask and restart the slot counters.
12154       MaskPtr = HiMask;
12155       MaskIdx = 1;
12156       LoIdx = 0;
12157       HiIdx = 2;
12158     }
12159     int Idx = PermMask[i];
12160     if (Idx < 0) {
12161       Locs[i] = std::make_pair(-1, -1);
12162     } else if (Idx < 4) {
12163       Locs[i] = std::make_pair(MaskIdx, LoIdx);
12164       MaskPtr[LoIdx] = Idx;
12165       LoIdx++;
12166     } else {
12167       Locs[i] = std::make_pair(MaskIdx, HiIdx);
12168       MaskPtr[HiIdx] = Idx;
12169       HiIdx++;
12170     }
12171   }
12172
12173   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12174   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
        // Final shuffle: Locs[i].first picks LoShuffle (0) or HiShuffle (1) and
        // Locs[i].second is the slot inside it.
12175   int MaskOps[] = { -1, -1, -1, -1 };
12176   for (unsigned i = 0; i != 4; ++i)
12177     if (Locs[i].first != -1)
12178       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12179   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12180 }
12181
12182 static bool MayFoldVectorLoad(SDValue V) {
12183   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12184     V = V.getOperand(0);
12185
12186   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12187     V = V.getOperand(0);
12188   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12189       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12190     // BUILD_VECTOR (load), undef
12191     V = V.getOperand(0);
12192
12193   return MayFoldLoad(V);
12194 }
12195
12196 static
12197 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12198   MVT VT = Op.getSimpleValueType();
12199
12200   // Canonicalize to v2f64.
12201   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12202   return DAG.getNode(ISD::BITCAST, dl, VT,
12203                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12204                                           V1, DAG));
12205 }
12206
12207 static
12208 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12209                         bool HasSSE2) {
12210   SDValue V1 = Op.getOperand(0);
12211   SDValue V2 = Op.getOperand(1);
12212   MVT VT = Op.getSimpleValueType();
12213
12214   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12215
12216   if (HasSSE2 && VT == MVT::v2f64)
12217     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12218
12219   // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
12220   return DAG.getNode(ISD::BITCAST, dl, VT,
12221                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12222                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12223                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12224 }
12225
12226 static
12227 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12228   SDValue V1 = Op.getOperand(0);
12229   SDValue V2 = Op.getOperand(1);
12230   MVT VT = Op.getSimpleValueType();
12231
12232   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12233          "unsupported shuffle type");
12234
12235   if (V2.getOpcode() == ISD::UNDEF)
12236     V2 = V1;
12237
12238   // v4i32 or v4f32
12239   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12240 }
12241
12242 static
12243 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12244   SDValue V1 = Op.getOperand(0);
12245   SDValue V2 = Op.getOperand(1);
12246   MVT VT = Op.getSimpleValueType();
12247   unsigned NumElems = VT.getVectorNumElements();
12248
12249   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12250   // operand of these instructions is only memory, so check if there's a
12251   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12252   // same masks.
12253   bool CanFoldLoad = false;
12254
12255   // Trivial case, when V2 comes from a load.
12256   if (MayFoldVectorLoad(V2))
12257     CanFoldLoad = true;
12258
12259   // When V1 is a load, it can be folded later into a store in isel, example:
12260   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12261   //    turns into:
12262   //  (MOVLPSmr addr:$src1, VR128:$src2)
12263   // So, recognize this potential and also use MOVLPS or MOVLPD
12264   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12265     CanFoldLoad = true;
12266
12267   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12268   if (CanFoldLoad) {
12269     if (HasSSE2 && NumElems == 2)
12270       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12271
12272     if (NumElems == 4)
12273       // If we don't care about the second element, proceed to use movss.
12274       if (SVOp->getMaskElt(1) != -1)
12275         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12276   }
12277
12278   // movl and movlp will both match v2i64, but v2i64 is never matched by
12279   // movl earlier because we make it strict to avoid messing with the movlp load
12280   // folding logic (see the code above getMOVLP call). Match it here then,
12281   // this is horrible, but will stay like this until we move all shuffle
12282   // matching to x86 specific nodes. Note that for the 1st condition all
12283   // types are matched with movsd.
12284   if (HasSSE2) {
12285     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12286     // as to remove this logic from here, as much as possible
12287     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12288       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12289     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12290   }
12291
12292   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12293
12294   // Invert the operand order and use SHUFPS to match it.
12295   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12296                               getShuffleSHUFImmediate(SVOp), DAG);
12297 }
12298
12299 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12300                                          SelectionDAG &DAG) {
12301   SDLoc dl(Load);
12302   MVT VT = Load->getSimpleValueType(0);
12303   MVT EVT = VT.getVectorElementType();
12304   SDValue Addr = Load->getOperand(1);
12305   SDValue NewAddr = DAG.getNode(
12306       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12307       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12308
12309   SDValue NewLoad =
12310       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12311                   DAG.getMachineFunction().getMachineMemOperand(
12312                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12313   return NewLoad;
12314 }
12315
12316 // It is only safe to call this function if isINSERTPSMask is true for
12317 // this shufflevector mask.
12318 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12319                            SelectionDAG &DAG) {
12320   // Generate an insertps instruction when inserting an f32 from memory onto a
12321   // v4f32 or when copying a member from one v4f32 to another.
12322   // We also use it for transferring i32 from one register to another,
12323   // since it simply copies the same bits.
12324   // If we're transferring an i32 from memory to a specific element in a
12325   // register, we output a generic DAG that will match the PINSRD
12326   // instruction.
12327   MVT VT = SVOp->getSimpleValueType(0);
12328   MVT EVT = VT.getVectorElementType();
12329   SDValue V1 = SVOp->getOperand(0);
12330   SDValue V2 = SVOp->getOperand(1);
12331   auto Mask = SVOp->getMask();
12332   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12333          "unsupported vector type for insertps/pinsrd");
12334
12335   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12336   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12337   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12338
12339   SDValue From;
12340   SDValue To;
12341   unsigned DestIndex;
12342   if (FromV1 == 1) {
12343     From = V1;
12344     To = V2;
12345     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12346                 Mask.begin();
12347
12348     // If we have 1 element from each vector, we have to check if we're
12349     // changing V1's element's place. If so, we're done. Otherwise, we
12350     // should assume we're changing V2's element's place and behave
12351     // accordingly.
12352     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12353     assert(DestIndex <= INT32_MAX && "truncated destination index");
12354     if (FromV1 == FromV2 &&
12355         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12356       From = V2;
12357       To = V1;
12358       DestIndex =
12359           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12360     }
12361   } else {
12362     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12363            "More than one element from V1 and from V2, or no elements from one "
12364            "of the vectors. This case should not have returned true from "
12365            "isINSERTPSMask");
12366     From = V2;
12367     To = V1;
12368     DestIndex =
12369         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12370   }
12371
12372   // Get an index into the source vector in the range [0,4) (the mask is
12373   // in the range [0,8) because it can address V1 and V2)
12374   unsigned SrcIndex = Mask[DestIndex] % 4;
12375   if (MayFoldLoad(From)) {
12376     // Trivial case, when From comes from a load and is only used by the
12377     // shuffle. Make it use insertps from the vector that we need from that
12378     // load.
12379     SDValue NewLoad =
12380         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12381     if (!NewLoad.getNode())
12382       return SDValue();
12383
12384     if (EVT == MVT::f32) {
12385       // Create this as a scalar to vector to match the instruction pattern.
12386       SDValue LoadScalarToVector =
12387           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12388       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12389       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12390                          InsertpsMask);
12391     } else { // EVT == MVT::i32
12392       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12393       // instruction, to match the PINSRD instruction, which loads an i32 to a
12394       // certain vector element.
12395       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12396                          DAG.getConstant(DestIndex, MVT::i32));
12397     }
12398   }
12399
12400   // Vector-element-to-vector
12401   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12402   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12403 }
12404
12405 // Reduce a vector shuffle to zext.
12406 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12407                                     SelectionDAG &DAG) {
12408   // PMOVZX is only available from SSE41.
12409   if (!Subtarget->hasSSE41())
12410     return SDValue();
12411
12412   MVT VT = Op.getSimpleValueType();
12413
12414   // Only AVX2 support 256-bit vector integer extending.
12415   if (!Subtarget->hasInt256() && VT.is256BitVector())
12416     return SDValue();
12417
12418   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12419   SDLoc DL(Op);
12420   SDValue V1 = Op.getOperand(0);
12421   SDValue V2 = Op.getOperand(1);
12422   unsigned NumElems = VT.getVectorNumElements();
12423
12424   // Extending is an unary operation and the element type of the source vector
12425   // won't be equal to or larger than i64.
12426   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12427       VT.getVectorElementType() == MVT::i64)
12428     return SDValue();
12429
12430   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12431   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12432   while ((1U << Shift) < NumElems) {
12433     if (SVOp->getMaskElt(1U << Shift) == 1)
12434       break;
12435     Shift += 1;
12436     // The maximal ratio is 8, i.e. from i8 to i64.
12437     if (Shift > 3)
12438       return SDValue();
12439   }
12440
12441   // Check the shuffle mask.
12442   unsigned Mask = (1U << Shift) - 1;
12443   for (unsigned i = 0; i != NumElems; ++i) {
12444     int EltIdx = SVOp->getMaskElt(i);
12445     if ((i & Mask) != 0 && EltIdx != -1)
12446       return SDValue();
12447     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12448       return SDValue();
12449   }
12450
12451   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12452   MVT NeVT = MVT::getIntegerVT(NBits);
12453   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12454
12455   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12456     return SDValue();
12457
12458   return DAG.getNode(ISD::BITCAST, DL, VT,
12459                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12460 }
12461
12462 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12463                                       SelectionDAG &DAG) {
12464   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12465   MVT VT = Op.getSimpleValueType();
12466   SDLoc dl(Op);
12467   SDValue V1 = Op.getOperand(0);
12468   SDValue V2 = Op.getOperand(1);
12469
12470   if (isZeroShuffle(SVOp))
12471     return getZeroVector(VT, Subtarget, DAG, dl);
12472
12473   // Handle splat operations
12474   if (SVOp->isSplat()) {
12475     // Use vbroadcast whenever the splat comes from a foldable load
12476     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12477     if (Broadcast.getNode())
12478       return Broadcast;
12479   }
12480
12481   // Check integer expanding shuffles.
12482   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12483   if (NewOp.getNode())
12484     return NewOp;
12485
12486   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12487   // do it!
12488   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12489       VT == MVT::v32i8) {
12490     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12491     if (NewOp.getNode())
12492       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12493   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12494     // FIXME: Figure out a cleaner way to do this.
12495     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12496       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12497       if (NewOp.getNode()) {
12498         MVT NewVT = NewOp.getSimpleValueType();
12499         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12500                                NewVT, true, false))
12501           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12502                               dl);
12503       }
12504     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12505       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12506       if (NewOp.getNode()) {
12507         MVT NewVT = NewOp.getSimpleValueType();
12508         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12509           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12510                               dl);
12511       }
12512     }
12513   }
12514   return SDValue();
12515 }
12516
12517 SDValue
12518 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for a generic vector shuffle node (Op is cast to
        // ShuffleVectorSDNode below). The function walks an ordered list of mask
        // predicates and returns the first target-specific shuffle node (or a
        // commuted / rewritten generic shuffle) that matches. If nothing matches,
        // control falls through to the final `return SDValue()`.
        //
        // The order of the checks matters: some masks satisfy more than one
        // predicate (see the NOTE about isPSHUFDMask below), so the earliest
        // matching check decides which node is emitted.
12519   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12520   SDValue V1 = Op.getOperand(0);
12521   SDValue V2 = Op.getOperand(1);
12522   MVT VT = Op.getSimpleValueType();
12523   SDLoc dl(Op);
12524   unsigned NumElems = VT.getVectorNumElements();
12525   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12526   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12527   bool V1IsSplat = false;
12528   bool V2IsSplat = false;
12529   bool HasSSE2 = Subtarget->hasSSE2();
12530   bool HasFp256    = Subtarget->hasFp256();
12531   bool HasInt256   = Subtarget->hasInt256();
12532   MachineFunction &MF = DAG.getMachineFunction();
12533   bool OptForSize = MF.getFunction()->getAttributes().
12534     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12535
12536   // Check if we should use the experimental vector shuffle lowering. If so,
12537   // delegate completely to that code path.
12538   if (ExperimentalVectorShuffleLowering)
12539     return lowerVectorShuffle(Op, Subtarget, DAG);
12540
12541   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12542
        // A shuffle of two undef inputs is itself undef.
12543   if (V1IsUndef && V2IsUndef)
12544     return DAG.getUNDEF(VT);
12545
12546   // When we create a shuffle node we put the UNDEF node to second operand,
12547   // but in some cases the first operand may be transformed to UNDEF.
12548   // In this case we should just commute the node.
12549   if (V1IsUndef)
12550     return DAG.getCommutedVectorShuffle(*SVOp);
12551
12552   // Vector shuffle lowering takes 3 steps:
12553   //
12554   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12555   //    narrowing and commutation of operands should be handled.
12556   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12557   //    shuffle nodes.
12558   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12559   //    so the shuffle can be broken into other shuffles and the legalizer can
12560   //    try the lowering again.
12561   //
12562   // The general idea is that no vector_shuffle operation should be left to
12563   // be matched during isel, all of them must be converted to a target specific
12564   // node here.
12565
12566   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12567   // narrowing and commutation of operands should be handled. The actual code
12568   // doesn't include all of those, work in progress...
12569   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12570   if (NewOp.getNode())
12571     return NewOp;
12572
        // Work on a private copy of the mask: it may be commuted in place further
        // down (CommuteVectorShuffleMask) without touching the original node.
12573   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12574
12575   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12576   // unpckh_undef). Only use pshufd if speed is more important than size.
12577   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12578     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12579   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12580     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12581
12582   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12583       V2IsUndef && MayFoldVectorLoad(V1))
12584     return getMOVDDup(Op, dl, V1, DAG);
12585
12586   if (isMOVHLPS_v_undef_Mask(M, VT))
12587     return getMOVHighToLow(Op, dl, DAG);
12588
12589   // Use to match splats
12590   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12591       (VT == MVT::v2f64 || VT == MVT::v2i64))
12592     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12593
12594   if (isPSHUFDMask(M, VT)) {
12595     // The actual implementation will match the mask in the if above and then
12596     // during isel it can match several different instructions, not only pshufd
12597     // as its name says, sad but true, emulate the behavior for now...
12598     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12599       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12600
12601     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12602
12603     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12604       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12605
12606     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12607       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12608                                   DAG);
12609
          // Fallback: SHUFP with V1 supplied as both inputs.
12610     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12611                                 TargetMask, DAG);
12612   }
12613
12614   if (isPALIGNRMask(M, VT, Subtarget))
12615     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12616                                 getShufflePALIGNRImmediate(SVOp),
12617                                 DAG);
12618
12619   if (isVALIGNMask(M, VT, Subtarget))
12620     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12621                                 getShuffleVALIGNImmediate(SVOp),
12622                                 DAG);
12623
12624   // Check if this can be converted into a logical shift.
        // isLeft, ShVal and ShAmt are passed to isVectorShift as in/out arguments.
        // The result is remembered in isShift because the shift is attempted twice:
        // here only when the shifted value has a single use, and again further
        // down as a last resort before the splat/unpck matching.
12625   bool isLeft = false;
12626   unsigned ShAmt = 0;
12627   SDValue ShVal;
12628   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12629   if (isShift && ShVal.hasOneUse()) {
12630     // If the shifted value has multiple uses, it may be cheaper to use
12631     // v_set0 + movlhps or movhlps, etc.
12632     MVT EltVT = VT.getVectorElementType();
          // Scale the shift amount by the element width in bits.
12633     ShAmt *= EltVT.getSizeInBits();
12634     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12635   }
12636
12637   if (isMOVLMask(M, VT)) {
12638     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12639       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12640     if (!isMOVLPMask(M, VT)) {
12641       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12642         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12643
12644       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12645         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12646     }
12647   }
12648
12649   // FIXME: fold these into legal mask.
12650   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12651     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12652
12653   if (isMOVHLPSMask(M, VT))
12654     return getMOVHighToLow(Op, dl, DAG);
12655
12656   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12657     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12658
12659   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12660     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12661
12662   if (isMOVLPMask(M, VT))
12663     return getMOVLP(Op, dl, DAG, HasSSE2);
12664
        // Swapping the operands turns these masks into ones matched above; return
        // the commuted generic shuffle (presumably so it is lowered again).
12665   if (ShouldXformToMOVHLPS(M, VT) ||
12666       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12667     return DAG.getCommutedVectorShuffle(*SVOp);
12668
12669   if (isShift) {
12670     // No better options. Use a vshldq / vsrldq.
12671     MVT EltVT = VT.getVectorElementType();
12672     ShAmt *= EltVT.getSizeInBits();
12673     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12674   }
12675
        // Records whether V1/V2 (and the mask copy M) were swapped below, so the
        // swap can be undone if the unpck matching on the commuted form fails.
12676   bool Commuted = false;
12677   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12678   // 1,1,1,1 -> v8i16 though.
        // An operand counts as a splat only if it is a BUILD_VECTOR with a constant
        // splat node and no undef elements.
12679   BitVector UndefElements;
12680   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12681     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12682       V1IsSplat = true;
12683   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12684     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12685       V2IsSplat = true;
12686
12687   // Canonicalize the splat or undef, if present, to be on the RHS.
12688   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12689     CommuteVectorShuffleMask(M, NumElems);
12690     std::swap(V1, V2);
12691     std::swap(V1IsSplat, V2IsSplat);
12692     Commuted = true;
12693   }
12694
12695   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12696     // Shuffling low element of v1 into undef, just return v1.
12697     if (V2IsUndef)
12698       return V1;
12699     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12700     // the instruction selector will not match, so get a canonical MOVL with
12701     // swapped operands to undo the commute.
12702     return getMOVL(DAG, dl, VT, V2, V1);
12703   }
12704
12705   if (isUNPCKLMask(M, VT, HasInt256))
12706     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12707
12708   if (isUNPCKHMask(M, VT, HasInt256))
12709     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12710
12711   if (V2IsSplat) {
12712     // Normalize mask so all entries that point to V2 points to its first
12713     // element then try to match unpck{h|l} again. If match, return a
12714     // new vector_shuffle with the corrected mask.
12715     SmallVector<int, 8> NewMask(M.begin(), M.end());
12716     NormalizeMask(NewMask, NumElems);
12717     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12718       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12719     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12720       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12721   }
12722
12723   if (Commuted) {
12724     // Commute is back and try unpck* again.
12725     // FIXME: this seems wrong.
12726     CommuteVectorShuffleMask(M, NumElems);
12727     std::swap(V1, V2);
12728     std::swap(V1IsSplat, V2IsSplat);
12729
12730     if (isUNPCKLMask(M, VT, HasInt256))
12731       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12732
12733     if (isUNPCKHMask(M, VT, HasInt256))
12734       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12735   }
12736
12737   // Normalize the node to match x86 shuffle ops if needed
12738   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12739     return DAG.getCommutedVectorShuffle(*SVOp);
12740
12741   // The checks below are all present in isShuffleMaskLegal, but they are
12742   // inlined here right now to enable us to directly emit target specific
12743   // nodes, and remove one by one until they don't return Op anymore.
12744
12745   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12746       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12747     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12748       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12749   }
12750
12751   if (isPSHUFHWMask(M, VT, HasInt256))
12752     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12753                                 getShufflePSHUFHWImmediate(SVOp),
12754                                 DAG);
12755
12756   if (isPSHUFLWMask(M, VT, HasInt256))
12757     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12758                                 getShufflePSHUFLWImmediate(SVOp),
12759                                 DAG);
12760
        // MaskValue is filled in by isBlendMask through the pointer argument and is
        // only read when that predicate succeeds.
12761   unsigned MaskValue;
12762   if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
12763     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12764
12765   if (isSHUFPMask(M, VT))
12766     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12767                                 getShuffleSHUFImmediate(SVOp), DAG);
12768
        // Same unpck-with-undef patterns as near the top of the function, but now
        // tried regardless of OptForSize.
12769   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12770     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12771   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12772     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12773
12774   //===--------------------------------------------------------------------===//
12775   // Generate target specific nodes for 128 or 256-bit shuffles only
12776   // supported in the AVX instruction set.
12777   //
12778
12779   // Handle VMOVDDUPY permutations
12780   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12781     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12782
12783   // Handle VPERMILPS/D* permutations
12784   if (isVPERMILPMask(M, VT)) {
12785     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12786       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12787                                   getShuffleSHUFImmediate(SVOp), DAG);
12788     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12789                                 getShuffleSHUFImmediate(SVOp), DAG);
12790   }
12791
        // Idx is written by isINSERT64x4Mask; it is scaled by half the element
        // count to form the insertion position (presumably selecting the 256-bit
        // half of V1 that is overwritten with the low 256 bits of V2).
12792   unsigned Idx;
12793   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12794     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12795                               Idx*(NumElems/2), DAG, dl);
12796
12797   // Handle VPERM2F128/VPERM2I128 permutations
12798   if (isVPERM2X128Mask(M, VT, HasFp256))
12799     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12800                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12801
12802   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12803     return getINSERTPS(SVOp, dl, DAG);
12804
        // Imm8 is an output of isPermImmMask (passed by reference).
12805   unsigned Imm8;
12806   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12807     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12808
        // Variable-permute fallback for 8-element 256-bit single-input shuffles
        // (with Int256) and for all 512-bit shuffles: materialize the shuffle mask
        // as a constant integer vector and emit VPERMV (one input) or VPERMV3 (two
        // inputs).
12809   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12810       VT.is512BitVector()) {
12811     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12812     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12813     SmallVector<SDValue, 16> permclMask;
12814     for (unsigned i = 0; i != NumElems; ++i) {
            // Negative (undef) mask entries are replaced with index 0.
12815       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12816     }
12817
12818     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12819     if (V2IsUndef)
12820       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12821       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12822                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12823     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12824                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12825   }
12826
12827   //===--------------------------------------------------------------------===//
12828   // Since no target specific shuffle was selected for this generic one,
12829   // lower it into other known shuffles. FIXME: this isn't true yet, but
12830   // this is the plan.
12831   //
12832
        // The per-type helpers below may return an empty SDValue, in which case the
        // remaining generic handlers are still tried.
12833   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12834   if (VT == MVT::v8i16) {
12835     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12836     if (NewOp.getNode())
12837       return NewOp;
12838   }
12839
12840   if (VT == MVT::v16i16 && HasInt256) {
12841     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12842     if (NewOp.getNode())
12843       return NewOp;
12844   }
12845
12846   if (VT == MVT::v16i8) {
12847     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12848     if (NewOp.getNode())
12849       return NewOp;
12850   }
12851
12852   if (VT == MVT::v32i8) {
12853     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12854     if (NewOp.getNode())
12855       return NewOp;
12856   }
12857
12858   // Handle all 128-bit wide vectors with 4 elements, and match them with
12859   // several different shuffle types.
12860   if (NumElems == 4 && VT.is128BitVector())
12861     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12862
12863   // Handle general 256-bit shuffles
12864   if (VT.is256BitVector())
12865     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12866
        // Nothing matched: return an empty SDValue.
12867   return SDValue();
12868 }
12869
12870 // This function assumes its argument is a BUILD_VECTOR of constants or
12871 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12872 // true.
      //
      // Converts the per-element select condition into a blend immediate.
      // On success returns true and stores the immediate in MaskValue: bit i is
      // set when element i should come from the second blend argument (condition
      // is zero) and clear when it should come from the first one. Returns false
      // when the vector has two lanes whose constant conditions disagree for the
      // same in-lane position, because a single per-lane immediate cannot encode
      // that.
12873 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12874                                     unsigned &MaskValue) {
12875   MaskValue = 0;
12876   unsigned NumElems = BuildVector->getNumOperands();
12877   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12878   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12879   unsigned NumElemsInLane = NumElems / NumLanes;
12880
12881   // Blend for v16i16 should be symmetric for the both lanes.
12882   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12883     SDValue EltCond = BuildVector->getOperand(i);
          // With a single lane the "second lane" element is the element itself, so
          // the two conditions below always agree.
12884     SDValue SndLaneEltCond =
12885         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12886
          // Tri-state per lane: -1 = operand is not a ConstantSDNode (undef, given
          // the precondition above), 0 = constant zero, 1 = constant non-zero.
12887     int Lane1Cond = -1, Lane2Cond = -1;
12888     if (isa<ConstantSDNode>(EltCond))
12889       Lane1Cond = !isZero(EltCond);
12890     if (isa<ConstantSDNode>(SndLaneEltCond))
12891       Lane2Cond = !isZero(SndLaneEltCond);
12892
          // Use lane 1's condition when the lanes agree or lane 2 is undef. If both
          // are undef, !(-1) evaluates to 0 and the bit stays clear.
12893     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12894       // Lane1Cond != 0, means we want the first argument.
12895       // Lane1Cond == 0, means we want the second argument.
12896       // The encoding of this argument is 0 for the first argument, 1
12897       // for the second. Therefore, invert the condition.
12898       MaskValue |= !Lane1Cond << i;
          // Only lane 1 is undef: lane 2's condition decides.
12899     else if (Lane1Cond < 0)
12900       MaskValue |= !Lane2Cond << i;
          // Both lanes are constant but differ: not representable.
12901     else
12902       return false;
12903   }
12904   return true;
12905 }
12906
12907 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12908 /// instruction.
12909 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12910                                     SelectionDAG &DAG) {
12911   SDValue Cond = Op.getOperand(0);
12912   SDValue LHS = Op.getOperand(1);
12913   SDValue RHS = Op.getOperand(2);
12914   SDLoc dl(Op);
12915   MVT VT = Op.getSimpleValueType();
12916   MVT EltVT = VT.getVectorElementType();
12917   unsigned NumElems = VT.getVectorNumElements();
12918
12919   // There is no blend with immediate in AVX-512.
12920   if (VT.is512BitVector())
12921     return SDValue();
12922
12923   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12924     return SDValue();
12925   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12926     return SDValue();
12927
12928   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12929     return SDValue();
12930
12931   // Check the mask for BLEND and build the value.
12932   unsigned MaskValue = 0;
12933   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12934     return SDValue();
12935
12936   // Convert i32 vectors to floating point if it is not AVX2.
12937   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12938   MVT BlendVT = VT;
12939   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12940     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12941                                NumElems);
12942     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12943     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12944   }
12945
12946   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12947                             DAG.getConstant(MaskValue, MVT::i32));
12948   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12949 }
12950
12951 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for VSELECT. Three possible outcomes:
        //  - an empty SDValue, meaning "not handled here" (all-constant selects, and
        //    v8i16/v16i16 unless both BWI and VLX are available),
        //  - an immediate blend built by lowerVSELECTtoBLENDI, or
        //  - Op itself, meaning the node is kept as is.
12952   // A vselect where all conditions and data are constants can be optimized into
12953   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12954   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12955       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12956       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12957     return SDValue();
12958
12959   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12960   if (BlendOp.getNode())
12961     return BlendOp;
12962
12963   // Some types for vselect were previously set to Expand, not Legal or
12964   // Custom. Return an empty SDValue so we fall-through to Expand, after
12965   // the Custom lowering phase.
12966   MVT VT = Op.getSimpleValueType();
12967   switch (VT.SimpleTy) {
12968   default:
12969     break;
12970   case MVT::v8i16:
12971   case MVT::v16i16:
          // With both BWI and VLX the node is kept (falls out of the switch to
          // `return Op`); otherwise hand it back for expansion.
12972     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12973       break;
12974     return SDValue();
12975   }
12976
12977   // We couldn't create a "Blend with immediate" node.
12978   // This node should still be legal, but we'll have to emit a blendv*
12979   // instruction.
12980   return Op;
12981 }
12982
12983 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
        // Lowering of EXTRACT_VECTOR_ELT from a 128-bit vector; the caller in this
        // file invokes it only when the subtarget has SSE4.1. Dispatches on the
        // result type. Returns an empty SDValue when the source vector is not
        // 128 bits wide or when no case below applies, so the caller can fall back
        // to its generic handling.
12984   MVT VT = Op.getSimpleValueType();
12985   SDLoc dl(Op);
12986
12987   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12988     return SDValue();
12989
        // 8-bit result: PEXTRB produces an i32; AssertZext records that only the
        // low VT bits are significant before truncating to VT.
12990   if (VT.getSizeInBits() == 8) {
12991     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12992                                   Op.getOperand(0), Op.getOperand(1));
12993     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12994                                   DAG.getValueType(VT));
12995     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12996   }
12997
12998   if (VT.getSizeInBits() == 16) {
          // NOTE: the cast requires a constant index operand.
12999     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13000     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
          // The vector is viewed as v4i32, element 0 is extracted as i32 and then
          // truncated to i16.
13001     if (Idx == 0)
13002       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13003                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13004                                      DAG.getNode(ISD::BITCAST, dl,
13005                                                  MVT::v4i32,
13006                                                  Op.getOperand(0)),
13007                                      Op.getOperand(1)));
          // Otherwise same scheme as the 8-bit case, using PEXTRW.
13008     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13009                                   Op.getOperand(0), Op.getOperand(1));
13010     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13011                                   DAG.getValueType(VT));
13012     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13013   }
13014
13015   if (VT == MVT::f32) {
13016     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13017     // the result back to FR32 register. It's only worth matching if the
13018     // result has a single use which is a store or a bitcast to i32.  And in
13019     // the case of a store, it's not worth it if the index is a constant 0,
13020     // because a MOVSSmr can be used instead, which is smaller and faster.
13021     if (!Op.hasOneUse())
13022       return SDValue();
13023     SDNode *User = *Op.getNode()->use_begin();
          // Bail out unless the single user is (a) a store with an index that is
          // not constant zero, or (b) a bitcast to i32.
13024     if ((User->getOpcode() != ISD::STORE ||
13025          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13026           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13027         (User->getOpcode() != ISD::BITCAST ||
13028          User->getValueType(0) != MVT::i32))
13029       return SDValue();
          // Extract as i32 from the v4i32 view of the vector, then bitcast the
          // scalar back to f32.
13030     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13031                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13032                                               Op.getOperand(0)),
13033                                               Op.getOperand(1));
13034     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13035   }
13036
13037   if (VT == MVT::i32 || VT == MVT::i64) {
13038     // ExtractPS/pextrq works with constant index.
          // Returning Op unchanged keeps the node as is.
13039     if (isa<ConstantSDNode>(Op.getOperand(1)))
13040       return Op;
13041   }
13042   return SDValue();
13043 }
13044
13045 /// Extract one bit from mask vector, like v16i1 or v8i1.
13046 /// AVX-512 feature.
      ///
      /// \p Op is the extract node: operand 0 is the mask vector and operand 1 the
      /// index; the result type must be i1 (asserted). A non-constant index is
      /// handled by zero-extending the mask to a wide integer vector, extracting
      /// from that and truncating. A constant index is handled by shifting the
      /// requested bit into position 0 and extracting element 0.
13047 SDValue
13048 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13049   SDValue Vec = Op.getOperand(0);
13050   SDLoc dl(Vec);
13051   MVT VecVT = Vec.getSimpleValueType();
13052   SDValue Idx = Op.getOperand(1);
13053   MVT EltVT = Op.getSimpleValueType();
13054
13055   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13056   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13057          "Unexpected vector type in ExtractBitFromMaskVector");
13058
13059   // variable index can't be handled in mask registers,
13060   // extend vector to VR512
13061   if (!isa<ConstantSDNode>(Idx)) {
          // NOTE(review): only v8i1 maps to v8i64; every other mask type is
          // extended to v16i32 -- confirm this is intended for masks wider than 16
          // elements, which the assert above admits when BWI is available.
13062     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13063     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13064     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13065                               ExtVT.getVectorElementType(), Ext, Idx);
13066     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13067   }
13068
13069   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
        // The register class determines the width used for the shifts below.
        // Without DQI, masks of up to 8 elements use the v16i1 register class
        // instead of their own (presumably because narrower mask shifts are not
        // available without DQI -- TODO confirm).
13070   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13071   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13072     rc = getRegClassFor(MVT::v16i1);
        // Index of the most significant bit of the mask register.
13073   unsigned MaxSift = rc->getSize()*8 - 1;
        // Shift left so that bit IdxVal lands in the top bit (dropping the bits
        // above it), then shift right by the full distance so it lands in bit 0.
13074   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13075                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13076   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13077                     DAG.getConstant(MaxSift, MVT::i8));
13078   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13079                        DAG.getIntPtrConstant(0));
13080 }
13081
13082 SDValue
13083 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13084                                            SelectionDAG &DAG) const {
        // Custom lowering for EXTRACT_VECTOR_ELT. Operand 0 is the source vector,
        // operand 1 the element index. Returns a replacement value, Op itself when
        // the node is left as is, or an empty SDValue when this function does not
        // handle the case.
13085   SDLoc dl(Op);
13086   SDValue Vec = Op.getOperand(0);
13087   MVT VecVT = Vec.getSimpleValueType();
13088   SDValue Idx = Op.getOperand(1);
13089
        // i1 results come from mask vectors and have their own lowering.
13090   if (Op.getSimpleValueType() == MVT::i1)
13091     return ExtractBitFromMaskVector(Op, DAG);
13092
        // Variable index: only handled for 512-bit vectors and for 256-bit vectors
        // with 32-bit elements when Int256 is available. The index is inserted
        // into element 0 of an otherwise zero mask vector, VPERMV is applied with
        // that mask, and element 0 of the permuted vector is extracted.
13093   if (!isa<ConstantSDNode>(Idx)) {
13094     if (VecVT.is512BitVector() ||
13095         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13096          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13097
13098       MVT MaskEltVT =
13099         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13100       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13101                                     MaskEltVT.getSizeInBits());
13102
13103       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13104       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13105                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13106                                 Idx, DAG.getConstant(0, getPointerTy()));
13107       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13108       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13109                         Perm, DAG.getConstant(0, getPointerTy()));
13110     }
13111     return SDValue();
13112   }
13113
13114   // If this is a 256-bit or 512-bit vector, first extract the 128-bit chunk
13115   // that holds the element and then extract the element from that chunk.
13116   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13117
13118     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13119     // Get the 128-bit vector.
13120     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13121     MVT EltVT = VecVT.getVectorElementType();
13122
13123     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13124
13125     //if (IdxVal >= NumElems/2)
13126     //  IdxVal -= NumElems/2;
          // Reduce the index to its position inside the 128-bit chunk
          // (equivalent to IdxVal % ElemsPerChunk).
13127     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13128     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13129                        DAG.getConstant(IdxVal, MVT::i32));
13130   }
13131
13132   assert(VecVT.is128BitVector() && "Unexpected vector length");
13133
        // Prefer the SSE4.1 forms; fall through when that helper declines.
13134   if (Subtarget->hasSSE41()) {
13135     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13136     if (Res.getNode())
13137       return Res;
13138   }
13139
13140   MVT VT = Op.getSimpleValueType();
13141   // TODO: handle v16i8.
13142   if (VT.getSizeInBits() == 16) {
          // NOTE: these locals shadow the function-level Vec and Idx.
13143     SDValue Vec = Op.getOperand(0);
13144     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
          // Index 0: extract the low i32 of the v4i32 view and truncate to i16.
13145     if (Idx == 0)
13146       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13147                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13148                                      DAG.getNode(ISD::BITCAST, dl,
13149                                                  MVT::v4i32, Vec),
13150                                      Op.getOperand(1)));
13151     // Transform it so it match pextrw which produces a 32-bit result.
13152     MVT EltVT = MVT::i32;
13153     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13154                                   Op.getOperand(0), Op.getOperand(1));
13155     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13156                                   DAG.getValueType(VT));
13157     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13158   }
13159
13160   if (VT.getSizeInBits() == 32) {
13161     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
          // Element 0 needs no shuffle; keep the node as is.
13162     if (Idx == 0)
13163       return Op;
13164
13165     // SHUFPS the element to the lowest double word, then movss.
13166     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13167     MVT VVT = Op.getOperand(0).getSimpleValueType();
13168     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13169                                        DAG.getUNDEF(VVT), Mask);
13170     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13171                        DAG.getIntPtrConstant(0));
13172   }
13173
13174   if (VT.getSizeInBits() == 64) {
13175     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13176     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13177     //        to match extract_elt for f64.
13178     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13179     if (Idx == 0)
13180       return Op;
13181
13182     // UNPCKHPD the element to the lowest double word, then movsd.
13183     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13184     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
          // The mask is hard-coded to element 1, the only non-zero index in a
          // two-element 128-bit vector.
13185     int Mask[2] = { 1, -1 };
13186     MVT VVT = Op.getOperand(0).getSimpleValueType();
13187     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13188                                        DAG.getUNDEF(VVT), Mask);
13189     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13190                        DAG.getIntPtrConstant(0));
13191   }
13192
13193   return SDValue();
13194 }
13195
13196 /// Insert one bit to mask vector, like v16i1 or v8i1.
13197 /// AVX-512 feature.
      ///
      /// \p Op is the insert node: operand 0 is the mask vector, operand 1 the
      /// element to insert and operand 2 the index. A non-constant index is
      /// handled by widening to an integer vector, inserting there and truncating
      /// back. A constant index is handled by moving the new bit into position
      /// with shifts and OR-ing it into the vector.
13198 SDValue
13199 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13200   SDLoc dl(Op);
13201   SDValue Vec = Op.getOperand(0);
13202   SDValue Elt = Op.getOperand(1);
13203   SDValue Idx = Op.getOperand(2);
13204   MVT VecVT = Vec.getSimpleValueType();
13205
13206   if (!isa<ConstantSDNode>(Idx)) {
13207     // Non constant index. Extend source and destination,
13208     // insert element and then truncate the result.
13209     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13210     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13211     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13212       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13213       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13214     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13215   }
13216
13217   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
        // Place the scalar into a vector of the mask type.
13218   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
        // Inserting into undef: the other bits do not matter, so a single left
        // shift by the index is enough.
13219   if (Vec.getOpcode() == ISD::UNDEF)
13220     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13221                        DAG.getConstant(IdxVal, MVT::i8));
13222   const TargetRegisterClass* rc = getRegClassFor(VecVT);
        // Index of the most significant bit of the mask register.
13223   unsigned MaxSift = rc->getSize()*8 - 1;
        // Shift the new bit to the top (discarding every other bit of EltInVec),
        // then shift it right so that it ends up at position IdxVal.
13224   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13225                     DAG.getConstant(MaxSift, MVT::i8));
13226   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13227                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
        // NOTE(review): OR can only set the bit; if Vec already has bit IdxVal
        // set and Elt is 0, the result keeps the old 1. Confirm whether callers
        // guarantee the target bit is clear, or whether it must be masked first.
13228   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13229 }
13230
13231 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13232                                                   SelectionDAG &DAG) const {
13233   MVT VT = Op.getSimpleValueType();
13234   MVT EltVT = VT.getVectorElementType();
13235
13236   if (EltVT == MVT::i1)
13237     return InsertBitToMaskVector(Op, DAG);
13238
13239   SDLoc dl(Op);
13240   SDValue N0 = Op.getOperand(0);
13241   SDValue N1 = Op.getOperand(1);
13242   SDValue N2 = Op.getOperand(2);
13243   if (!isa<ConstantSDNode>(N2))
13244     return SDValue();
13245   auto *N2C = cast<ConstantSDNode>(N2);
13246   unsigned IdxVal = N2C->getZExtValue();
13247
13248   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13249   // into that, and then insert the subvector back into the result.
13250   if (VT.is256BitVector() || VT.is512BitVector()) {
13251     // Get the desired 128-bit vector half.
13252     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13253
13254     // Insert the element into the desired half.
13255     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13256     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13257
13258     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13259                     DAG.getConstant(IdxIn128, MVT::i32));
13260
13261     // Insert the changed part back to the 256-bit vector
13262     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13263   }
13264   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13265
13266   if (Subtarget->hasSSE41()) {
13267     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13268       unsigned Opc;
13269       if (VT == MVT::v8i16) {
13270         Opc = X86ISD::PINSRW;
13271       } else {
13272         assert(VT == MVT::v16i8);
13273         Opc = X86ISD::PINSRB;
13274       }
13275
13276       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13277       // argument.
13278       if (N1.getValueType() != MVT::i32)
13279         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13280       if (N2.getValueType() != MVT::i32)
13281         N2 = DAG.getIntPtrConstant(IdxVal);
13282       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13283     }
13284
13285     if (EltVT == MVT::f32) {
13286       // Bits [7:6] of the constant are the source select.  This will always be
13287       //  zero here.  The DAG Combiner may combine an extract_elt index into
13288       //  these
13289       //  bits.  For example (insert (extract, 3), 2) could be matched by
13290       //  putting
13291       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13292       // Bits [5:4] of the constant are the destination select.  This is the
13293       //  value of the incoming immediate.
13294       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13295       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13296       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13297       // Create this as a scalar to vector..
13298       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13299       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13300     }
13301
13302     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13303       // PINSR* works with constant index.
13304       return Op;
13305     }
13306   }
13307
13308   if (EltVT == MVT::i8)
13309     return SDValue();
13310
13311   if (EltVT.getSizeInBits() == 16) {
13312     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13313     // as its second argument.
13314     if (N1.getValueType() != MVT::i32)
13315       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13316     if (N2.getValueType() != MVT::i32)
13317       N2 = DAG.getIntPtrConstant(IdxVal);
13318     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13319   }
13320   return SDValue();
13321 }
13322
13323 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13324   SDLoc dl(Op);
13325   MVT OpVT = Op.getSimpleValueType();
13326
13327   // If this is a 256-bit vector result, first insert into a 128-bit
13328   // vector and then insert into the 256-bit vector.
13329   if (!OpVT.is128BitVector()) {
13330     // Insert into a 128-bit vector.
13331     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13332     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13333                                  OpVT.getVectorNumElements() / SizeFactor);
13334
13335     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13336
13337     // Insert the 128-bit vector.
13338     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13339   }
13340
13341   if (OpVT == MVT::v1i64 &&
13342       Op.getOperand(0).getValueType() == MVT::i64)
13343     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13344
13345   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13346   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13347   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13348                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13349 }
13350
13351 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13352 // a simple subregister reference or explicit instructions to grab
13353 // upper bits of a vector.
13354 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13355                                       SelectionDAG &DAG) {
13356   SDLoc dl(Op);
13357   SDValue In =  Op.getOperand(0);
13358   SDValue Idx = Op.getOperand(1);
13359   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13360   MVT ResVT   = Op.getSimpleValueType();
13361   MVT InVT    = In.getSimpleValueType();
13362
13363   if (Subtarget->hasFp256()) {
13364     if (ResVT.is128BitVector() &&
13365         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13366         isa<ConstantSDNode>(Idx)) {
13367       return Extract128BitVector(In, IdxVal, DAG, dl);
13368     }
13369     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13370         isa<ConstantSDNode>(Idx)) {
13371       return Extract256BitVector(In, IdxVal, DAG, dl);
13372     }
13373   }
13374   return SDValue();
13375 }
13376
13377 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13378 // simple superregister reference or explicit instructions to insert
13379 // the upper bits of a vector.
13380 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13381                                      SelectionDAG &DAG) {
13382   if (!Subtarget->hasAVX())
13383     return SDValue();
13384
13385   SDLoc dl(Op);
13386   SDValue Vec = Op.getOperand(0);
13387   SDValue SubVec = Op.getOperand(1);
13388   SDValue Idx = Op.getOperand(2);
13389
13390   if (!isa<ConstantSDNode>(Idx))
13391     return SDValue();
13392
13393   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13394   MVT OpVT = Op.getSimpleValueType();
13395   MVT SubVecVT = SubVec.getSimpleValueType();
13396
13397   // Fold two 16-byte subvector loads into one 32-byte load:
13398   // (insert_subvector (insert_subvector undef, (load addr), 0),
13399   //                   (load addr + 16), Elts/2)
13400   // --> load32 addr
13401   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13402       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13403       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13404       !Subtarget->isUnalignedMem32Slow()) {
13405     SDValue SubVec2 = Vec.getOperand(1);
13406     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13407       if (Idx2->getZExtValue() == 0) {
13408         SDValue Ops[] = { SubVec2, SubVec };
13409         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13410         if (LD.getNode())
13411           return LD;
13412       }
13413     }
13414   }
13415
13416   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13417       SubVecVT.is128BitVector())
13418     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13419
13420   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13421     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13422
13423   return SDValue();
13424 }
13425
13426 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13427 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13428 // one of the above mentioned nodes. It has to be wrapped because otherwise
13429 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13430 // be used to form addressing mode. These wrapped nodes will be selected
13431 // into MOV32ri.
13432 SDValue
13433 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13434   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13435
13436   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13437   // global base reg.
13438   unsigned char OpFlag = 0;
13439   unsigned WrapperKind = X86ISD::Wrapper;
13440   CodeModel::Model M = DAG.getTarget().getCodeModel();
13441
13442   if (Subtarget->isPICStyleRIPRel() &&
13443       (M == CodeModel::Small || M == CodeModel::Kernel))
13444     WrapperKind = X86ISD::WrapperRIP;
13445   else if (Subtarget->isPICStyleGOT())
13446     OpFlag = X86II::MO_GOTOFF;
13447   else if (Subtarget->isPICStyleStubPIC())
13448     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13449
13450   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13451                                              CP->getAlignment(),
13452                                              CP->getOffset(), OpFlag);
13453   SDLoc DL(CP);
13454   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13455   // With PIC, the address is actually $g + Offset.
13456   if (OpFlag) {
13457     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13458                          DAG.getNode(X86ISD::GlobalBaseReg,
13459                                      SDLoc(), getPointerTy()),
13460                          Result);
13461   }
13462
13463   return Result;
13464 }
13465
13466 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13467   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13468
13469   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13470   // global base reg.
13471   unsigned char OpFlag = 0;
13472   unsigned WrapperKind = X86ISD::Wrapper;
13473   CodeModel::Model M = DAG.getTarget().getCodeModel();
13474
13475   if (Subtarget->isPICStyleRIPRel() &&
13476       (M == CodeModel::Small || M == CodeModel::Kernel))
13477     WrapperKind = X86ISD::WrapperRIP;
13478   else if (Subtarget->isPICStyleGOT())
13479     OpFlag = X86II::MO_GOTOFF;
13480   else if (Subtarget->isPICStyleStubPIC())
13481     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13482
13483   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13484                                           OpFlag);
13485   SDLoc DL(JT);
13486   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13487
13488   // With PIC, the address is actually $g + Offset.
13489   if (OpFlag)
13490     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13491                          DAG.getNode(X86ISD::GlobalBaseReg,
13492                                      SDLoc(), getPointerTy()),
13493                          Result);
13494
13495   return Result;
13496 }
13497
13498 SDValue
13499 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13500   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13501
13502   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13503   // global base reg.
13504   unsigned char OpFlag = 0;
13505   unsigned WrapperKind = X86ISD::Wrapper;
13506   CodeModel::Model M = DAG.getTarget().getCodeModel();
13507
13508   if (Subtarget->isPICStyleRIPRel() &&
13509       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13510     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13511       OpFlag = X86II::MO_GOTPCREL;
13512     WrapperKind = X86ISD::WrapperRIP;
13513   } else if (Subtarget->isPICStyleGOT()) {
13514     OpFlag = X86II::MO_GOT;
13515   } else if (Subtarget->isPICStyleStubPIC()) {
13516     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13517   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13518     OpFlag = X86II::MO_DARWIN_NONLAZY;
13519   }
13520
13521   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13522
13523   SDLoc DL(Op);
13524   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13525
13526   // With PIC, the address is actually $g + Offset.
13527   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13528       !Subtarget->is64Bit()) {
13529     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13530                          DAG.getNode(X86ISD::GlobalBaseReg,
13531                                      SDLoc(), getPointerTy()),
13532                          Result);
13533   }
13534
13535   // For symbols that require a load from a stub to get the address, emit the
13536   // load.
13537   if (isGlobalStubReference(OpFlag))
13538     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13539                          MachinePointerInfo::getGOT(), false, false, false, 0);
13540
13541   return Result;
13542 }
13543
13544 SDValue
13545 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13546   // Create the TargetBlockAddressAddress node.
13547   unsigned char OpFlags =
13548     Subtarget->ClassifyBlockAddressReference();
13549   CodeModel::Model M = DAG.getTarget().getCodeModel();
13550   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13551   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13552   SDLoc dl(Op);
13553   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13554                                              OpFlags);
13555
13556   if (Subtarget->isPICStyleRIPRel() &&
13557       (M == CodeModel::Small || M == CodeModel::Kernel))
13558     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13559   else
13560     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13561
13562   // With PIC, the address is actually $g + Offset.
13563   if (isGlobalRelativeToPICBase(OpFlags)) {
13564     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13565                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13566                          Result);
13567   }
13568
13569   return Result;
13570 }
13571
13572 SDValue
13573 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13574                                       int64_t Offset, SelectionDAG &DAG) const {
13575   // Create the TargetGlobalAddress node, folding in the constant
13576   // offset if it is legal.
13577   unsigned char OpFlags =
13578       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13579   CodeModel::Model M = DAG.getTarget().getCodeModel();
13580   SDValue Result;
13581   if (OpFlags == X86II::MO_NO_FLAG &&
13582       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13583     // A direct static reference to a global.
13584     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13585     Offset = 0;
13586   } else {
13587     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13588   }
13589
13590   if (Subtarget->isPICStyleRIPRel() &&
13591       (M == CodeModel::Small || M == CodeModel::Kernel))
13592     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13593   else
13594     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13595
13596   // With PIC, the address is actually $g + Offset.
13597   if (isGlobalRelativeToPICBase(OpFlags)) {
13598     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13599                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13600                          Result);
13601   }
13602
13603   // For globals that require a load from a stub to get the address, emit the
13604   // load.
13605   if (isGlobalStubReference(OpFlags))
13606     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13607                          MachinePointerInfo::getGOT(), false, false, false, 0);
13608
13609   // If there was a non-zero offset that we didn't fold, create an explicit
13610   // addition for it.
13611   if (Offset != 0)
13612     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13613                          DAG.getConstant(Offset, getPointerTy()));
13614
13615   return Result;
13616 }
13617
13618 SDValue
13619 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13620   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13621   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13622   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13623 }
13624
13625 static SDValue
13626 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13627            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13628            unsigned char OperandFlags, bool LocalDynamic = false) {
13629   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13630   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13631   SDLoc dl(GA);
13632   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13633                                            GA->getValueType(0),
13634                                            GA->getOffset(),
13635                                            OperandFlags);
13636
13637   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13638                                            : X86ISD::TLSADDR;
13639
13640   if (InFlag) {
13641     SDValue Ops[] = { Chain,  TGA, *InFlag };
13642     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13643   } else {
13644     SDValue Ops[]  = { Chain, TGA };
13645     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13646   }
13647
13648   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13649   MFI->setAdjustsStack(true);
13650   MFI->setHasCalls(true);
13651
13652   SDValue Flag = Chain.getValue(1);
13653   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13654 }
13655
13656 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13657 static SDValue
13658 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13659                                 const EVT PtrVT) {
13660   SDValue InFlag;
13661   SDLoc dl(GA);  // ? function entry point might be better
13662   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13663                                    DAG.getNode(X86ISD::GlobalBaseReg,
13664                                                SDLoc(), PtrVT), InFlag);
13665   InFlag = Chain.getValue(1);
13666
13667   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13668 }
13669
13670 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13671 static SDValue
13672 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13673                                 const EVT PtrVT) {
13674   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13675                     X86::RAX, X86II::MO_TLSGD);
13676 }
13677
13678 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13679                                            SelectionDAG &DAG,
13680                                            const EVT PtrVT,
13681                                            bool is64Bit) {
13682   SDLoc dl(GA);
13683
13684   // Get the start address of the TLS block for this module.
13685   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13686       .getInfo<X86MachineFunctionInfo>();
13687   MFI->incNumLocalDynamicTLSAccesses();
13688
13689   SDValue Base;
13690   if (is64Bit) {
13691     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13692                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13693   } else {
13694     SDValue InFlag;
13695     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13696         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13697     InFlag = Chain.getValue(1);
13698     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13699                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13700   }
13701
13702   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13703   // of Base.
13704
13705   // Build x@dtpoff.
13706   unsigned char OperandFlags = X86II::MO_DTPOFF;
13707   unsigned WrapperKind = X86ISD::Wrapper;
13708   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13709                                            GA->getValueType(0),
13710                                            GA->getOffset(), OperandFlags);
13711   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13712
13713   // Add x@dtpoff with the base.
13714   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13715 }
13716
13717 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13718 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13719                                    const EVT PtrVT, TLSModel::Model model,
13720                                    bool is64Bit, bool isPIC) {
13721   SDLoc dl(GA);
13722
13723   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13724   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13725                                                          is64Bit ? 257 : 256));
13726
13727   SDValue ThreadPointer =
13728       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13729                   MachinePointerInfo(Ptr), false, false, false, 0);
13730
13731   unsigned char OperandFlags = 0;
13732   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13733   // initialexec.
13734   unsigned WrapperKind = X86ISD::Wrapper;
13735   if (model == TLSModel::LocalExec) {
13736     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13737   } else if (model == TLSModel::InitialExec) {
13738     if (is64Bit) {
13739       OperandFlags = X86II::MO_GOTTPOFF;
13740       WrapperKind = X86ISD::WrapperRIP;
13741     } else {
13742       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13743     }
13744   } else {
13745     llvm_unreachable("Unexpected model");
13746   }
13747
13748   // emit "addl x@ntpoff,%eax" (local exec)
13749   // or "addl x@indntpoff,%eax" (initial exec)
13750   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13751   SDValue TGA =
13752       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13753                                  GA->getOffset(), OperandFlags);
13754   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13755
13756   if (model == TLSModel::InitialExec) {
13757     if (isPIC && !is64Bit) {
13758       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13759                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13760                            Offset);
13761     }
13762
13763     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13764                          MachinePointerInfo::getGOT(), false, false, false, 0);
13765   }
13766
13767   // The address of the thread local variable is the add of the thread
13768   // pointer with the offset of the variable.
13769   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13770 }
13771
13772 SDValue
13773 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13774
13775   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13776   const GlobalValue *GV = GA->getGlobal();
13777
13778   if (Subtarget->isTargetELF()) {
13779     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13780
13781     switch (model) {
13782       case TLSModel::GeneralDynamic:
13783         if (Subtarget->is64Bit())
13784           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13785         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13786       case TLSModel::LocalDynamic:
13787         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13788                                            Subtarget->is64Bit());
13789       case TLSModel::InitialExec:
13790       case TLSModel::LocalExec:
13791         return LowerToTLSExecModel(
13792             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13793             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13794     }
13795     llvm_unreachable("Unknown TLS model.");
13796   }
13797
13798   if (Subtarget->isTargetDarwin()) {
13799     // Darwin only has one model of TLS.  Lower to that.
13800     unsigned char OpFlag = 0;
13801     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13802                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13803
13804     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13805     // global base reg.
13806     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13807                  !Subtarget->is64Bit();
13808     if (PIC32)
13809       OpFlag = X86II::MO_TLVP_PIC_BASE;
13810     else
13811       OpFlag = X86II::MO_TLVP;
13812     SDLoc DL(Op);
13813     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13814                                                 GA->getValueType(0),
13815                                                 GA->getOffset(), OpFlag);
13816     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13817
13818     // With PIC32, the address is actually $g + Offset.
13819     if (PIC32)
13820       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13821                            DAG.getNode(X86ISD::GlobalBaseReg,
13822                                        SDLoc(), getPointerTy()),
13823                            Offset);
13824
13825     // Lowering the machine isd will make sure everything is in the right
13826     // location.
13827     SDValue Chain = DAG.getEntryNode();
13828     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13829     SDValue Args[] = { Chain, Offset };
13830     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13831
13832     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13833     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13834     MFI->setAdjustsStack(true);
13835
13836     // And our return value (tls address) is in the standard call return value
13837     // location.
13838     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13839     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13840                               Chain.getValue(1));
13841   }
13842
13843   if (Subtarget->isTargetKnownWindowsMSVC() ||
13844       Subtarget->isTargetWindowsGNU()) {
13845     // Just use the implicit TLS architecture
13846     // Need to generate someting similar to:
13847     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13848     //                                  ; from TEB
13849     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13850     //   mov     rcx, qword [rdx+rcx*8]
13851     //   mov     eax, .tls$:tlsvar
13852     //   [rax+rcx] contains the address
13853     // Windows 64bit: gs:0x58
13854     // Windows 32bit: fs:__tls_array
13855
13856     SDLoc dl(GA);
13857     SDValue Chain = DAG.getEntryNode();
13858
13859     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13860     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13861     // use its literal value of 0x2C.
13862     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13863                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13864                                                              256)
13865                                         : Type::getInt32PtrTy(*DAG.getContext(),
13866                                                               257));
13867
13868     SDValue TlsArray =
13869         Subtarget->is64Bit()
13870             ? DAG.getIntPtrConstant(0x58)
13871             : (Subtarget->isTargetWindowsGNU()
13872                    ? DAG.getIntPtrConstant(0x2C)
13873                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13874
13875     SDValue ThreadPointer =
13876         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13877                     MachinePointerInfo(Ptr), false, false, false, 0);
13878
13879     // Load the _tls_index variable
13880     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13881     if (Subtarget->is64Bit())
13882       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13883                            IDX, MachinePointerInfo(), MVT::i32,
13884                            false, false, false, 0);
13885     else
13886       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13887                         false, false, false, 0);
13888
13889     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13890                                     getPointerTy());
13891     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13892
13893     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13894     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13895                       false, false, false, 0);
13896
13897     // Get the offset of start of .tls section
13898     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13899                                              GA->getValueType(0),
13900                                              GA->getOffset(), X86II::MO_SECREL);
13901     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13902
13903     // The address of the thread local variable is the add of the thread
13904     // pointer with the offset of the variable.
13905     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13906   }
13907
13908   llvm_unreachable("TLS not implemented for this target.");
13909 }
13910
13911 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13912 /// and take a 2 x i32 value to shift plus a shift amount.
13913 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13914   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13915   MVT VT = Op.getSimpleValueType();
13916   unsigned VTBits = VT.getSizeInBits();
13917   SDLoc dl(Op);
13918   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13919   SDValue ShOpLo = Op.getOperand(0);
13920   SDValue ShOpHi = Op.getOperand(1);
13921   SDValue ShAmt  = Op.getOperand(2);
13922   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13923   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13924   // during isel.
13925   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13926                                   DAG.getConstant(VTBits - 1, MVT::i8));
13927   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13928                                      DAG.getConstant(VTBits - 1, MVT::i8))
13929                        : DAG.getConstant(0, VT);
13930
13931   SDValue Tmp2, Tmp3;
13932   if (Op.getOpcode() == ISD::SHL_PARTS) {
13933     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13934     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13935   } else {
13936     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13937     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13938   }
13939
13940   // If the shift amount is larger or equal than the width of a part we can't
13941   // rely on the results of shld/shrd. Insert a test and select the appropriate
13942   // values for large shift amounts.
13943   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13944                                 DAG.getConstant(VTBits, MVT::i8));
13945   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13946                              AndNode, DAG.getConstant(0, MVT::i8));
13947
13948   SDValue Hi, Lo;
13949   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13950   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13951   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13952
13953   if (Op.getOpcode() == ISD::SHL_PARTS) {
13954     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13955     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13956   } else {
13957     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13958     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13959   }
13960
13961   SDValue Ops[2] = { Lo, Hi };
13962   return DAG.getMergeValues(Ops, dl);
13963 }
13964
13965 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13966                                            SelectionDAG &DAG) const {
13967   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13968   SDLoc dl(Op);
13969
13970   if (SrcVT.isVector()) {
13971     if (SrcVT.getVectorElementType() == MVT::i1) {
13972       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13973       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13974                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13975                                      Op.getOperand(0)));
13976     }
13977     return SDValue();
13978   }
13979
13980   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13981          "Unknown SINT_TO_FP to lower!");
13982
13983   // These are really Legal; return the operand so the caller accepts it as
13984   // Legal.
13985   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13986     return Op;
13987   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13988       Subtarget->is64Bit()) {
13989     return Op;
13990   }
13991
13992   unsigned Size = SrcVT.getSizeInBits()/8;
13993   MachineFunction &MF = DAG.getMachineFunction();
13994   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13995   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13996   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13997                                StackSlot,
13998                                MachinePointerInfo::getFixedStack(SSFI),
13999                                false, false, 0);
14000   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14001 }
14002
14003 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14004                                      SDValue StackSlot,
14005                                      SelectionDAG &DAG) const {
14006   // Build the FILD
14007   SDLoc DL(Op);
14008   SDVTList Tys;
14009   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14010   if (useSSE)
14011     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14012   else
14013     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14014
14015   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14016
14017   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14018   MachineMemOperand *MMO;
14019   if (FI) {
14020     int SSFI = FI->getIndex();
14021     MMO =
14022       DAG.getMachineFunction()
14023       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14024                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14025   } else {
14026     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14027     StackSlot = StackSlot.getOperand(1);
14028   }
14029   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14030   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14031                                            X86ISD::FILD, DL,
14032                                            Tys, Ops, SrcVT, MMO);
14033
14034   if (useSSE) {
14035     Chain = Result.getValue(1);
14036     SDValue InFlag = Result.getValue(2);
14037
14038     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14039     // shouldn't be necessary except that RFP cannot be live across
14040     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14041     MachineFunction &MF = DAG.getMachineFunction();
14042     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14043     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14044     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14045     Tys = DAG.getVTList(MVT::Other);
14046     SDValue Ops[] = {
14047       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14048     };
14049     MachineMemOperand *MMO =
14050       DAG.getMachineFunction()
14051       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14052                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14053
14054     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14055                                     Ops, Op.getValueType(), MMO);
14056     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14057                          MachinePointerInfo::getFixedStack(SSFI),
14058                          false, false, false, 0);
14059   }
14060
14061   return Result;
14062 }
14063
14064 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14065 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14066                                                SelectionDAG &DAG) const {
14067   // This algorithm is not obvious. Here it is what we're trying to output:
14068   /*
14069      movq       %rax,  %xmm0
14070      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14071      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14072      #ifdef __SSE3__
14073        haddpd   %xmm0, %xmm0
14074      #else
14075        pshufd   $0x4e, %xmm0, %xmm1
14076        addpd    %xmm1, %xmm0
14077      #endif
14078   */
14079
14080   SDLoc dl(Op);
14081   LLVMContext *Context = DAG.getContext();
14082
14083   // Build some magic constants.
14084   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14085   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14086   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14087
14088   SmallVector<Constant*,2> CV1;
14089   CV1.push_back(
14090     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14091                                       APInt(64, 0x4330000000000000ULL))));
14092   CV1.push_back(
14093     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14094                                       APInt(64, 0x4530000000000000ULL))));
14095   Constant *C1 = ConstantVector::get(CV1);
14096   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14097
14098   // Load the 64-bit value into an XMM register.
14099   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14100                             Op.getOperand(0));
14101   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14102                               MachinePointerInfo::getConstantPool(),
14103                               false, false, false, 16);
14104   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14105                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14106                               CLod0);
14107
14108   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14109                               MachinePointerInfo::getConstantPool(),
14110                               false, false, false, 16);
14111   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14112   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14113   SDValue Result;
14114
14115   if (Subtarget->hasSSE3()) {
14116     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14117     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14118   } else {
14119     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14120     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14121                                            S2F, 0x4E, DAG);
14122     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14123                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14124                          Sub);
14125   }
14126
14127   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14128                      DAG.getIntPtrConstant(0));
14129 }
14130
14131 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14132 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14133                                                SelectionDAG &DAG) const {
14134   SDLoc dl(Op);
14135   // FP constant to bias correct the final result.
14136   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14137                                    MVT::f64);
14138
14139   // Load the 32-bit value into an XMM register.
14140   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14141                              Op.getOperand(0));
14142
14143   // Zero out the upper parts of the register.
14144   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14145
14146   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14147                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14148                      DAG.getIntPtrConstant(0));
14149
14150   // Or the load with the bias.
14151   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14152                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14153                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14154                                                    MVT::v2f64, Load)),
14155                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14156                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14157                                                    MVT::v2f64, Bias)));
14158   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14159                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14160                    DAG.getIntPtrConstant(0));
14161
14162   // Subtract the bias.
14163   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14164
14165   // Handle final rounding.
14166   EVT DestVT = Op.getValueType();
14167
14168   if (DestVT.bitsLT(MVT::f64))
14169     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14170                        DAG.getIntPtrConstant(0));
14171   if (DestVT.bitsGT(MVT::f64))
14172     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14173
14174   // Handle final rounding.
14175   return Sub;
14176 }
14177
14178 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14179                                      const X86Subtarget &Subtarget) {
14180   // The algorithm is the following:
14181   // #ifdef __SSE4_1__
14182   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14183   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14184   //                                 (uint4) 0x53000000, 0xaa);
14185   // #else
14186   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14187   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14188   // #endif
14189   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14190   //     return (float4) lo + fhi;
14191
14192   SDLoc DL(Op);
14193   SDValue V = Op->getOperand(0);
14194   EVT VecIntVT = V.getValueType();
14195   bool Is128 = VecIntVT == MVT::v4i32;
14196   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14197   // If we convert to something else than the supported type, e.g., to v4f64,
14198   // abort early.
14199   if (VecFloatVT != Op->getValueType(0))
14200     return SDValue();
14201
14202   unsigned NumElts = VecIntVT.getVectorNumElements();
14203   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14204          "Unsupported custom type");
14205   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14206
14207   // In the #idef/#else code, we have in common:
14208   // - The vector of constants:
14209   // -- 0x4b000000
14210   // -- 0x53000000
14211   // - A shift:
14212   // -- v >> 16
14213
14214   // Create the splat vector for 0x4b000000.
14215   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14216   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14217                            CstLow, CstLow, CstLow, CstLow};
14218   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14219                                   makeArrayRef(&CstLowArray[0], NumElts));
14220   // Create the splat vector for 0x53000000.
14221   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14222   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14223                             CstHigh, CstHigh, CstHigh, CstHigh};
14224   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14225                                    makeArrayRef(&CstHighArray[0], NumElts));
14226
14227   // Create the right shift.
14228   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14229   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14230                              CstShift, CstShift, CstShift, CstShift};
14231   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14232                                     makeArrayRef(&CstShiftArray[0], NumElts));
14233   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14234
14235   SDValue Low, High;
14236   if (Subtarget.hasSSE41()) {
14237     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14238     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14239     SDValue VecCstLowBitcast =
14240         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14241     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14242     // Low will be bitcasted right away, so do not bother bitcasting back to its
14243     // original type.
14244     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14245                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14246     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14247     //                                 (uint4) 0x53000000, 0xaa);
14248     SDValue VecCstHighBitcast =
14249         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14250     SDValue VecShiftBitcast =
14251         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14252     // High will be bitcasted right away, so do not bother bitcasting back to
14253     // its original type.
14254     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14255                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14256   } else {
14257     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14258     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14259                                      CstMask, CstMask, CstMask);
14260     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14261     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14262     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14263
14264     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14265     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14266   }
14267
14268   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14269   SDValue CstFAdd = DAG.getConstantFP(
14270       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14271   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14272                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14273   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14274                                    makeArrayRef(&CstFAddArray[0], NumElts));
14275
14276   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14277   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14278   SDValue FHigh =
14279       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14280   //     return (float4) lo + fhi;
14281   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14282   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14283 }
14284
14285 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14286                                                SelectionDAG &DAG) const {
14287   SDValue N0 = Op.getOperand(0);
14288   MVT SVT = N0.getSimpleValueType();
14289   SDLoc dl(Op);
14290
14291   switch (SVT.SimpleTy) {
14292   default:
14293     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14294   case MVT::v4i8:
14295   case MVT::v4i16:
14296   case MVT::v8i8:
14297   case MVT::v8i16: {
14298     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14299     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14300                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14301   }
14302   case MVT::v4i32:
14303   case MVT::v8i32:
14304     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14305   }
14306   llvm_unreachable(nullptr);
14307 }
14308
14309 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14310                                            SelectionDAG &DAG) const {
14311   SDValue N0 = Op.getOperand(0);
14312   SDLoc dl(Op);
14313
14314   if (Op.getValueType().isVector())
14315     return lowerUINT_TO_FP_vec(Op, DAG);
14316
14317   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14318   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14319   // the optimization here.
14320   if (DAG.SignBitIsZero(N0))
14321     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14322
14323   MVT SrcVT = N0.getSimpleValueType();
14324   MVT DstVT = Op.getSimpleValueType();
14325   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14326     return LowerUINT_TO_FP_i64(Op, DAG);
14327   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14328     return LowerUINT_TO_FP_i32(Op, DAG);
14329   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14330     return SDValue();
14331
14332   // Make a 64-bit buffer, and use it to build an FILD.
14333   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14334   if (SrcVT == MVT::i32) {
14335     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14336     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14337                                      getPointerTy(), StackSlot, WordOff);
14338     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14339                                   StackSlot, MachinePointerInfo(),
14340                                   false, false, 0);
14341     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14342                                   OffsetSlot, MachinePointerInfo(),
14343                                   false, false, 0);
14344     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14345     return Fild;
14346   }
14347
14348   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14349   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14350                                StackSlot, MachinePointerInfo(),
14351                                false, false, 0);
14352   // For i64 source, we need to add the appropriate power of 2 if the input
14353   // was negative.  This is the same as the optimization in
14354   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14355   // we must be careful to do the computation in x87 extended precision, not
14356   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14357   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14358   MachineMemOperand *MMO =
14359     DAG.getMachineFunction()
14360     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14361                           MachineMemOperand::MOLoad, 8, 8);
14362
14363   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14364   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14365   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14366                                          MVT::i64, MMO);
14367
14368   APInt FF(32, 0x5F800000ULL);
14369
14370   // Check whether the sign bit is set.
14371   SDValue SignSet = DAG.getSetCC(dl,
14372                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14373                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14374                                  ISD::SETLT);
14375
14376   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14377   SDValue FudgePtr = DAG.getConstantPool(
14378                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14379                                          getPointerTy());
14380
14381   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14382   SDValue Zero = DAG.getIntPtrConstant(0);
14383   SDValue Four = DAG.getIntPtrConstant(4);
14384   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14385                                Zero, Four);
14386   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14387
14388   // Load the value out, extending it from f32 to f80.
14389   // FIXME: Avoid the extend by constructing the right constant pool?
14390   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14391                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14392                                  MVT::f32, false, false, false, 4);
14393   // Extend everything to 80 bits to force it to be done on x87.
14394   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14395   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14396 }
14397
14398 std::pair<SDValue,SDValue>
14399 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14400                                     bool IsSigned, bool IsReplace) const {
14401   SDLoc DL(Op);
14402
14403   EVT DstTy = Op.getValueType();
14404
14405   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14406     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14407     DstTy = MVT::i64;
14408   }
14409
14410   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14411          DstTy.getSimpleVT() >= MVT::i16 &&
14412          "Unknown FP_TO_INT to lower!");
14413
14414   // These are really Legal.
14415   if (DstTy == MVT::i32 &&
14416       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14417     return std::make_pair(SDValue(), SDValue());
14418   if (Subtarget->is64Bit() &&
14419       DstTy == MVT::i64 &&
14420       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14421     return std::make_pair(SDValue(), SDValue());
14422
14423   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14424   // stack slot, or into the FTOL runtime function.
14425   MachineFunction &MF = DAG.getMachineFunction();
14426   unsigned MemSize = DstTy.getSizeInBits()/8;
14427   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14428   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14429
14430   unsigned Opc;
14431   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14432     Opc = X86ISD::WIN_FTOL;
14433   else
14434     switch (DstTy.getSimpleVT().SimpleTy) {
14435     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14436     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14437     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14438     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14439     }
14440
14441   SDValue Chain = DAG.getEntryNode();
14442   SDValue Value = Op.getOperand(0);
14443   EVT TheVT = Op.getOperand(0).getValueType();
14444   // FIXME This causes a redundant load/store if the SSE-class value is already
14445   // in memory, such as if it is on the callstack.
14446   if (isScalarFPTypeInSSEReg(TheVT)) {
14447     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14448     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14449                          MachinePointerInfo::getFixedStack(SSFI),
14450                          false, false, 0);
14451     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14452     SDValue Ops[] = {
14453       Chain, StackSlot, DAG.getValueType(TheVT)
14454     };
14455
14456     MachineMemOperand *MMO =
14457       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14458                               MachineMemOperand::MOLoad, MemSize, MemSize);
14459     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14460     Chain = Value.getValue(1);
14461     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14462     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14463   }
14464
14465   MachineMemOperand *MMO =
14466     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14467                             MachineMemOperand::MOStore, MemSize, MemSize);
14468
14469   if (Opc != X86ISD::WIN_FTOL) {
14470     // Build the FP_TO_INT*_IN_MEM
14471     SDValue Ops[] = { Chain, Value, StackSlot };
14472     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14473                                            Ops, DstTy, MMO);
14474     return std::make_pair(FIST, StackSlot);
14475   } else {
14476     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14477       DAG.getVTList(MVT::Other, MVT::Glue),
14478       Chain, Value);
14479     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14480       MVT::i32, ftol.getValue(1));
14481     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14482       MVT::i32, eax.getValue(2));
14483     SDValue Ops[] = { eax, edx };
14484     SDValue pair = IsReplace
14485       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14486       : DAG.getMergeValues(Ops, DL);
14487     return std::make_pair(pair, SDValue());
14488   }
14489 }
14490
14491 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14492                               const X86Subtarget *Subtarget) {
14493   MVT VT = Op->getSimpleValueType(0);
14494   SDValue In = Op->getOperand(0);
14495   MVT InVT = In.getSimpleValueType();
14496   SDLoc dl(Op);
14497
14498   // Optimize vectors in AVX mode:
14499   //
14500   //   v8i16 -> v8i32
14501   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14502   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14503   //   Concat upper and lower parts.
14504   //
14505   //   v4i32 -> v4i64
14506   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14507   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14508   //   Concat upper and lower parts.
14509   //
14510
14511   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14512       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14513       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14514     return SDValue();
14515
14516   if (Subtarget->hasInt256())
14517     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14518
14519   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14520   SDValue Undef = DAG.getUNDEF(InVT);
14521   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14522   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14523   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14524
14525   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14526                              VT.getVectorNumElements()/2);
14527
14528   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14529   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14530
14531   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14532 }
14533
14534 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14535                                         SelectionDAG &DAG) {
14536   MVT VT = Op->getSimpleValueType(0);
14537   SDValue In = Op->getOperand(0);
14538   MVT InVT = In.getSimpleValueType();
14539   SDLoc DL(Op);
14540   unsigned int NumElts = VT.getVectorNumElements();
14541   if (NumElts != 8 && NumElts != 16)
14542     return SDValue();
14543
14544   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14545     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14546
14547   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14548   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14549   // Now we have only mask extension
14550   assert(InVT.getVectorElementType() == MVT::i1);
14551   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14552   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14553   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14554   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14555   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14556                            MachinePointerInfo::getConstantPool(),
14557                            false, false, false, Alignment);
14558
14559   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14560   if (VT.is512BitVector())
14561     return Brcst;
14562   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14563 }
14564
14565 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14566                                SelectionDAG &DAG) {
14567   if (Subtarget->hasFp256()) {
14568     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14569     if (Res.getNode())
14570       return Res;
14571   }
14572
14573   return SDValue();
14574 }
14575
14576 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14577                                 SelectionDAG &DAG) {
14578   SDLoc DL(Op);
14579   MVT VT = Op.getSimpleValueType();
14580   SDValue In = Op.getOperand(0);
14581   MVT SVT = In.getSimpleValueType();
14582
14583   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14584     return LowerZERO_EXTEND_AVX512(Op, DAG);
14585
14586   if (Subtarget->hasFp256()) {
14587     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14588     if (Res.getNode())
14589       return Res;
14590   }
14591
14592   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14593          VT.getVectorNumElements() != SVT.getVectorNumElements());
14594   return SDValue();
14595 }
14596
14597 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14598   SDLoc DL(Op);
14599   MVT VT = Op.getSimpleValueType();
14600   SDValue In = Op.getOperand(0);
14601   MVT InVT = In.getSimpleValueType();
14602
14603   if (VT == MVT::i1) {
14604     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14605            "Invalid scalar TRUNCATE operation");
14606     if (InVT.getSizeInBits() >= 32)
14607       return SDValue();
14608     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14609     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14610   }
14611   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14612          "Invalid TRUNCATE operation");
14613
14614   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14615     if (VT.getVectorElementType().getSizeInBits() >=8)
14616       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14617
14618     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14619     unsigned NumElts = InVT.getVectorNumElements();
14620     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14621     if (InVT.getSizeInBits() < 512) {
14622       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14623       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14624       InVT = ExtVT;
14625     }
14626
14627     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14628     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14629     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14630     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14631     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14632                            MachinePointerInfo::getConstantPool(),
14633                            false, false, false, Alignment);
14634     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14635     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14636     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14637   }
14638
14639   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14640     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14641     if (Subtarget->hasInt256()) {
14642       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14643       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14644       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14645                                 ShufMask);
14646       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14647                          DAG.getIntPtrConstant(0));
14648     }
14649
14650     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14651                                DAG.getIntPtrConstant(0));
14652     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14653                                DAG.getIntPtrConstant(2));
14654     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14655     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14656     static const int ShufMask[] = {0, 2, 4, 6};
14657     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14658   }
14659
14660   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14661     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14662     if (Subtarget->hasInt256()) {
14663       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14664
14665       SmallVector<SDValue,32> pshufbMask;
14666       for (unsigned i = 0; i < 2; ++i) {
14667         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14668         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14669         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14673         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14674         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14675         for (unsigned j = 0; j < 8; ++j)
14676           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14677       }
14678       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14679       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14680       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14681
14682       static const int ShufMask[] = {0,  2,  -1,  -1};
14683       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14684                                 &ShufMask[0]);
14685       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14686                        DAG.getIntPtrConstant(0));
14687       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14688     }
14689
14690     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14691                                DAG.getIntPtrConstant(0));
14692
14693     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14694                                DAG.getIntPtrConstant(4));
14695
14696     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14697     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14698
14699     // The PSHUFB mask:
14700     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14701                                    -1, -1, -1, -1, -1, -1, -1, -1};
14702
14703     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14704     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14705     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14706
14707     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14708     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14709
14710     // The MOVLHPS Mask:
14711     static const int ShufMask2[] = {0, 1, 4, 5};
14712     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14713     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14714   }
14715
14716   // Handle truncation of V256 to V128 using shuffles.
14717   if (!VT.is128BitVector() || !InVT.is256BitVector())
14718     return SDValue();
14719
14720   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14721
14722   unsigned NumElems = VT.getVectorNumElements();
14723   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14724
14725   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14726   // Prepare truncation shuffle mask
14727   for (unsigned i = 0; i != NumElems; ++i)
14728     MaskVec[i] = i * 2;
14729   SDValue V = DAG.getVectorShuffle(NVT, DL,
14730                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14731                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14732   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14733                      DAG.getIntPtrConstant(0));
14734 }
14735
14736 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14737                                            SelectionDAG &DAG) const {
14738   assert(!Op.getSimpleValueType().isVector());
14739
14740   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14741     /*IsSigned=*/ true, /*IsReplace=*/ false);
14742   SDValue FIST = Vals.first, StackSlot = Vals.second;
14743   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14744   if (!FIST.getNode()) return Op;
14745
14746   if (StackSlot.getNode())
14747     // Load the result.
14748     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14749                        FIST, StackSlot, MachinePointerInfo(),
14750                        false, false, false, 0);
14751
14752   // The node is the result.
14753   return FIST;
14754 }
14755
14756 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14757                                            SelectionDAG &DAG) const {
14758   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14759     /*IsSigned=*/ false, /*IsReplace=*/ false);
14760   SDValue FIST = Vals.first, StackSlot = Vals.second;
14761   assert(FIST.getNode() && "Unexpected failure");
14762
14763   if (StackSlot.getNode())
14764     // Load the result.
14765     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14766                        FIST, StackSlot, MachinePointerInfo(),
14767                        false, false, false, 0);
14768
14769   // The node is the result.
14770   return FIST;
14771 }
14772
14773 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14774   SDLoc DL(Op);
14775   MVT VT = Op.getSimpleValueType();
14776   SDValue In = Op.getOperand(0);
14777   MVT SVT = In.getSimpleValueType();
14778
14779   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14780
14781   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14782                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14783                                  In, DAG.getUNDEF(SVT)));
14784 }
14785
14786 /// Lower FABS or FNEG to a bitwise op against a constant-pool mask. The only
14787 /// differences between the two are the mask and the logic op; FNEG also has a
      /// folding opportunity for FNEG(FABS(x)), which becomes a single OR with the
      /// sign-bit mask applied to x. Returns Op itself (unchanged) for a FABS that
      /// still has an FNEG user.
14788 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14789   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14790          "Wrong opcode for lowering FABS or FNEG.");
14791
14792   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14793
14794   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14795   // into an FNABS. We'll lower the FABS after that if it is still in use.
14796   if (IsFABS)
14797     for (SDNode *User : Op->uses())
14798       if (User->getOpcode() == ISD::FNEG)
14799         return Op;
14800
14801   SDValue Op0 = Op.getOperand(0);
        // FNEG whose operand is a FABS: handled below as one OR on the FABS operand.
14802   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14803
14804   SDLoc dl(Op);
14805   MVT VT = Op.getSimpleValueType();
14806   // Assume scalar op for initialization; update for vector if needed.
14807   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14808   // generate a 16-byte vector constant and logic op even for the scalar case.
14809   // Using a 16-byte mask allows folding the load of the mask with
14810   // the logic op, so it can save (~4 bytes) on code size.
14811   MVT EltVT = VT;
14812   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14813   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14814   // decide if we should generate a 16-byte constant mask when we only need 4 or
14815   // 8 bytes for the scalar case.
14816   if (VT.isVector()) {
14817     EltVT = VT.getVectorElementType();
14818     NumElts = VT.getVectorNumElements();
14819   }
14820
14821   unsigned EltBits = EltVT.getSizeInBits();
14822   LLVMContext *Context = DAG.getContext();
14823   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14824   APInt MaskElt =
14825     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
        // Splat the per-element mask, place it in the constant pool and load it
        // back with the pool entry's own alignment.
14826   Constant *C = ConstantInt::get(*Context, MaskElt);
14827   C = ConstantVector::getSplat(NumElts, C);
14828   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14829   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14830   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14831   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14832                              MachinePointerInfo::getConstantPool(),
14833                              false, false, false, Alignment);
14834
14835   if (VT.isVector()) {
14836     // For a vector, cast operands to a vector type, perform the logic op,
14837     // and cast the result back to the original value type.
14838     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14839     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14840     SDValue Operand = IsFNABS ?
14841       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14842       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
          // FABS clears the sign (AND), FNABS forces it (OR), FNEG flips it (XOR).
14843     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14844     return DAG.getNode(ISD::BITCAST, dl, VT,
14845                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14846   }
14847
14848   // If not vector, then scalar.
14849   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14850   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14851   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14852 }
14853
14854 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
        // Lower FCOPYSIGN: operand 0 supplies the magnitude and operand 1 the sign.
        // The result is built as (magnitude with sign cleared) FOR (sign bit of
        // operand 1), using masks loaded from 16-byte-aligned constant-pool
        // entries.
14855   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14856   LLVMContext *Context = DAG.getContext();
14857   SDValue Op0 = Op.getOperand(0);
14858   SDValue Op1 = Op.getOperand(1);
14859   SDLoc dl(Op);
14860   MVT VT = Op.getSimpleValueType();
14861   MVT SrcVT = Op1.getSimpleValueType();
14862
14863   // If second operand is smaller, extend it first.
14864   if (SrcVT.bitsLT(VT)) {
14865     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14866     SrcVT = VT;
14867   }
14868   // And if it is bigger, shrink it first.
14869   if (SrcVT.bitsGT(VT)) {
14870     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14871     SrcVT = VT;
14872   }
14873
14874   // At this point the operands and the result should have the same
14875   // type, and that won't be f80 since that is not custom lowered.
14876
14877   const fltSemantics &Sem =
14878       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14879   const unsigned SizeInBits = VT.getSizeInBits();
14880
        // Constant vector of 2 (f64) or 4 (otherwise) zero elements; only element 0
        // is overwritten below, and the same buffer is reused for both masks.
14881   SmallVector<Constant *, 4> CV(
14882       VT == MVT::f64 ? 2 : 4,
14883       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14884
14885   // First, clear all bits but the sign bit from the second operand (sign).
14886   CV[0] = ConstantFP::get(*Context,
14887                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14888   Constant *C = ConstantVector::get(CV);
14889   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14890   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14891                               MachinePointerInfo::getConstantPool(),
14892                               false, false, false, 16);
14893   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14894
14895   // Next, clear the sign bit from the first operand (magnitude).
14896   // If it's a constant, we can clear it here.
14897   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14898     APFloat APF = Op0CN->getValueAPF();
14899     // If the magnitude is a positive zero, the sign bit alone is enough.
14900     if (APF.isPosZero())
14901       return SignBit;
14902     APF.clearSign();
          // The sign-cleared constant itself becomes element 0 of the pool entry.
14903     CV[0] = ConstantFP::get(*Context, APF);
14904   } else {
          // Non-constant magnitude: element 0 is the all-but-sign-bit mask.
14905     CV[0] = ConstantFP::get(
14906         *Context,
14907         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14908   }
14909   C = ConstantVector::get(CV);
14910   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14911   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14912                             MachinePointerInfo::getConstantPool(),
14913                             false, false, false, 16);
14914   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14915   if (!isa<ConstantFPSDNode>(Op0))
14916     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14917
14918   // OR the magnitude value with the sign bit.
14919   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14920 }
14921
14922 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14923   SDValue N0 = Op.getOperand(0);
14924   SDLoc dl(Op);
14925   MVT VT = Op.getSimpleValueType();
14926
14927   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14928   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14929                                   DAG.getConstant(1, VT));
14930   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14931 }
14932
14933 // Check whether an OR'd tree is PTEST-able.
14934 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14935                                       SelectionDAG &DAG) {
14936   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14937
14938   if (!Subtarget->hasSSE41())
14939     return SDValue();
14940
14941   if (!Op->hasOneUse())
14942     return SDValue();
14943
14944   SDNode *N = Op.getNode();
14945   SDLoc DL(N);
14946
14947   SmallVector<SDValue, 8> Opnds;
14948   DenseMap<SDValue, unsigned> VecInMap;
14949   SmallVector<SDValue, 8> VecIns;
14950   EVT VT = MVT::Other;
14951
14952   // Recognize a special case where a vector is casted into wide integer to
14953   // test all 0s.
14954   Opnds.push_back(N->getOperand(0));
14955   Opnds.push_back(N->getOperand(1));
14956
14957   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14958     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14959     // BFS traverse all OR'd operands.
14960     if (I->getOpcode() == ISD::OR) {
14961       Opnds.push_back(I->getOperand(0));
14962       Opnds.push_back(I->getOperand(1));
14963       // Re-evaluate the number of nodes to be traversed.
14964       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14965       continue;
14966     }
14967
14968     // Quit if a non-EXTRACT_VECTOR_ELT
14969     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14970       return SDValue();
14971
14972     // Quit if without a constant index.
14973     SDValue Idx = I->getOperand(1);
14974     if (!isa<ConstantSDNode>(Idx))
14975       return SDValue();
14976
14977     SDValue ExtractedFromVec = I->getOperand(0);
14978     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14979     if (M == VecInMap.end()) {
14980       VT = ExtractedFromVec.getValueType();
14981       // Quit if not 128/256-bit vector.
14982       if (!VT.is128BitVector() && !VT.is256BitVector())
14983         return SDValue();
14984       // Quit if not the same type.
14985       if (VecInMap.begin() != VecInMap.end() &&
14986           VT != VecInMap.begin()->first.getValueType())
14987         return SDValue();
14988       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14989       VecIns.push_back(ExtractedFromVec);
14990     }
14991     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14992   }
14993
14994   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14995          "Not extracted from 128-/256-bit vector.");
14996
14997   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14998
14999   for (DenseMap<SDValue, unsigned>::const_iterator
15000         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15001     // Quit if not all elements are used.
15002     if (I->second != FullMask)
15003       return SDValue();
15004   }
15005
15006   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15007
15008   // Cast all vectors into TestVT for PTEST.
15009   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15010     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15011
15012   // If more than one full vectors are evaluated, OR them first before PTEST.
15013   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15014     // Each iteration will OR 2 nodes and append the result until there is only
15015     // 1 node left, i.e. the final OR'd value of all vectors.
15016     SDValue LHS = VecIns[Slot];
15017     SDValue RHS = VecIns[Slot + 1];
15018     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15019   }
15020
15021   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15022                      VecIns.back(), VecIns.back());
15023 }
15024
15025 /// \brief return true if \c Op has a use that doesn't just read flags.
15026 static bool hasNonFlagsUse(SDValue Op) {
15027   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15028        ++UI) {
15029     SDNode *User = *UI;
15030     unsigned UOpNo = UI.getOperandNo();
15031     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15032       // Look pass truncate.
15033       UOpNo = User->use_begin().getOperandNo();
15034       User = *User->use_begin();
15035     }
15036
15037     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15038         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15039       return true;
15040   }
15041   return false;
15042 }
15043
15044 /// Emit nodes that will be selected as "test Op0,Op0", or something
15045 /// equivalent.
      ///
      /// \p Op is the value compared against zero and \p X86CC the condition code
      /// that will consume the flags. Depending on the opcode of \p Op this either
      /// emits an X86ISD::CMP against 0, reuses result 1 of an existing
      /// flag-producing X86ISD node, or replaces an arithmetic node with its
      /// X86ISD counterpart and returns that node's result 1.
15046 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15047                                     SelectionDAG &DAG) const {
15048   if (Op.getValueType() == MVT::i1)
15049     // KORTEST instruction should be selected
15050     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15051                        DAG.getConstant(0, Op.getValueType()));
15052
15053   // CF and OF aren't always set the way we want. Determine which
15054   // of these we need.
15055   bool NeedCF = false;
15056   bool NeedOF = false;
15057   switch (X86CC) {
15058   default: break;
15059   case X86::COND_A: case X86::COND_AE:
15060   case X86::COND_B: case X86::COND_BE:
15061     NeedCF = true;
15062     break;
15063   case X86::COND_G: case X86::COND_GE:
15064   case X86::COND_L: case X86::COND_LE:
15065   case X86::COND_O: case X86::COND_NO: {
15066     // Check if we really need to set the
15067     // Overflow flag. If NoSignedWrap is present
15068     // that is not actually needed.
15069     switch (Op->getOpcode()) {
15070     case ISD::ADD:
15071     case ISD::SUB:
15072     case ISD::MUL:
15073     case ISD::SHL: {
15074       const BinaryWithFlagsSDNode *BinNode =
15075           cast<BinaryWithFlagsSDNode>(Op.getNode());
15076       if (BinNode->hasNoSignedWrap())
15077         break;
            // Without the no-signed-wrap flag, control falls through into the
            // default label below and NeedOF is set.
15078     }
15079     default:
15080       NeedOF = true;
15081       break;
15082     }
15083     break;
15084   }
15085   }
15086   // See if we can use the EFLAGS value from the operand instead of
15087   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15088   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15089   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15090     // Emit a CMP with 0, which is the TEST pattern.
15091     //if (Op.getValueType() == MVT::i1)
15092     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15093     //                     DAG.getConstant(0, MVT::i1));
15094     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15095                        DAG.getConstant(0, Op.getValueType()));
15096   }
        // Opcode stays 0 unless a flag-producing replacement is chosen below;
        // NumOperands is the number of operands copied onto that replacement.
15097   unsigned Opcode = 0;
15098   unsigned NumOperands = 0;
15099
15100   // Truncate operations may prevent the merge of the SETCC instruction
15101   // and the arithmetic instruction before it. Attempt to truncate the operands
15102   // of the arithmetic instruction and use a reduced bit-width instruction.
15103   bool NeedTruncation = false;
15104   SDValue ArithOp = Op;
15105   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15106     SDValue Arith = Op->getOperand(0);
15107     // Both the trunc and the arithmetic op need to have one user each.
15108     if (Arith->hasOneUse())
15109       switch (Arith.getOpcode()) {
15110         default: break;
15111         case ISD::ADD:
15112         case ISD::SUB:
15113         case ISD::AND:
15114         case ISD::OR:
15115         case ISD::XOR: {
15116           NeedTruncation = true;
15117           ArithOp = Arith;
15118         }
15119       }
15120   }
15121
15122   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15123   // which may be the result of a CAST.  We use the variable 'Op', which is the
15124   // non-casted variable when we check for possible users.
15125   switch (ArithOp.getOpcode()) {
15126   case ISD::ADD:
15127     // Due to an isel shortcoming, be conservative if this add is likely to be
15128     // selected as part of a load-modify-store instruction. When the root node
15129     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15130     // uses of other nodes in the match, such as the ADD in this case. This
15131     // leads to the ADD being left around and reselected, with the result being
15132     // two adds in the output.  Alas, even if none our users are stores, that
15133     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15134     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15135     // climbing the DAG back to the root, and it doesn't seem to be worth the
15136     // effort.
          // NOTE(review): the check below also lets STORE users through, which the
          // comment above does not mention -- confirm the intent.
15137     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15138          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15139       if (UI->getOpcode() != ISD::CopyToReg &&
15140           UI->getOpcode() != ISD::SETCC &&
15141           UI->getOpcode() != ISD::STORE)
15142         goto default_case;
15143
15144     if (ConstantSDNode *C =
15145         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15146       // An add of one will be selected as an INC.
15147       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15148         Opcode = X86ISD::INC;
15149         NumOperands = 1;
15150         break;
15151       }
15152
15153       // An add of negative one (subtract of one) will be selected as a DEC.
15154       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15155         Opcode = X86ISD::DEC;
15156         NumOperands = 1;
15157         break;
15158       }
15159     }
15160
15161     // Otherwise use a regular EFLAGS-setting add.
15162     Opcode = X86ISD::ADD;
15163     NumOperands = 2;
15164     break;
15165   case ISD::SHL:
15166   case ISD::SRL:
15167     // If we have a constant logical shift that's only used in a comparison
15168     // against zero turn it into an equivalent AND. This allows turning it into
15169     // a TEST instruction later.
15170     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15171         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15172       EVT VT = Op.getValueType();
15173       unsigned BitWidth = VT.getSizeInBits();
15174       unsigned ShAmt = Op->getConstantOperandVal(1);
15175       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15176         break;
            // The mask keeps exactly the source bits that survive the shift: the
            // high bits for SRL, the low bits for SHL.
15177       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15178                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15179                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15180       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15181         break;
15182       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15183                                 DAG.getConstant(Mask, VT));
15184       DAG.ReplaceAllUsesWith(Op, New);
15185       Op = New;
15186     }
          // Opcode is still 0 here, so the final CMP-with-0 path is taken.
15187     break;
15188
15189   case ISD::AND:
15190     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
15191     // because a TEST instruction will be better.
15192     if (!hasNonFlagsUse(Op))
15193       break;
15194     // FALL THROUGH
15195   case ISD::SUB:
15196   case ISD::OR:
15197   case ISD::XOR:
15198     // Due to the ISEL shortcoming noted above, be conservative if this op is
15199     // likely to be selected as part of a load-modify-store instruction.
15200     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15201            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15202       if (UI->getOpcode() == ISD::STORE)
15203         goto default_case;
15204
15205     // Otherwise use a regular EFLAGS-setting instruction.
15206     switch (ArithOp.getOpcode()) {
15207     default: llvm_unreachable("unexpected operator!");
15208     case ISD::SUB: Opcode = X86ISD::SUB; break;
15209     case ISD::XOR: Opcode = X86ISD::XOR; break;
15210     case ISD::AND: Opcode = X86ISD::AND; break;
15211     case ISD::OR: {
            // For (in)equality tests, first try to turn an OR tree of vector
            // element extracts into a single PTEST.
15212       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15213         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15214         if (EFLAGS.getNode())
15215           return EFLAGS;
15216       }
15217       Opcode = X86ISD::OR;
15218       break;
15219     }
15220     }
15221
15222     NumOperands = 2;
15223     break;
15224   case X86ISD::ADD:
15225   case X86ISD::SUB:
15226   case X86ISD::INC:
15227   case X86ISD::DEC:
15228   case X86ISD::OR:
15229   case X86ISD::XOR:
15230   case X86ISD::AND:
          // These nodes already carry a second result; hand back result 1 directly.
15231     return SDValue(Op.getNode(), 1);
15232   default:
15233   default_case:
15234     break;
15235   }
15236
15237   // If we found that truncation is beneficial, perform the truncation and
15238   // update 'Op'.
15239   if (NeedTruncation) {
15240     EVT VT = Op.getValueType();
15241     SDValue WideVal = Op->getOperand(0);
15242     EVT WideVT = WideVal.getValueType();
15243     unsigned ConvertedOp = 0;
15244     // Use a target machine opcode to prevent further DAGCombine
15245     // optimizations that may separate the arithmetic operations
15246     // from the setcc node.
15247     switch (WideVal.getOpcode()) {
15248       default: break;
15249       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15250       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15251       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15252       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15253       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15254     }
15255
15256     if (ConvertedOp) {
15257       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
            // Only narrow when the wide operation is legal; both operands are
            // truncated to VT and the op is redone at that width.
15258       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15259         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15260         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15261         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15262       }
15263     }
15264   }
15265
15266   if (Opcode == 0)
15267     // Emit a CMP with 0, which is the TEST pattern.
15268     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15269                        DAG.getConstant(0, Op.getValueType()));
15270
        // Build the replacement with two results (the value type and an i32),
        // redirect all users of Op to it, and return its result 1.
15271   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15272   SmallVector<SDValue, 4> Ops;
15273   for (unsigned i = 0; i != NumOperands; ++i)
15274     Ops.push_back(Op.getOperand(i));
15275
15276   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15277   DAG.ReplaceAllUsesWith(Op, New);
15278   return SDValue(New.getNode(), 1);
15279 }
15280
15281 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15282 /// equivalent.
15283 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15284                                    SDLoc dl, SelectionDAG &DAG) const {
15285   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15286     if (C->getAPIntValue() == 0)
15287       return EmitTest(Op0, X86CC, dl, DAG);
15288
15289      if (Op0.getValueType() == MVT::i1)
15290        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15291   }
15292
15293   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15294        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15295     // Do the comparison at i32 if it's smaller, besides the Atom case.
15296     // This avoids subregister aliasing issues. Keep the smaller reference
15297     // if we're optimizing for size, however, as that'll allow better folding
15298     // of memory operations.
15299     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15300         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15301              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15302         !Subtarget->isAtom()) {
15303       unsigned ExtendOp =
15304           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15305       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15306       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15307     }
15308     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15309     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15310     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15311                               Op0, Op1);
15312     return SDValue(Sub.getNode(), 1);
15313   }
15314   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15315 }
15316
15317 /// Convert a comparison if required by the subtarget.
      ///
      /// \p Cmp is returned untouched when the subtarget reports CMOV support, when
      /// \p Cmp is not an X86ISD::CMP node, or when either of its two operands is
      /// not a floating-point value. In the remaining case the compare result is
      /// routed through FNSTSW16r, shifted right by 8, truncated to i8 and fed to
      /// SAHF; that SAHF node (typed i32) is what gets returned.
15318 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15319                                                  SelectionDAG &DAG) const {
15320   // If the subtarget does not support the FUCOMI instruction, floating-point
15321   // comparisons have to be converted.
        // NOTE(review): the test below is hasCMov(), not a dedicated FUCOMI
        // predicate; presumably both features arrive together -- confirm.
15322   if (Subtarget->hasCMov() ||
15323       Cmp.getOpcode() != X86ISD::CMP ||
15324       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15325       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15326     return Cmp;
15327
15328   // The instruction selector will select an FUCOM instruction instead of
15329   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15330   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15331   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15332   SDLoc dl(Cmp);
15333   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15334   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
        // Move the upper byte of the 16-bit status word into the low byte so the
        // i8 truncate below keeps exactly that byte.
15335   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15336                             DAG.getConstant(8, MVT::i8));
15337   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15338   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15339 }
15340
15341 /// The minimum architected relative accuracy is 2^-12. We need one
15342 /// Newton-Raphson step to have a good float result (24 bits of precision).
      ///
      /// Builds an X86ISD::FRSQRT node for \p Op when the subtarget opts in via
      /// useSqrtEst() and the value type is f32/v4f32 (SSE1) or v8f32 (AVX).
      /// On success \p RefinementSteps is set to 1 and \p UseOneConstNR to false.
      /// In every other case an empty SDValue is returned and the two output
      /// parameters are left unmodified.
15343 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15344                                             DAGCombinerInfo &DCI,
15345                                             unsigned &RefinementSteps,
15346                                             bool &UseOneConstNR) const {
15347   // FIXME: We should use instruction latency models to calculate the cost of
15348   // each potential sequence, but this is very hard to do reliably because
15349   // at least Intel's Core* chips have variable timing based on the number of
15350   // significant digits in the divisor and/or sqrt operand.
15351   if (!Subtarget->useSqrtEst())
15352     return SDValue();
15353
15354   EVT VT = Op.getValueType();
15355
15356   // SSE1 has rsqrtss and rsqrtps.
15357   // TODO: Add support for AVX512 (v16f32).
15358   // It is likely not profitable to do this for f64 because a double-precision
15359   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15360   // instructions: convert to single, rsqrtss, convert back to double, refine
15361   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15362   // along with FMA, this could be a throughput win.
15363   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15364       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15365     RefinementSteps = 1;
15366     UseOneConstNR = false;
15367     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15368   }
        // Unsupported type/feature combination: no estimate is offered.
15369   return SDValue();
15370 }
15371
15372 /// The minimum architected relative accuracy is 2^-12. We need one
15373 /// Newton-Raphson step to have a good float result (24 bits of precision).
      ///
      /// Builds an X86ISD::FRCP node for \p Op when the subtarget opts in via
      /// useReciprocalEst() and the value type is f32/v4f32 (SSE1) or v8f32 (AVX).
      /// On success \p RefinementSteps is set from
      /// ReciprocalEstimateRefinementSteps (declared outside this function).
      /// Otherwise an empty SDValue is returned and \p RefinementSteps is left
      /// unmodified.
15374 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15375                                             DAGCombinerInfo &DCI,
15376                                             unsigned &RefinementSteps) const {
15377   // FIXME: We should use instruction latency models to calculate the cost of
15378   // each potential sequence, but this is very hard to do reliably because
15379   // at least Intel's Core* chips have variable timing based on the number of
15380   // significant digits in the divisor.
15381   if (!Subtarget->useReciprocalEst())
15382     return SDValue();
15383
15384   EVT VT = Op.getValueType();
15385
15386   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15387   // TODO: Add support for AVX512 (v16f32).
15388   // It is likely not profitable to do this for f64 because a double-precision
15389   // reciprocal estimate with refinement on x86 prior to FMA requires
15390   // 15 instructions: convert to single, rcpss, convert back to double, refine
15391   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15392   // along with FMA, this could be a throughput win.
15393   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15394       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15395     RefinementSteps = ReciprocalEstimateRefinementSteps;
15396     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15397   }
        // Unsupported type/feature combination: no estimate is offered.
15398   return SDValue();
15399 }
15400
      /// Return true if \p V is a ConstantSDNode whose value has every bit set.
      /// Any non-constant node yields false.
15401 static bool isAllOnes(SDValue V) {
15402   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15403   return C && C->isAllOnesValue();
15404 }
15405
15406 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15407 /// if it's possible.
      ///
      /// Shapes recognised on the two operands of \p And (an ISD::TRUNCATE on
      /// either operand is looked through first):
      ///   (1 << N) & X, in either operand order           -> BT(X, N)
      ///   (X >>u N) & 1, constant as second operand       -> BT(X, N)
      ///   X & C, C a power of two that is not a uint32    -> BT(X, log2(C))
      /// On a match the result is an i8 X86ISD::SETCC over the BT flags, using
      /// X86::COND_AE when \p CC is ISD::SETEQ and X86::COND_B for any other
      /// \p CC. An empty SDValue is returned when no shape matches.
15408 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15409                                      SDLoc dl, SelectionDAG &DAG) const {
15410   SDValue Op0 = And.getOperand(0);
15411   SDValue Op1 = And.getOperand(1);
15412   if (Op0.getOpcode() == ISD::TRUNCATE)
15413     Op0 = Op0.getOperand(0);
15414   if (Op1.getOpcode() == ISD::TRUNCATE)
15415     Op1 = Op1.getOperand(0);
15416
        // LHS/RHS become the value and bit index for BT; both stay null unless one
        // of the shapes below matches.
15417   SDValue LHS, RHS;
        // Canonicalise so that a shift-left, if present, is always Op0.
15418   if (Op1.getOpcode() == ISD::SHL)
15419     std::swap(Op0, Op1);
15420   if (Op0.getOpcode() == ISD::SHL) {
15421     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15422       if (And00C->getZExtValue() == 1) {
15423         // If we looked past a truncate, check that it's only truncating away
15424         // known zeros.
15425         unsigned BitWidth = Op0.getValueSizeInBits();
15426         unsigned AndBitWidth = And.getValueSizeInBits();
15427         if (BitWidth > AndBitWidth) {
15428           APInt Zeros, Ones;
15429           DAG.computeKnownBits(Op0, Zeros, Ones);
15430           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15431             return SDValue();
15432         }
15433         LHS = Op1;
15434         RHS = Op0.getOperand(1);
15435       }
15436   } else if (Op1.getOpcode() == ISD::Constant) {
15437     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15438     uint64_t AndRHSVal = AndRHS->getZExtValue();
15439     SDValue AndLHS = Op0;
15440
15441     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15442       LHS = AndLHS.getOperand(0);
15443       RHS = AndLHS.getOperand(1);
15444     }
15445
15446     // Use BT if the immediate can't be encoded in a TEST instruction.
15447     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15448       LHS = AndLHS;
15449       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15450     }
15451   }
15452
15453   if (LHS.getNode()) {
15454     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15455     // instruction.  Since the shift amount is in-range-or-undefined, we know
15456     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15457     // the encoding for the i16 version is larger than the i32 version.
15458     // Also promote i16 to i32 for performance / code size reason.
15459     if (LHS.getValueType() == MVT::i8 ||
15460         LHS.getValueType() == MVT::i16)
15461       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15462
15463     // If the operand types disagree, extend the shift amount to match.  Since
15464     // BT ignores high bits (like shifts) we can use anyextend.
15465     if (LHS.getValueType() != RHS.getValueType())
15466       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15467
15468     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
          // SETEQ ("tested bit is zero") maps to COND_AE; every other CC value
          // maps to COND_B.
15469     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15470     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15471                        DAG.getConstant(Cond, MVT::i8), BT);
15472   }
15473
15474   return SDValue();
15475 }
15476
15477 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15478 /// mask CMPs.
      ///
      /// \p Op0 and \p Op1 are taken by reference and are exchanged in place for
      /// the conditions that are expressed through the mirrored predicate
      /// (GT/GE/ULE/ULT variants). The return value is 0..7 per the table below,
      /// or 8 for SETUEQ and SETONE, which have no single predicate in that table.
      /// Any condition code not listed in the switch hits llvm_unreachable.
      /// NOTE(review): the local is unsigned while the return type is int; all
      /// produced values are in 0..8 so the conversion is value-preserving.
15479 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15480                               SDValue &Op1) {
15481   unsigned SSECC;
15482   bool Swap = false;
15483
15484   // SSE Condition code mapping:
15485   //  0 - EQ
15486   //  1 - LT
15487   //  2 - LE
15488   //  3 - UNORD
15489   //  4 - NEQ
15490   //  5 - NLT
15491   //  6 - NLE
15492   //  7 - ORD
15493   switch (SetCCOpcode) {
15494   default: llvm_unreachable("Unexpected SETCC condition");
15495   case ISD::SETOEQ:
15496   case ISD::SETEQ:  SSECC = 0; break;
15497   case ISD::SETOGT:
15498   case ISD::SETGT:  Swap = true; // Fallthrough
15499   case ISD::SETLT:
15500   case ISD::SETOLT: SSECC = 1; break;
15501   case ISD::SETOGE:
15502   case ISD::SETGE:  Swap = true; // Fallthrough
15503   case ISD::SETLE:
15504   case ISD::SETOLE: SSECC = 2; break;
15505   case ISD::SETUO:  SSECC = 3; break;
15506   case ISD::SETUNE:
15507   case ISD::SETNE:  SSECC = 4; break;
15508   case ISD::SETULE: Swap = true; // Fallthrough
15509   case ISD::SETUGE: SSECC = 5; break;
15510   case ISD::SETULT: Swap = true; // Fallthrough
15511   case ISD::SETUGT: SSECC = 6; break;
15512   case ISD::SETO:   SSECC = 7; break;
        // 8 is outside the table above; it flags the two conditions that need
        // special handling by the caller.
15513   case ISD::SETUEQ:
15514   case ISD::SETONE: SSECC = 8; break;
15515   }
15516   if (Swap)
15517     std::swap(Op0, Op1);
15518
15519   return SSECC;
15520 }
15521
15522 // Lower256IntVSETCC - Break a 256-bit integer VSETCC into two 128-bit
15523 // ones, and then concatenate the results back.
      // Both halves reuse the original opcode and condition-code operand; the
      // result type of each half has the same element type and half the elements.
15524 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15525   MVT VT = Op.getSimpleValueType();
15526
15527   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15528          "Unsupported value type for operation");
15529
15530   unsigned NumElems = VT.getVectorNumElements();
15531   SDLoc dl(Op);
15532   SDValue CC = Op.getOperand(2);
15533
15534   // Extract the LHS vectors
15535   SDValue LHS = Op.getOperand(0);
15536   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15537   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15538
15539   // Extract the RHS vectors
15540   SDValue RHS = Op.getOperand(1);
15541   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15542   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15543
15544   // Issue the operation on the smaller types and concatenate the result back
15545   MVT EltVT = VT.getVectorElementType();
15546   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15547   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15548                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15549                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15550 }
15551
      /// Lower an integer vector SETCC whose result has i1 elements (asserted
      /// below, together with operand elements of at least 8 bits).
      ///
      /// SETEQ becomes X86ISD::PCMPEQM and SETGT/SETLT become X86ISD::PCMPGTM
      /// (SETLT with swapped operands). Every other accepted condition becomes
      /// X86ISD::CMPM, or X86ISD::CMPMU for the unsigned conditions, carrying the
      /// predicate as an i8 immediate. Condition codes not listed in the switch
      /// hit llvm_unreachable. \p Subtarget is not referenced in the body.
15552 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15553                                      const X86Subtarget *Subtarget) {
15554   SDValue Op0 = Op.getOperand(0);
15555   SDValue Op1 = Op.getOperand(1);
15556   SDValue CC = Op.getOperand(2);
15557   MVT VT = Op.getSimpleValueType();
15558   SDLoc dl(Op);
15559
15560   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15561          Op.getValueType().getScalarType() == MVT::i1 &&
15562          "Cannot set masked compare for this operation");
15563
15564   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
        // Opc stays 0 unless a dedicated PCMPEQM/PCMPGTM node is chosen; SSECC is
        // only assigned (and only read) on the paths that leave Opc at 0.
15565   unsigned  Opc = 0;
15566   bool Unsigned = false;
15567   bool Swap = false;
15568   unsigned SSECC;
15569   switch (SetCCOpcode) {
15570   default: llvm_unreachable("Unexpected SETCC condition");
15571   case ISD::SETNE:  SSECC = 4; break;
15572   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15573   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15574   case ISD::SETLT:  Swap = true; //fall-through
15575   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15576   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15577   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15578   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15579   case ISD::SETULE: Unsigned = true; //fall-through
15580   case ISD::SETLE:  SSECC = 2; break;
15581   }
15582
15583   if (Swap)
15584     std::swap(Op0, Op1);
15585   if (Opc)
15586     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15587   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15588   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15589                      DAG.getConstant(SSECC, MVT::i8));
15590 }
15591
15592 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15593 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15594 /// return an empty value.
      ///
      /// Succeeds only when \p Op1 is a BUILD_VECTOR whose operands are all
      /// non-opaque, non-zero ConstantSDNodes of the vector's element type; the
      /// result is a new BUILD_VECTOR with every element decremented by one
      /// (x u< C  <=>  x u<= C-1 when C != 0).
15595 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15596 {
15597   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15598   if (!BV)
15599     return SDValue();
15600
15601   MVT VT = Op1.getSimpleValueType();
        // NOTE(review): this local is named EVT, the same identifier other
        // functions in this file use as a type name; inside this function it
        // denotes the element type.
15602   MVT EVT = VT.getVectorElementType();
15603   unsigned n = VT.getVectorNumElements();
        // Despite the name, this collects the decremented (ULE) operand elements.
15604   SmallVector<SDValue, 8> ULTOp1;
15605
15606   for (unsigned i = 0; i < n; ++i) {
15607     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15608     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15609       return SDValue();
15610
15611     // Avoid underflow.
15612     APInt Val = Elt->getAPIntValue();
15613     if (Val == 0)
15614       return SDValue();
15615
15616     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15617   }
15618
15619   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15620 }
15621
      /// Lower a vector ISD::SETCC.
      ///
      /// Overview of the paths taken, in order:
      ///  * Floating-point operands: emit X86ISD::CMPP (or X86ISD::CMPM when the
      ///    subtarget has AVX-512 and the result has i1 elements) with the
      ///    predicate from translateX86FSETCC; SETUEQ/SETONE are built from two
      ///    compares joined by OR/AND.
      ///  * 256-bit integer compare without hasInt256(): split in two halves.
      ///  * AVX-512 subtargets: hand off to LowerIntVSETCC_AVX512, or compare in
      ///    the operand type and truncate for i1 results of 8/16-bit elements.
      ///  * Otherwise: build the compare out of PCMPEQ/PCMPGT, optionally with
      ///    operand swap, sign-bit flip and result inversion, or use the
      ///    UMIN/UMAX and SUBUS special cases; v2i64 compares are emulated on
      ///    v4i32 when the 64-bit PCMPGT/PCMPEQ forms are unavailable.
15622 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15623                            SelectionDAG &DAG) {
15624   SDValue Op0 = Op.getOperand(0);
15625   SDValue Op1 = Op.getOperand(1);
15626   SDValue CC = Op.getOperand(2);
15627   MVT VT = Op.getSimpleValueType();
15628   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15629   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15630   SDLoc dl(Op);
15631
15632   if (isFP) {
15633 #ifndef NDEBUG
15634     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15635     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15636 #endif
15637
          // May swap Op0/Op1 in place; 8 signals SETUEQ/SETONE.
15638     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15639     unsigned Opc = X86ISD::CMPP;
15640     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15641       assert(VT.getVectorNumElements() <= 16);
15642       Opc = X86ISD::CMPM;
15643     }
15644     // In the two special cases we can't handle, emit two comparisons.
15645     if (SSECC == 8) {
15646       unsigned CC0, CC1;
15647       unsigned CombineOpc;
15648       if (SetCCOpcode == ISD::SETUEQ) {
              // UNORD (3) | EQ (0)
15649         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15650       } else {
15651         assert(SetCCOpcode == ISD::SETONE);
              // ORD (7) & NEQ (4)
15652         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15653       }
15654
15655       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15656                                  DAG.getConstant(CC0, MVT::i8));
15657       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15658                                  DAG.getConstant(CC1, MVT::i8));
15659       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15660     }
15661     // Handle all other FP comparisons here.
15662     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15663                        DAG.getConstant(SSECC, MVT::i8));
15664   }
15665
15666   // Break 256-bit integer vector compare into smaller ones.
15667   if (VT.is256BitVector() && !Subtarget->hasInt256())
15668     return Lower256IntVSETCC(Op, DAG);
15669
15670   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15671   EVT OpVT = Op1.getValueType();
15672   if (Subtarget->hasAVX512()) {
15673     if (Op1.getValueType().is512BitVector() ||
15674         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15675         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15676       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15677
15678     // In AVX-512 architecture setcc returns mask with i1 elements,
15679     // But there is no compare instruction for i8 and i16 elements in KNL.
15680     // We are not talking about 512-bit operands in this case, these
15681     // types are illegal.
          // Compare in the operand vector type, then truncate to the mask type.
15682     if (MaskResult &&
15683         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15684          OpVT.getVectorElementType().getSizeInBits() >= 8))
15685       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15686                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15687   }
15688
15689   // We are handling one of the integer comparisons here.  Since SSE only has
15690   // GT and EQ comparisons for integer, swapping operands and multiple
15691   // operations may be required for some comparisons.
15692   unsigned Opc;
15693   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15694   bool Subus = false;
15695
        // Each first case of a pair below sets a flag and deliberately falls
        // through into the second case.
15696   switch (SetCCOpcode) {
15697   default: llvm_unreachable("Unexpected SETCC condition");
15698   case ISD::SETNE:  Invert = true; // Fallthrough
15699   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15700   case ISD::SETLT:  Swap = true; // Fallthrough
15701   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15702   case ISD::SETGE:  Swap = true; // Fallthrough
15703   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15704                     Invert = true; break;
15705   case ISD::SETULT: Swap = true; // Fallthrough
15706   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15707                     FlipSigns = true; break;
15708   case ISD::SETUGE: Swap = true; // Fallthrough
15709   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15710                     FlipSigns = true; Invert = true; break;
15711   }
15712
15713   // Special case: Use min/max operations for SETULE/SETUGE
15714   MVT VET = VT.getVectorElementType();
15715   bool hasMinMax =
15716        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15717     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15718
15719   if (hasMinMax) {
15720     switch (SetCCOpcode) {
15721     default: break;
15722     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15723     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15724     }
15725
          // The min/max form needs none of the adjustments chosen above.
15726     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15727   }
15728
15729   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15730   if (!MinMax && hasSubus) {
15731     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15732     // Op0 u<= Op1:
15733     //   t = psubus Op0, Op1
15734     //   pcmpeq t, <0..0>
15735     switch (SetCCOpcode) {
15736     default: break;
15737     case ISD::SETULT: {
15738       // If the comparison is against a constant we can turn this into a
15739       // setule.  With psubus, setule does not require a swap.  This is
15740       // beneficial because the constant in the register is no longer
15741       // destructed as the destination so it can be hoisted out of a loop.
15742       // Only do this pre-AVX since vpcmp* is no longer destructive.
15743       if (Subtarget->hasAVX())
15744         break;
15745       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15746       if (ULEOp1.getNode()) {
15747         Op1 = ULEOp1;
15748         Subus = true; Invert = false; Swap = false;
15749       }
15750       break;
15751     }
15752     // Psubus is better than flip-sign because it requires no inversion.
15753     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15754     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15755     }
15756
15757     if (Subus) {
15758       Opc = X86ISD::SUBUS;
15759       FlipSigns = false;
15760     }
15761   }
15762
15763   if (Swap)
15764     std::swap(Op0, Op1);
15765
15766   // Check that the operation in question is available (most are plain SSE2,
15767   // but PCMPGTQ and PCMPEQQ have different requirements).
15768   if (VT == MVT::v2i64) {
15769     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15770       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15771
15772       // First cast everything to the right type.
15773       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15774       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15775
15776       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15777       // bits of the inputs before performing those operations. The lower
15778       // compare is always unsigned.
15779       SDValue SB;
15780       if (FlipSigns) {
15781         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15782       } else {
              // Signed 64-bit compare: flip only elements 0 and 2, the ones the
              // MaskLo shuffle below treats as the low halves.
15783         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15784         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15785         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15786                          Sign, Zero, Sign, Zero);
15787       }
15788       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15789       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15790
15791       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15792       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15793       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15794
15795       // Create masks for only the low parts/high parts of the 64 bit integers.
15796       static const int MaskHi[] = { 1, 1, 3, 3 };
15797       static const int MaskLo[] = { 0, 0, 2, 2 };
15798       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15799       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15800       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15801
15802       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15803       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15804
15805       if (Invert)
15806         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15807
15808       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15809     }
15810
15811     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15812       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15813       // pcmpeqd + pshufd + pand.
15814       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15815
15816       // First cast everything to the right type.
15817       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15818       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15819
15820       // Do the compare.
15821       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15822
15823       // Make sure the lower and upper halves are both all-ones.
15824       static const int Mask[] = { 1, 0, 3, 2 };
15825       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15826       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15827
15828       if (Invert)
15829         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15830
15831       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15832     }
15833   }
15834
15835   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15836   // bits of the inputs before performing those operations.
15837   if (FlipSigns) {
15838     EVT EltVT = VT.getVectorElementType();
15839     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15840     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15841     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15842   }
15843
15844   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15845
15846   // If the logical-not of the result is required, perform that now.
15847   if (Invert)
15848     Result = DAG.getNOT(dl, Result, VT);
15849
        // Op0 u<= Op1 iff umin(Op0, Op1) == Op0 (and likewise u>= with umax), so
        // finish the min/max form by comparing the result against Op0.
15850   if (MinMax)
15851     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15852
        // The saturating subtract is zero exactly when Op0 u<= Op1, so finish
        // the SUBUS form by comparing against the zero vector.
15853   if (Subus)
15854     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15855                          getZeroVector(VT, Subtarget, DAG, dl));
15856
15857   return Result;
15858 }
15859
      /// Lower an ISD::SETCC node.
      ///
      /// Vector results are delegated to LowerVSETCC. For scalars the function
      /// tries, in order: a BT-based form for "(and ...) ==/!= 0", reuse or
      /// inversion of an existing X86ISD::SETCC compared against 0 or 1, and a
      /// rewrite of i1 "x ==/!= 1" into the inverse compare against 0. Failing
      /// those it emits a compare via EmitCmp plus an i8 X86ISD::SETCC, truncated
      /// to i1 when the result type is i1. An empty SDValue is returned when
      /// TranslateX86CC yields X86::COND_INVALID.
15860 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15861
15862   MVT VT = Op.getSimpleValueType();
15863
15864   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15865
15866   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15867          && "SetCC type must be 8-bit or 1-bit integer");
15868   SDValue Op0 = Op.getOperand(0);
15869   SDValue Op1 = Op.getOperand(1);
15870   SDLoc dl(Op);
15871   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15872
15873   // Optimize to BT if possible.
15874   // Lower (X & (1 << N)) == 0 to BT(X, N).
15875   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15876   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
        // NOTE(review): LowerToBT as written in this file only matches ISD::SRL
        // for the shift-right shape; the ">>s" line presumably relies on an
        // earlier transform having produced a logical shift -- confirm.
15877   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15878       Op1.getOpcode() == ISD::Constant &&
15879       cast<ConstantSDNode>(Op1)->isNullValue() &&
15880       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15881     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15882     if (NewSetCC.getNode()) {
15883       if (VT == MVT::i1)
15884         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15885       return NewSetCC;
15886     }
15887   }
15888
15889   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15890   // these.
15891   if (Op1.getOpcode() == ISD::Constant &&
15892       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15893        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15894       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15895
15896     // If the input is a setcc, then reuse the input setcc or use a new one with
15897     // the inverted condition.
15898     if (Op0.getOpcode() == X86ISD::SETCC) {
15899       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
            // Invert is false for "setcc != 0" and "setcc == 1", true for
            // "setcc == 0" and "setcc != 1".
15900       bool Invert = (CC == ISD::SETNE) ^
15901         cast<ConstantSDNode>(Op1)->isNullValue();
15902       if (!Invert)
15903         return Op0;
15904
15905       CCode = X86::GetOppositeBranchCondition(CCode);
15906       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15907                                   DAG.getConstant(CCode, MVT::i8),
15908                                   Op0.getOperand(1));
15909       if (VT == MVT::i1)
15910         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15911       return SetCC;
15912     }
15913   }
        // For an i1 left operand, rewrite "x ==/!= 1" as the inverse condition
        // against the i1 constant 0.
15914   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15915       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15916       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15917
15918     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15919     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15920   }
15921
        // General path. Op0/Op1 are handed to TranslateX86CC, whose result also
        // decides whether lowering is possible at all.
15922   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15923   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15924   if (X86CC == X86::COND_INVALID)
15925     return SDValue();
15926
15927   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15928   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15929   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15930                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15931   if (VT == MVT::i1)
15932     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15933   return SetCC;
15934 }
15935
15936 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
      // That is: any result of a CMP, COMI, UCOMI or SAHF node; result number 1 of
      // the arithmetic/logic nodes listed below; or result number 2 of a UMUL node.
15937 static bool isX86LogicalCmp(SDValue Op) {
15938   unsigned Opc = Op.getNode()->getOpcode();
15939   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15940       Opc == X86ISD::SAHF)
15941     return true;
15942   if (Op.getResNo() == 1 &&
15943       (Opc == X86ISD::ADD ||
15944        Opc == X86ISD::SUB ||
15945        Opc == X86ISD::ADC ||
15946        Opc == X86ISD::SBB ||
15947        Opc == X86ISD::SMUL ||
15948        Opc == X86ISD::UMUL ||
15949        Opc == X86ISD::INC ||
15950        Opc == X86ISD::DEC ||
15951        Opc == X86ISD::OR ||
15952        Opc == X86ISD::XOR ||
15953        Opc == X86ISD::AND))
15954     return true;
15955
        // UMUL is additionally accepted through its result number 2.
15956   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15957     return true;
15958
15959   return false;
15960 }
15961
      /// Return true if \p V is an ISD::TRUNCATE whose input is known to be zero
      /// in every bit the truncation discards (the top InBits - Bits bits of the
      /// input, as reported by SelectionDAG::MaskedValueIsZero).
15962 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15963   if (V.getOpcode() != ISD::TRUNCATE)
15964     return false;
15965
15966   SDValue VOp0 = V.getOperand(0);
15967   unsigned InBits = VOp0.getValueSizeInBits();
15968   unsigned Bits = V.getValueSizeInBits();
15969   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15970 }
15971
15972 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15973   bool addTest = true;
15974   SDValue Cond  = Op.getOperand(0);
15975   SDValue Op1 = Op.getOperand(1);
15976   SDValue Op2 = Op.getOperand(2);
15977   SDLoc DL(Op);
15978   EVT VT = Op1.getValueType();
15979   SDValue CC;
15980
15981   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15982   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15983   // sequence later on.
15984   if (Cond.getOpcode() == ISD::SETCC &&
15985       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15986        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15987       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15988     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15989     int SSECC = translateX86FSETCC(
15990         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15991
15992     if (SSECC != 8) {
15993       if (Subtarget->hasAVX512()) {
15994         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15995                                   DAG.getConstant(SSECC, MVT::i8));
15996         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15997       }
15998       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15999                                 DAG.getConstant(SSECC, MVT::i8));
16000       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16001       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16002       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16003     }
16004   }
16005
16006   if (Cond.getOpcode() == ISD::SETCC) {
16007     SDValue NewCond = LowerSETCC(Cond, DAG);
16008     if (NewCond.getNode())
16009       Cond = NewCond;
16010   }
16011
16012   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16013   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16014   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16015   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16016   if (Cond.getOpcode() == X86ISD::SETCC &&
16017       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16018       isZero(Cond.getOperand(1).getOperand(1))) {
16019     SDValue Cmp = Cond.getOperand(1);
16020
16021     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16022
16023     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16024         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16025       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16026
16027       SDValue CmpOp0 = Cmp.getOperand(0);
16028       // Apply further optimizations for special cases
16029       // (select (x != 0), -1, 0) -> neg & sbb
16030       // (select (x == 0), 0, -1) -> neg & sbb
16031       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16032         if (YC->isNullValue() &&
16033             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16034           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16035           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16036                                     DAG.getConstant(0, CmpOp0.getValueType()),
16037                                     CmpOp0);
16038           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16039                                     DAG.getConstant(X86::COND_B, MVT::i8),
16040                                     SDValue(Neg.getNode(), 1));
16041           return Res;
16042         }
16043
16044       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16045                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16046       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16047
16048       SDValue Res =   // Res = 0 or -1.
16049         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16050                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16051
16052       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16053         Res = DAG.getNOT(DL, Res, Res.getValueType());
16054
16055       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16056       if (!N2C || !N2C->isNullValue())
16057         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16058       return Res;
16059     }
16060   }
16061
16062   // Look past (and (setcc_carry (cmp ...)), 1).
16063   if (Cond.getOpcode() == ISD::AND &&
16064       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16065     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16066     if (C && C->getAPIntValue() == 1)
16067       Cond = Cond.getOperand(0);
16068   }
16069
16070   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16071   // setting operand in place of the X86ISD::SETCC.
16072   unsigned CondOpcode = Cond.getOpcode();
16073   if (CondOpcode == X86ISD::SETCC ||
16074       CondOpcode == X86ISD::SETCC_CARRY) {
16075     CC = Cond.getOperand(0);
16076
16077     SDValue Cmp = Cond.getOperand(1);
16078     unsigned Opc = Cmp.getOpcode();
16079     MVT VT = Op.getSimpleValueType();
16080
16081     bool IllegalFPCMov = false;
16082     if (VT.isFloatingPoint() && !VT.isVector() &&
16083         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16084       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16085
16086     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16087         Opc == X86ISD::BT) { // FIXME
16088       Cond = Cmp;
16089       addTest = false;
16090     }
16091   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16092              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16093              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16094               Cond.getOperand(0).getValueType() != MVT::i8)) {
16095     SDValue LHS = Cond.getOperand(0);
16096     SDValue RHS = Cond.getOperand(1);
16097     unsigned X86Opcode;
16098     unsigned X86Cond;
16099     SDVTList VTs;
16100     switch (CondOpcode) {
16101     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16102     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16103     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16104     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16105     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16106     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16107     default: llvm_unreachable("unexpected overflowing operator");
16108     }
16109     if (CondOpcode == ISD::UMULO)
16110       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16111                           MVT::i32);
16112     else
16113       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16114
16115     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16116
16117     if (CondOpcode == ISD::UMULO)
16118       Cond = X86Op.getValue(2);
16119     else
16120       Cond = X86Op.getValue(1);
16121
16122     CC = DAG.getConstant(X86Cond, MVT::i8);
16123     addTest = false;
16124   }
16125
16126   if (addTest) {
16127     // Look pass the truncate if the high bits are known zero.
16128     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16129         Cond = Cond.getOperand(0);
16130
16131     // We know the result of AND is compared against zero. Try to match
16132     // it to BT.
16133     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16134       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16135       if (NewSetCC.getNode()) {
16136         CC = NewSetCC.getOperand(0);
16137         Cond = NewSetCC.getOperand(1);
16138         addTest = false;
16139       }
16140     }
16141   }
16142
16143   if (addTest) {
16144     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16145     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16146   }
16147
16148   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16149   // a <  b ?  0 : -1 -> RES = setcc_carry
16150   // a >= b ? -1 :  0 -> RES = setcc_carry
16151   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16152   if (Cond.getOpcode() == X86ISD::SUB) {
16153     Cond = ConvertCmpIfNecessary(Cond, DAG);
16154     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16155
16156     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16157         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16158       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16159                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16160       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16161         return DAG.getNOT(DL, Res, Res.getValueType());
16162       return Res;
16163     }
16164   }
16165
16166   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16167   // widen the cmov and push the truncate through. This avoids introducing a new
16168   // branch during isel and doesn't add any extensions.
16169   if (Op.getValueType() == MVT::i8 &&
16170       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16171     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16172     if (T1.getValueType() == T2.getValueType() &&
16173         // Blacklist CopyFromReg to avoid partial register stalls.
16174         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16175       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16176       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16177       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16178     }
16179   }
16180
16181   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16182   // condition is true.
16183   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16184   SDValue Ops[] = { Op2, Op1, CC, Cond };
16185   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16186 }
16187
      // LowerSIGN_EXTEND_AVX512 - Lower a vector sign-extend node for the cases
      // that LowerSIGN_EXTEND forwards here (a 512-bit result, or an input whose
      // elements are i1). Returns the lowered value, or an empty SDValue when no
      // strategy below applies (element count other than 8 or 16 and no direct
      // VSEXT match).
16188 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16189                                        SelectionDAG &DAG) {
16190   MVT VT = Op->getSimpleValueType(0);
16191   SDValue In = Op->getOperand(0);
16192   MVT InVT = In.getSimpleValueType();
16193   MVT VTElt = VT.getVectorElementType();
16194   MVT InVTElt = InVT.getVectorElementType();
16195   SDLoc dl(Op);
16196
16197   // SKX processor
        // An i1-element input can be extended with a single VSEXT when the
        // subtarget has the matching feature set: the BWI clauses cover result
        // elements of at most 16 bits, the DQI clauses cover result elements of
        // at least 32 bits, and results of 256 bits or less additionally need VLX.
16198   if ((InVTElt == MVT::i1) &&
16199       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16200         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16201
16202        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16203         VTElt.getSizeInBits() <= 16)) ||
16204
16205        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16206         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16207
16208        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16209         VTElt.getSizeInBits() >= 32))))
16210     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16211
16212   unsigned int NumElts = VT.getVectorNumElements();
16213
        // The remaining strategies only handle 8- and 16-element vectors.
16214   if (NumElts != 8 && NumElts != 16)
16215     return SDValue();
16216
        // 512-bit result from a non-i1 input: emit VSEXT. If the input is itself
        // a VSEXT/VZEXT node, reapply that same opcode directly to its operand so
        // only one extension node remains.
16217   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16218     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16219       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16220     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16221   }
16222
16223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16224   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16225
        // i1 input: load an all-ones scalar of the wide element type from the
        // constant pool and hand it, together with the i1 vector, to
        // VBROADCASTM to build a v8i64 / v16i32 value.
16226   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16227   Constant *C = ConstantInt::get(*DAG.getContext(),
16228     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16229
16230   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16231   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16232   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16233                           MachinePointerInfo::getConstantPool(),
16234                           false, false, false, Alignment);
16235   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
        // A 512-bit request is satisfied by the broadcast itself; anything
        // narrower is obtained by truncating the wide result to VT.
16236   if (VT.is512BitVector())
16237     return Brcst;
16238   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16239 }
16240
      // LowerSIGN_EXTEND - Custom lowering for a vector sign-extend node.
      // 512-bit results and i1-element inputs are delegated to
      // LowerSIGN_EXTEND_AVX512. Otherwise only v4i32 -> v4i64, v8i16 -> v8i32
      // and v16i8 -> v16i16 are handled; any other type pair returns an empty
      // SDValue.
16241 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16242                                 SelectionDAG &DAG) {
16243   MVT VT = Op->getSimpleValueType(0);
16244   SDValue In = Op->getOperand(0);
16245   MVT InVT = In.getSimpleValueType();
16246   SDLoc dl(Op);
16247
16248   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16249     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16250
        // Bail out unless (VT, InVT) is one of the three supported pairs.
16251   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16252       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16253       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16254     return SDValue();
16255
        // With 256-bit integer support a single VSEXT node is enough.
16256   if (Subtarget->hasInt256())
16257     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16258
16259   // Optimize vectors in AVX mode
16260   // Sign extend  v8i16 to v8i32 and
16261   //              v4i32 to v4i64
        // (the type check above also lets v16i8 -> v16i16 reach this path)
16262   //
16263   // Divide input vector into two parts
16264   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16265   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16266   // concat the vectors to original VT
16267
16268   unsigned NumElems = InVT.getVectorNumElements();
16269   SDValue Undef = DAG.getUNDEF(InVT);
16270
        // Low half: elements [0, NumElems/2) stay in place, the rest are undef.
16271   SmallVector<int,8> ShufMask1(NumElems, -1);
16272   for (unsigned i = 0; i != NumElems/2; ++i)
16273     ShufMask1[i] = i;
16274
16275   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16276
        // High half: elements [NumElems/2, NumElems) are moved down to the
        // front, the rest are undef.
16277   SmallVector<int,8> ShufMask2(NumElems, -1);
16278   for (unsigned i = 0; i != NumElems/2; ++i)
16279     ShufMask2[i] = i + NumElems/2;
16280
16281   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16282
        // Each half is extended to a vector with the result element type and
        // half the result element count, then the two are concatenated.
16283   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16284                                 VT.getVectorNumElements()/2);
16285
16286   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16287   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16288
16289   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16290 }
16291
16292 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16293 // may emit an illegal shuffle but the expansion is still better than scalar
16294 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16295 // we'll emit a shuffle and an arithmetic shift.
16296 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16297 // TODO: It is possible to support ZExt by zeroing the undef values during
16298 // the shuffle phase or after the shuffle.
      //
      // Op must be a LoadSDNode producing an integer vector with extension kind
      // EXTLOAD or SEXTLOAD (both are asserted below). The function returns the
      // value that replaces the load's result and redirects users of the load's
      // chain (result 1) to the chain of the newly created load(s).
16299 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16300                                  SelectionDAG &DAG) {
16301   MVT RegVT = Op.getSimpleValueType();
16302   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16303   assert(RegVT.isInteger() &&
16304          "We only custom lower integer vector sext loads.");
16305
16306   // Nothing useful we can do without SSE2 shuffles.
16307   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16308
16309   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16310   SDLoc dl(Ld);
16311   EVT MemVT = Ld->getMemoryVT();
16312   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16313   unsigned RegSz = RegVT.getSizeInBits();
16314
16315   ISD::LoadExtType Ext = Ld->getExtensionType();
16316
16317   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16318          && "Only anyext and sext are currently implemented.");
16319   assert(MemVT != RegVT && "Cannot extend to the same type");
16320   assert(MemVT.isVector() && "Must load a vector from memory");
16321
16322   unsigned NumElems = RegVT.getVectorNumElements();
16323   unsigned MemSz = MemVT.getSizeInBits();
16324   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16325
16326   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16327     // The only way in which we have a legal 256-bit vector result but not the
16328     // integer 256-bit operations needed to directly lower a sextload is if we
16329     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16330     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16331     // correctly legalized. We do this late to allow the canonical form of
16332     // sextload to persist throughout the rest of the DAG combiner -- it wants
16333     // to fold together any extensions it can, and so will fuse a sign_extend
16334     // of an sextload into a sextload targeting a wider value.
16335     SDValue Load;
16336     if (MemSz == 128) {
16337       // Just switch this to a normal load.
16338       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16339                                        "it must be a legal 128-bit vector "
16340                                        "type!");
16341       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16342                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16343                   Ld->isInvariant(), Ld->getAlignment());
16344     } else {
16345       assert(MemSz < 128 &&
16346              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16347       // Do an sext load to a 128-bit vector type. We want to use the same
16348       // number of elements, but elements half as wide. This will end up being
16349       // recursively lowered by this routine, but will succeed as we definitely
16350       // have all the necessary features if we're using AVX1.
16351       EVT HalfEltVT =
16352           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16353       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16354       Load =
16355           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16356                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16357                          Ld->isNonTemporal(), Ld->isInvariant(),
16358                          Ld->getAlignment());
16359     }
16360
16361     // Replace chain users with the new chain.
16362     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16363     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16364
16365     // Finally, do a normal sign-extend to the desired register.
16366     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16367   }
16368
16369   // All sizes must be a power of two.
16370   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16371          "Non-power-of-two elements are not custom lowered!");
16372
16373   // Attempt to load the original value using scalar loads.
16374   // Find the largest scalar type that divides the total loaded size.
        // (The loop keeps the last legal integer type whose size divides MemSz.)
16375   MVT SclrLoadTy = MVT::i8;
16376   for (MVT Tp : MVT::integer_valuetypes()) {
16377     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16378       SclrLoadTy = Tp;
16379     }
16380   }
16381
16382   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16383   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16384       (64 <= MemSz))
16385     SclrLoadTy = MVT::f64;
16386
16387   // Calculate the number of scalar loads that we need to perform
16388   // in order to load our vector from memory.
16389   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16390
16391   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16392          "Can only lower sext loads with a single scalar load!");
16393
        // For a 256-bit SEXTLOAD (only reachable here when hasInt256() is true,
        // since the other case returned above) the intermediate vector that
        // holds the loaded data is built at half the register size.
16394   unsigned loadRegZize = RegSz;
16395   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16396     loadRegZize /= 2;
16397
16398   // Represent our vector as a sequence of elements which are the
16399   // largest scalar that we can load.
16400   EVT LoadUnitVecVT = EVT::getVectorVT(
16401       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16402
16403   // Represent the data using the same element type that is stored in
16404   // memory. In practice, we ''widen'' MemVT.
16405   EVT WideVecVT =
16406       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16407                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16408
16409   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16410          "Invalid vector type");
16411
16412   // We can't shuffle using an illegal type.
16413   assert(TLI.isTypeLegal(WideVecVT) &&
16414          "We only lower types that form legal widened vector types");
16415
16416   SmallVector<SDValue, 8> Chains;
16417   SDValue Ptr = Ld->getBasePtr();
        // Byte stride between consecutive scalar loads.
16418   SDValue Increment =
16419       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16420   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16421
16422   for (unsigned i = 0; i < NumLoads; ++i) {
16423     // Perform a single load.
          // NOTE(review): every iteration passes the unmodified
          // Ld->getPointerInfo() and Ld->getAlignment() even though Ptr has been
          // advanced by Increment for i > 0 -- confirm whether the pointer info
          // needs an offset and the alignment needs to be reduced here.
16424     SDValue ScalarLoad =
16425         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16426                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16427                     Ld->getAlignment());
16428     Chains.push_back(ScalarLoad.getValue(1));
16429     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16430     // another round of DAGCombining.
16431     if (i == 0)
16432       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16433     else
16434       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16435                         ScalarLoad, DAG.getIntPtrConstant(i));
16436
16437     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16438   }
16439
        // Merge the chains of all scalar loads; this token factor replaces the
        // original load's chain result on every return path below.
16440   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16441
16442   // Bitcast the loaded value to a vector of the original element type, in
16443   // the size of the target vector type.
16444   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16445   unsigned SizeRatio = RegSz / MemSz;
16446
16447   if (Ext == ISD::SEXTLOAD) {
16448     // If we have SSE4.1, we can directly emit a VSEXT node.
16449     if (Subtarget->hasSSE41()) {
16450       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16451       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16452       return Sext;
16453     }
16454
16455     // Otherwise we'll shuffle the small elements in the high bits of the
16456     // larger type and perform an arithmetic shift. If the shift is not legal
16457     // it's better to scalarize.
16458     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16459            "We can't implement a sext load without an arithmetic right shift!");
16460
16461     // Redistribute the loaded elements into the different locations.
          // Source element i goes to the last narrow slot of the i-th group of
          // SizeRatio slots; all other slots stay undef.
16462     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16463     for (unsigned i = 0; i != NumElems; ++i)
16464       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16465
16466     SDValue Shuff = DAG.getVectorShuffle(
16467         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16468
16469     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16470
16471     // Build the arithmetic shift.
          // The shift amount is the difference between the register and memory
          // element widths.
16472     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16473                    MemVT.getVectorElementType().getSizeInBits();
16474     Shuff =
16475         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16476
16477     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16478     return Shuff;
16479   }
16480
16481   // Redistribute the loaded elements into the different locations.
        // EXTLOAD case: source element i goes to the first narrow slot of the
        // i-th group of SizeRatio slots; the remaining slots are left undef.
16482   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16483   for (unsigned i = 0; i != NumElems; ++i)
16484     ShuffleVec[i * SizeRatio] = i;
16485
16486   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16487                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16488
16489   // Bitcast to the requested type.
16490   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16491   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16492   return Shuff;
16493 }
16494
16495 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
16496 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16497 // from the AND / OR.
      // Opc is an output parameter: it is always overwritten with Op's opcode,
      // including when the function returns false.
16498 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16499   Opc = Op.getOpcode();
16500   if (Opc != ISD::OR && Opc != ISD::AND)
16501     return false;
        // Both operands must be single-use X86ISD::SETCC nodes.
16502   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16503           Op.getOperand(0).hasOneUse() &&
16504           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16505           Op.getOperand(1).hasOneUse());
16506 }
16507
16508 // isXor1OfSetCC - Return true if node is an ISD::XOR whose first operand is
16509 // a single-use X86ISD::SETCC and whose second operand is the constant 1.
16510 static bool isXor1OfSetCC(SDValue Op) {
16511   if (Op.getOpcode() != ISD::XOR)
16512     return false;
        // Only the second operand is inspected for the constant 1.
16513   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16514   if (N1C && N1C->getAPIntValue() == 1) {
16515     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16516       Op.getOperand(0).hasOneUse();
16517   }
16518   return false;
16519 }
16520
      // LowerBRCOND - Lower a conditional branch node into X86ISD::BRCOND.
      // Operand 0 of Op is the chain, operand 1 the condition and operand 2 the
      // destination. The routine tries to branch directly on the flags of an
      // existing node (an X86 setcc/setcc_carry, an arithmetic-with-overflow
      // node, an AND/OR/XOR of setccs, an FP SETOEQ/SETUNE compare, or a BT
      // formed from an AND). When nothing matches (addTest is still true) it
      // emits an explicit test of the condition value via EmitTest.
16521 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16522   bool addTest = true;
16523   SDValue Chain = Op.getOperand(0);
16524   SDValue Cond  = Op.getOperand(1);
16525   SDValue Dest  = Op.getOperand(2);
16526   SDLoc dl(Op);
16527   SDValue CC;
        // Set when the condition is "overflow result == 0"; the X86 condition
        // code chosen later is then flipped.
16528   bool Inverted = false;
16529
16530   if (Cond.getOpcode() == ISD::SETCC) {
16531     // Check for setcc([su]{add,sub,mul}o == 0).
16532     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16533         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16534         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16535         Cond.getOperand(0).getResNo() == 1 &&
16536         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16537          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16538          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16539          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16540          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16541          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16542       Inverted = true;
16543       Cond = Cond.getOperand(0);
16544     } else {
            // Any other setcc: use LowerSETCC's result if it produced one.
16545       SDValue NewCond = LowerSETCC(Cond, DAG);
16546       if (NewCond.getNode())
16547         Cond = NewCond;
16548     }
16549   }
16550 #if 0
16551   // FIXME: LowerXALUO doesn't handle these!!
16552   else if (Cond.getOpcode() == X86ISD::ADD  ||
16553            Cond.getOpcode() == X86ISD::SUB  ||
16554            Cond.getOpcode() == X86ISD::SMUL ||
16555            Cond.getOpcode() == X86ISD::UMUL)
16556     Cond = LowerXALUO(Cond, DAG);
16557 #endif
16558
16559   // Look past (and (setcc_carry (cmp ...)), 1).
16560   if (Cond.getOpcode() == ISD::AND &&
16561       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16562     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16563     if (C && C->getAPIntValue() == 1)
16564       Cond = Cond.getOperand(0);
16565   }
16566
16567   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16568   // setting operand in place of the X86ISD::SETCC.
16569   unsigned CondOpcode = Cond.getOpcode();
16570   if (CondOpcode == X86ISD::SETCC ||
16571       CondOpcode == X86ISD::SETCC_CARRY) {
16572     CC = Cond.getOperand(0);
16573
16574     SDValue Cmp = Cond.getOperand(1);
16575     unsigned Opc = Cmp.getOpcode();
16576     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16577     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16578       Cond = Cmp;
16579       addTest = false;
16580     } else {
16581       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16582       default: break;
16583       case X86::COND_O:
16584       case X86::COND_B:
16585         // These can only come from an arithmetic instruction with overflow,
16586         // e.g. SADDO, UADDO.
16587         Cond = Cond.getNode()->getOperand(1);
16588         addTest = false;
16589         break;
16590       }
16591     }
16592   }
        // Cond may have been replaced above, so re-read its opcode.
16593   CondOpcode = Cond.getOpcode();
16594   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16595       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16596       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16597        Cond.getOperand(0).getValueType() != MVT::i8)) {
16598     SDValue LHS = Cond.getOperand(0);
16599     SDValue RHS = Cond.getOperand(1);
16600     unsigned X86Opcode;
16601     unsigned X86Cond;
16602     SDVTList VTs;
16603     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16604     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16605     // X86ISD::INC).
16606     switch (CondOpcode) {
16607     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16608     case ISD::SADDO:
16609       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16610         if (C->isOne()) {
16611           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16612           break;
16613         }
16614       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16615     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16616     case ISD::SSUBO:
16617       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16618         if (C->isOne()) {
16619           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16620           break;
16621         }
16622       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16623     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16624     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16625     default: llvm_unreachable("unexpected overflowing operator");
16626     }
16627     if (Inverted)
16628       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
          // The UMUL node is created with two value results plus the i32 flags
          // result, so its flags are result 2; the other nodes have one value
          // result and their flags are result 1.
16629     if (CondOpcode == ISD::UMULO)
16630       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16631                           MVT::i32);
16632     else
16633       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16634
16635     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16636
16637     if (CondOpcode == ISD::UMULO)
16638       Cond = X86Op.getValue(2);
16639     else
16640       Cond = X86Op.getValue(1);
16641
16642     CC = DAG.getConstant(X86Cond, MVT::i8);
16643     addTest = false;
16644   } else {
16645     unsigned CondOpc;
16646     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16647       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16648       if (CondOpc == ISD::OR) {
16649         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16650         // two branches instead of an explicit OR instruction with a
16651         // separate test.
16652         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16653             isX86LogicalCmp(Cmp)) {
                // Branch to Dest on the first setcc's condition here; the
                // final BRCOND at the end of the function branches on the
                // second one.
16654           CC = Cond.getOperand(0).getOperand(0);
16655           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16656                               Chain, Dest, CC, Cmp);
16657           CC = Cond.getOperand(1).getOperand(0);
16658           Cond = Cmp;
16659           addTest = false;
16660         }
16661       } else { // ISD::AND
16662         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16663         // two branches instead of an explicit AND instruction with a
16664         // separate test. However, we only do this if this block doesn't
16665         // have a fall-through edge, because this requires an explicit
16666         // jmp when the condition is false.
16667         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16668             isX86LogicalCmp(Cmp) &&
16669             Op.getNode()->hasOneUse()) {
16670           X86::CondCode CCode =
16671             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16672           CCode = X86::GetOppositeBranchCondition(CCode);
16673           CC = DAG.getConstant(CCode, MVT::i8);
16674           SDNode *User = *Op.getNode()->use_begin();
16675           // Look for an unconditional branch following this conditional branch.
16676           // We need this because we need to reverse the successors in order
16677           // to implement FCMP_OEQ.
16678           if (User->getOpcode() == ISD::BR) {
                  // Swap targets: the unconditional branch now goes to the
                  // original Dest, and both conditional branches (with the
                  // opposite conditions) go to the old false block.
16679             SDValue FalseBB = User->getOperand(1);
16680             SDNode *NewBR =
16681               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16682             assert(NewBR == User);
16683             (void)NewBR;
16684             Dest = FalseBB;
16685
16686             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16687                                 Chain, Dest, CC, Cmp);
                  // This inner CCode intentionally shadows the outer one; it
                  // holds the opposite of the second setcc's condition.
16688             X86::CondCode CCode =
16689               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16690             CCode = X86::GetOppositeBranchCondition(CCode);
16691             CC = DAG.getConstant(CCode, MVT::i8);
16692             Cond = Cmp;
16693             addTest = false;
16694           }
16695         }
16696       }
16697     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16698       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16699       // It should be transformed during dag combiner except when the condition
16700       // is set by an arithmetic-with-overflow node.
16701       X86::CondCode CCode =
16702         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16703       CCode = X86::GetOppositeBranchCondition(CCode);
16704       CC = DAG.getConstant(CCode, MVT::i8);
16705       Cond = Cond.getOperand(0).getOperand(1);
16706       addTest = false;
16707     } else if (Cond.getOpcode() == ISD::SETCC &&
16708                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16709       // For FCMP_OEQ, we can emit
16710       // two branches instead of an explicit AND instruction with a
16711       // separate test. However, we only do this if this block doesn't
16712       // have a fall-through edge, because this requires an explicit
16713       // jmp when the condition is false.
16714       if (Op.getNode()->hasOneUse()) {
16715         SDNode *User = *Op.getNode()->use_begin();
16716         // Look for an unconditional branch following this conditional branch.
16717         // We need this because we need to reverse the successors in order
16718         // to implement FCMP_OEQ.
16719         if (User->getOpcode() == ISD::BR) {
16720           SDValue FalseBB = User->getOperand(1);
16721           SDNode *NewBR =
16722             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16723           assert(NewBR == User);
16724           (void)NewBR;
16725           Dest = FalseBB;
16726
                // Both conditional branches (COND_NE here, COND_P in the final
                // BRCOND) target the old false block.
16727           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16728                                     Cond.getOperand(0), Cond.getOperand(1));
16729           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16730           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16731           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16732                               Chain, Dest, CC, Cmp);
16733           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16734           Cond = Cmp;
16735           addTest = false;
16736         }
16737       }
16738     } else if (Cond.getOpcode() == ISD::SETCC &&
16739                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16740       // For FCMP_UNE, we can emit
16741       // two branches instead of an explicit OR instruction with a
16742       // separate test. However, we only do this if this block doesn't
16743       // have a fall-through edge, because this requires an explicit
16744       // jmp when the condition is false.
16745       if (Op.getNode()->hasOneUse()) {
16746         SDNode *User = *Op.getNode()->use_begin();
16747         // Look for an unconditional branch following this conditional branch.
16748         // We need this because we need to reverse the successors in order
16749         // to implement FCMP_UNE.
16750         if (User->getOpcode() == ISD::BR) {
16751           SDValue FalseBB = User->getOperand(1);
16752           SDNode *NewBR =
16753             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16754           assert(NewBR == User);
16755           (void)NewBR;
16756
                // Unlike the FCMP_OEQ case, Dest is switched to the old false
                // block only after the first branch is built: the COND_NE
                // branch targets the original Dest, and the final COND_NP
                // branch targets the old false block.
16757           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16758                                     Cond.getOperand(0), Cond.getOperand(1));
16759           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16760           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16761           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16762                               Chain, Dest, CC, Cmp);
16763           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16764           Cond = Cmp;
16765           addTest = false;
16766           Dest = FalseBB;
16767         }
16768       }
16769     }
16770   }
16771
16772   if (addTest) {
16773     // Look past the truncate if the high bits are known zero.
16774     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16775         Cond = Cond.getOperand(0);
16776
16777     // We know the result of AND is compared against zero. Try to match
16778     // it to BT.
16779     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16780       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16781       if (NewSetCC.getNode()) {
16782         CC = NewSetCC.getOperand(0);
16783         Cond = NewSetCC.getOperand(1);
16784         addTest = false;
16785       }
16786     }
16787   }
16788
        // Fallback: nothing above produced usable flags, so test the condition
        // value itself (COND_E instead of COND_NE when Inverted is set).
16789   if (addTest) {
16790     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16791     CC = DAG.getConstant(X86Cond, MVT::i8);
16792     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16793   }
16794   Cond = ConvertCmpIfNecessary(Cond, DAG);
16795   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16796                      Chain, Dest, CC, Cond);
16797 }
16798
16799 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16800 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16801 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16802 // that the guard pages used by the OS virtual memory manager are allocated in
16803 // correct sequence.
16804 SDValue
16805 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16806                                            SelectionDAG &DAG) const {
16807   MachineFunction &MF = DAG.getMachineFunction();
16808   bool SplitStack = MF.shouldSplitStack();
16809   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16810                SplitStack;
16811   SDLoc dl(Op);
16812
16813   if (!Lower) {
16814     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16815     SDNode* Node = Op.getNode();
16816
16817     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16818     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16819         " not tell us which reg is the stack pointer!");
16820     EVT VT = Node->getValueType(0);
16821     SDValue Tmp1 = SDValue(Node, 0);
16822     SDValue Tmp2 = SDValue(Node, 1);
16823     SDValue Tmp3 = Node->getOperand(2);
16824     SDValue Chain = Tmp1.getOperand(0);
16825
16826     // Chain the dynamic stack allocation so that it doesn't modify the stack
16827     // pointer when other instructions are using the stack.
16828     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16829         SDLoc(Node));
16830
16831     SDValue Size = Tmp2.getOperand(1);
16832     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16833     Chain = SP.getValue(1);
16834     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16835     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16836     unsigned StackAlign = TFI.getStackAlignment();
16837     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16838     if (Align > StackAlign)
16839       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16840           DAG.getConstant(-(uint64_t)Align, VT));
16841     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16842
16843     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16844         DAG.getIntPtrConstant(0, true), SDValue(),
16845         SDLoc(Node));
16846
16847     SDValue Ops[2] = { Tmp1, Tmp2 };
16848     return DAG.getMergeValues(Ops, dl);
16849   }
16850
16851   // Get the inputs.
16852   SDValue Chain = Op.getOperand(0);
16853   SDValue Size  = Op.getOperand(1);
16854   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16855   EVT VT = Op.getNode()->getValueType(0);
16856
16857   bool Is64Bit = Subtarget->is64Bit();
16858   EVT SPTy = getPointerTy();
16859
16860   if (SplitStack) {
16861     MachineRegisterInfo &MRI = MF.getRegInfo();
16862
16863     if (Is64Bit) {
16864       // The 64 bit implementation of segmented stacks needs to clobber both r10
16865       // r11. This makes it impossible to use it along with nested parameters.
16866       const Function *F = MF.getFunction();
16867
16868       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16869            I != E; ++I)
16870         if (I->hasNestAttr())
16871           report_fatal_error("Cannot use segmented stacks with functions that "
16872                              "have nested arguments.");
16873     }
16874
16875     const TargetRegisterClass *AddrRegClass =
16876       getRegClassFor(getPointerTy());
16877     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16878     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16879     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16880                                 DAG.getRegister(Vreg, SPTy));
16881     SDValue Ops1[2] = { Value, Chain };
16882     return DAG.getMergeValues(Ops1, dl);
16883   } else {
16884     SDValue Flag;
16885     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16886
16887     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16888     Flag = Chain.getValue(1);
16889     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16890
16891     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16892
16893     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16894     unsigned SPReg = RegInfo->getStackRegister();
16895     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16896     Chain = SP.getValue(1);
16897
16898     if (Align) {
16899       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16900                        DAG.getConstant(-(uint64_t)Align, VT));
16901       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16902     }
16903
16904     SDValue Ops1[2] = { SP, Chain };
16905     return DAG.getMergeValues(Ops1, dl);
16906   }
16907 }
16908
16909 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16910   MachineFunction &MF = DAG.getMachineFunction();
16911   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16912
16913   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16914   SDLoc DL(Op);
16915
16916   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16917     // vastart just stores the address of the VarArgsFrameIndex slot into the
16918     // memory location argument.
16919     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16920                                    getPointerTy());
16921     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16922                         MachinePointerInfo(SV), false, false, 0);
16923   }
16924
16925   // __va_list_tag:
16926   //   gp_offset         (0 - 6 * 8)
16927   //   fp_offset         (48 - 48 + 8 * 16)
16928   //   overflow_arg_area (point to parameters coming in memory).
16929   //   reg_save_area
16930   SmallVector<SDValue, 8> MemOps;
16931   SDValue FIN = Op.getOperand(1);
16932   // Store gp_offset
16933   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16934                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16935                                                MVT::i32),
16936                                FIN, MachinePointerInfo(SV), false, false, 0);
16937   MemOps.push_back(Store);
16938
16939   // Store fp_offset
16940   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16941                     FIN, DAG.getIntPtrConstant(4));
16942   Store = DAG.getStore(Op.getOperand(0), DL,
16943                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16944                                        MVT::i32),
16945                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16946   MemOps.push_back(Store);
16947
16948   // Store ptr to overflow_arg_area
16949   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16950                     FIN, DAG.getIntPtrConstant(4));
16951   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16952                                     getPointerTy());
16953   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16954                        MachinePointerInfo(SV, 8),
16955                        false, false, 0);
16956   MemOps.push_back(Store);
16957
16958   // Store ptr to reg_save_area.
16959   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16960                     FIN, DAG.getIntPtrConstant(8));
16961   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16962                                     getPointerTy());
16963   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16964                        MachinePointerInfo(SV, 16), false, false, 0);
16965   MemOps.push_back(Store);
16966   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16967 }
16968
16969 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16970   assert(Subtarget->is64Bit() &&
16971          "LowerVAARG only handles 64-bit va_arg!");
16972   assert((Subtarget->isTargetLinux() ||
16973           Subtarget->isTargetDarwin()) &&
16974           "Unhandled target in LowerVAARG");
16975   assert(Op.getNode()->getNumOperands() == 4);
16976   SDValue Chain = Op.getOperand(0);
16977   SDValue SrcPtr = Op.getOperand(1);
16978   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16979   unsigned Align = Op.getConstantOperandVal(3);
16980   SDLoc dl(Op);
16981
16982   EVT ArgVT = Op.getNode()->getValueType(0);
16983   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16984   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16985   uint8_t ArgMode;
16986
16987   // Decide which area this value should be read from.
16988   // TODO: Implement the AMD64 ABI in its entirety. This simple
16989   // selection mechanism works only for the basic types.
16990   if (ArgVT == MVT::f80) {
16991     llvm_unreachable("va_arg for f80 not yet implemented");
16992   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16993     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16994   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16995     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16996   } else {
16997     llvm_unreachable("Unhandled argument type in LowerVAARG");
16998   }
16999
17000   if (ArgMode == 2) {
17001     // Sanity Check: Make sure using fp_offset makes sense.
17002     assert(!DAG.getTarget().Options.UseSoftFloat &&
17003            !(DAG.getMachineFunction()
17004                 .getFunction()->getAttributes()
17005                 .hasAttribute(AttributeSet::FunctionIndex,
17006                               Attribute::NoImplicitFloat)) &&
17007            Subtarget->hasSSE1());
17008   }
17009
17010   // Insert VAARG_64 node into the DAG
17011   // VAARG_64 returns two values: Variable Argument Address, Chain
17012   SmallVector<SDValue, 11> InstOps;
17013   InstOps.push_back(Chain);
17014   InstOps.push_back(SrcPtr);
17015   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17016   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17017   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17018   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
17019   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17020                                           VTs, InstOps, MVT::i64,
17021                                           MachinePointerInfo(SV),
17022                                           /*Align=*/0,
17023                                           /*Volatile=*/false,
17024                                           /*ReadMem=*/true,
17025                                           /*WriteMem=*/true);
17026   Chain = VAARG.getValue(1);
17027
17028   // Load the next argument and return it
17029   return DAG.getLoad(ArgVT, dl,
17030                      Chain,
17031                      VAARG,
17032                      MachinePointerInfo(),
17033                      false, false, false, 0);
17034 }
17035
17036 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17037                            SelectionDAG &DAG) {
17038   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17039   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17040   SDValue Chain = Op.getOperand(0);
17041   SDValue DstPtr = Op.getOperand(1);
17042   SDValue SrcPtr = Op.getOperand(2);
17043   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17044   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17045   SDLoc DL(Op);
17046
17047   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17048                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17049                        false,
17050                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17051 }
17052
17053 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17054 // amount is a constant. Takes immediate version of shift as input.
17055 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17056                                           SDValue SrcOp, uint64_t ShiftAmt,
17057                                           SelectionDAG &DAG) {
17058   MVT ElementType = VT.getVectorElementType();
17059
17060   // Fold this packed shift into its first operand if ShiftAmt is 0.
17061   if (ShiftAmt == 0)
17062     return SrcOp;
17063
17064   // Check for ShiftAmt >= element width
17065   if (ShiftAmt >= ElementType.getSizeInBits()) {
17066     if (Opc == X86ISD::VSRAI)
17067       ShiftAmt = ElementType.getSizeInBits() - 1;
17068     else
17069       return DAG.getConstant(0, VT);
17070   }
17071
17072   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17073          && "Unknown target vector shift-by-constant node");
17074
17075   // Fold this packed vector shift into a build vector if SrcOp is a
17076   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17077   if (VT == SrcOp.getSimpleValueType() &&
17078       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17079     SmallVector<SDValue, 8> Elts;
17080     unsigned NumElts = SrcOp->getNumOperands();
17081     ConstantSDNode *ND;
17082
17083     switch(Opc) {
17084     default: llvm_unreachable(nullptr);
17085     case X86ISD::VSHLI:
17086       for (unsigned i=0; i!=NumElts; ++i) {
17087         SDValue CurrentOp = SrcOp->getOperand(i);
17088         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17089           Elts.push_back(CurrentOp);
17090           continue;
17091         }
17092         ND = cast<ConstantSDNode>(CurrentOp);
17093         const APInt &C = ND->getAPIntValue();
17094         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17095       }
17096       break;
17097     case X86ISD::VSRLI:
17098       for (unsigned i=0; i!=NumElts; ++i) {
17099         SDValue CurrentOp = SrcOp->getOperand(i);
17100         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17101           Elts.push_back(CurrentOp);
17102           continue;
17103         }
17104         ND = cast<ConstantSDNode>(CurrentOp);
17105         const APInt &C = ND->getAPIntValue();
17106         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17107       }
17108       break;
17109     case X86ISD::VSRAI:
17110       for (unsigned i=0; i!=NumElts; ++i) {
17111         SDValue CurrentOp = SrcOp->getOperand(i);
17112         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17113           Elts.push_back(CurrentOp);
17114           continue;
17115         }
17116         ND = cast<ConstantSDNode>(CurrentOp);
17117         const APInt &C = ND->getAPIntValue();
17118         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17119       }
17120       break;
17121     }
17122
17123     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17124   }
17125
17126   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17127 }
17128
17129 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17130 // may or may not be a constant. Takes immediate version of shift as input.
17131 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17132                                    SDValue SrcOp, SDValue ShAmt,
17133                                    SelectionDAG &DAG) {
17134   MVT SVT = ShAmt.getSimpleValueType();
17135   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17136
17137   // Catch shift-by-constant.
17138   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17139     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17140                                       CShAmt->getZExtValue(), DAG);
17141
17142   // Change opcode to non-immediate version
17143   switch (Opc) {
17144     default: llvm_unreachable("Unknown target vector shift node");
17145     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17146     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17147     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17148   }
17149
17150   const X86Subtarget &Subtarget =
17151       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17152   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17153       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17154     // Let the shuffle legalizer expand this shift amount node.
17155     SDValue Op0 = ShAmt.getOperand(0);
17156     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17157     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17158   } else {
17159     // Need to build a vector containing shift amount.
17160     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17161     SmallVector<SDValue, 4> ShOps;
17162     ShOps.push_back(ShAmt);
17163     if (SVT == MVT::i32) {
17164       ShOps.push_back(DAG.getConstant(0, SVT));
17165       ShOps.push_back(DAG.getUNDEF(SVT));
17166     }
17167     ShOps.push_back(DAG.getUNDEF(SVT));
17168
17169     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17170     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17171   }
17172
17173   // The return type has to be a 128-bit type with the same element
17174   // type as the input type.
17175   MVT EltVT = VT.getVectorElementType();
17176   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17177
17178   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17179   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17180 }
17181
17182 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17183 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17184 /// necessary casting for \p Mask when lowering masking intrinsics.
17185 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17186                                     SDValue PreservedSrc,
17187                                     const X86Subtarget *Subtarget,
17188                                     SelectionDAG &DAG) {
17189     EVT VT = Op.getValueType();
17190     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17191                                   MVT::i1, VT.getVectorNumElements());
17192     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17193                                      Mask.getValueType().getSizeInBits());
17194     SDLoc dl(Op);
17195
17196     assert(MaskVT.isSimple() && "invalid mask type");
17197
17198     if (isAllOnes(Mask))
17199       return Op;
17200
17201     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17202     // are extracted by EXTRACT_SUBVECTOR.
17203     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17204                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17205                               DAG.getIntPtrConstant(0));
17206
17207     switch (Op.getOpcode()) {
17208       default: break;
17209       case X86ISD::PCMPEQM:
17210       case X86ISD::PCMPGTM:
17211       case X86ISD::CMPM:
17212       case X86ISD::CMPMU:
17213         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17214     }
17215     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17216       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17217     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17218 }
17219
17220 /// \brief Creates an SDNode for a predicated scalar operation.
17221 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17222 /// The mask is comming as MVT::i8 and it should be truncated
17223 /// to MVT::i1 while lowering masking intrinsics.
17224 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17225 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17226 /// a scalar instruction.
17227 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17228                                     SDValue PreservedSrc,
17229                                     const X86Subtarget *Subtarget,
17230                                     SelectionDAG &DAG) {
17231     if (isAllOnes(Mask))
17232       return Op;
17233
17234     EVT VT = Op.getValueType();
17235     SDLoc dl(Op);
17236     // The mask should be of type MVT::i1
17237     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17238
17239     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17240       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17241     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17242 }
17243
17244 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17245                                        SelectionDAG &DAG) {
17246   SDLoc dl(Op);
17247   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17248   EVT VT = Op.getValueType();
17249   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17250   if (IntrData) {
17251     switch(IntrData->Type) {
17252     case INTR_TYPE_1OP:
17253       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17254     case INTR_TYPE_2OP:
17255       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17256         Op.getOperand(2));
17257     case INTR_TYPE_3OP:
17258       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17259         Op.getOperand(2), Op.getOperand(3));
17260     case INTR_TYPE_1OP_MASK_RM: {
17261       SDValue Src = Op.getOperand(1);
17262       SDValue Src0 = Op.getOperand(2);
17263       SDValue Mask = Op.getOperand(3);
17264       SDValue RoundingMode = Op.getOperand(4);
17265       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17266                                               RoundingMode),
17267                                   Mask, Src0, Subtarget, DAG);
17268     }
17269     case INTR_TYPE_SCALAR_MASK_RM: {
17270       SDValue Src1 = Op.getOperand(1);
17271       SDValue Src2 = Op.getOperand(2);
17272       SDValue Src0 = Op.getOperand(3);
17273       SDValue Mask = Op.getOperand(4);
17274       SDValue RoundingMode = Op.getOperand(5);
17275       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17276                                               RoundingMode),
17277                                   Mask, Src0, Subtarget, DAG);
17278     }
17279     case INTR_TYPE_2OP_MASK: {
17280       SDValue Mask = Op.getOperand(4);
17281       SDValue PassThru = Op.getOperand(3);
17282       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17283       if (IntrWithRoundingModeOpcode != 0) {
17284         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17285         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17286           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17287                                       dl, Op.getValueType(),
17288                                       Op.getOperand(1), Op.getOperand(2),
17289                                       Op.getOperand(3), Op.getOperand(5)),
17290                                       Mask, PassThru, Subtarget, DAG);
17291         }
17292       }
17293       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17294                                               Op.getOperand(1),
17295                                               Op.getOperand(2)),
17296                                   Mask, PassThru, Subtarget, DAG);
17297     }
17298     case FMA_OP_MASK: {
17299       SDValue Src1 = Op.getOperand(1);
17300       SDValue Src2 = Op.getOperand(2);
17301       SDValue Src3 = Op.getOperand(3);
17302       SDValue Mask = Op.getOperand(4);
17303       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17304       if (IntrWithRoundingModeOpcode != 0) {
17305         SDValue Rnd = Op.getOperand(5);
17306         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17307             X86::STATIC_ROUNDING::CUR_DIRECTION)
17308           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17309                                                   dl, Op.getValueType(),
17310                                                   Src1, Src2, Src3, Rnd),
17311                                       Mask, Src1, Subtarget, DAG);
17312       }
17313       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17314                                               dl, Op.getValueType(),
17315                                               Src1, Src2, Src3),
17316                                   Mask, Src1, Subtarget, DAG);
17317     }
17318     case CMP_MASK:
17319     case CMP_MASK_CC: {
17320       // Comparison intrinsics with masks.
17321       // Example of transformation:
17322       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17323       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17324       // (i8 (bitcast
17325       //   (v8i1 (insert_subvector undef,
17326       //           (v2i1 (and (PCMPEQM %a, %b),
17327       //                      (extract_subvector
17328       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17329       EVT VT = Op.getOperand(1).getValueType();
17330       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17331                                     VT.getVectorNumElements());
17332       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17333       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17334                                        Mask.getValueType().getSizeInBits());
17335       SDValue Cmp;
17336       if (IntrData->Type == CMP_MASK_CC) {
17337         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17338                     Op.getOperand(2), Op.getOperand(3));
17339       } else {
17340         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17341         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17342                     Op.getOperand(2));
17343       }
17344       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17345                                              DAG.getTargetConstant(0, MaskVT),
17346                                              Subtarget, DAG);
17347       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17348                                 DAG.getUNDEF(BitcastVT), CmpMask,
17349                                 DAG.getIntPtrConstant(0));
17350       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17351     }
17352     case COMI: { // Comparison intrinsics
17353       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17354       SDValue LHS = Op.getOperand(1);
17355       SDValue RHS = Op.getOperand(2);
17356       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17357       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17358       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17359       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17360                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17361       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17362     }
17363     case VSHIFT:
17364       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17365                                  Op.getOperand(1), Op.getOperand(2), DAG);
17366     case VSHIFT_MASK:
17367       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17368                                                       Op.getSimpleValueType(),
17369                                                       Op.getOperand(1),
17370                                                       Op.getOperand(2), DAG),
17371                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17372                                   DAG);
17373     case COMPRESS_EXPAND_IN_REG: {
17374       SDValue Mask = Op.getOperand(3);
17375       SDValue DataToCompress = Op.getOperand(1);
17376       SDValue PassThru = Op.getOperand(2);
17377       if (isAllOnes(Mask)) // return data as is
17378         return Op.getOperand(1);
17379       EVT VT = Op.getValueType();
17380       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17381                                     VT.getVectorNumElements());
17382       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17383                                        Mask.getValueType().getSizeInBits());
17384       SDLoc dl(Op);
17385       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17386                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17387                                   DAG.getIntPtrConstant(0));
17388
17389       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17390                          PassThru);
17391     }
17392     case BLEND: {
17393       SDValue Mask = Op.getOperand(3);
17394       EVT VT = Op.getValueType();
17395       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17396                                     VT.getVectorNumElements());
17397       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17398                                        Mask.getValueType().getSizeInBits());
17399       SDLoc dl(Op);
17400       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17401                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17402                                   DAG.getIntPtrConstant(0));
17403       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17404                          Op.getOperand(2));
17405     }
17406     default:
17407       break;
17408     }
17409   }
17410
17411   switch (IntNo) {
17412   default: return SDValue();    // Don't custom lower most intrinsics.
17413
17414   case Intrinsic::x86_avx512_mask_valign_q_512:
17415   case Intrinsic::x86_avx512_mask_valign_d_512:
17416     // Vector source operands are swapped.
17417     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17418                                             Op.getValueType(), Op.getOperand(2),
17419                                             Op.getOperand(1),
17420                                             Op.getOperand(3)),
17421                                 Op.getOperand(5), Op.getOperand(4),
17422                                 Subtarget, DAG);
17423
17424   // ptest and testp intrinsics. The intrinsic these come from are designed to
17425   // return an integer value, not just an instruction so lower it to the ptest
17426   // or testp pattern and a setcc for the result.
17427   case Intrinsic::x86_sse41_ptestz:
17428   case Intrinsic::x86_sse41_ptestc:
17429   case Intrinsic::x86_sse41_ptestnzc:
17430   case Intrinsic::x86_avx_ptestz_256:
17431   case Intrinsic::x86_avx_ptestc_256:
17432   case Intrinsic::x86_avx_ptestnzc_256:
17433   case Intrinsic::x86_avx_vtestz_ps:
17434   case Intrinsic::x86_avx_vtestc_ps:
17435   case Intrinsic::x86_avx_vtestnzc_ps:
17436   case Intrinsic::x86_avx_vtestz_pd:
17437   case Intrinsic::x86_avx_vtestc_pd:
17438   case Intrinsic::x86_avx_vtestnzc_pd:
17439   case Intrinsic::x86_avx_vtestz_ps_256:
17440   case Intrinsic::x86_avx_vtestc_ps_256:
17441   case Intrinsic::x86_avx_vtestnzc_ps_256:
17442   case Intrinsic::x86_avx_vtestz_pd_256:
17443   case Intrinsic::x86_avx_vtestc_pd_256:
17444   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17445     bool IsTestPacked = false;
17446     unsigned X86CC;
17447     switch (IntNo) {
17448     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17449     case Intrinsic::x86_avx_vtestz_ps:
17450     case Intrinsic::x86_avx_vtestz_pd:
17451     case Intrinsic::x86_avx_vtestz_ps_256:
17452     case Intrinsic::x86_avx_vtestz_pd_256:
17453       IsTestPacked = true; // Fallthrough
17454     case Intrinsic::x86_sse41_ptestz:
17455     case Intrinsic::x86_avx_ptestz_256:
17456       // ZF = 1
17457       X86CC = X86::COND_E;
17458       break;
17459     case Intrinsic::x86_avx_vtestc_ps:
17460     case Intrinsic::x86_avx_vtestc_pd:
17461     case Intrinsic::x86_avx_vtestc_ps_256:
17462     case Intrinsic::x86_avx_vtestc_pd_256:
17463       IsTestPacked = true; // Fallthrough
17464     case Intrinsic::x86_sse41_ptestc:
17465     case Intrinsic::x86_avx_ptestc_256:
17466       // CF = 1
17467       X86CC = X86::COND_B;
17468       break;
17469     case Intrinsic::x86_avx_vtestnzc_ps:
17470     case Intrinsic::x86_avx_vtestnzc_pd:
17471     case Intrinsic::x86_avx_vtestnzc_ps_256:
17472     case Intrinsic::x86_avx_vtestnzc_pd_256:
17473       IsTestPacked = true; // Fallthrough
17474     case Intrinsic::x86_sse41_ptestnzc:
17475     case Intrinsic::x86_avx_ptestnzc_256:
17476       // ZF and CF = 0
17477       X86CC = X86::COND_A;
17478       break;
17479     }
17480
17481     SDValue LHS = Op.getOperand(1);
17482     SDValue RHS = Op.getOperand(2);
17483     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17484     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17485     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17486     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17487     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17488   }
17489   case Intrinsic::x86_avx512_kortestz_w:
17490   case Intrinsic::x86_avx512_kortestc_w: {
17491     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17492     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17493     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17494     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17495     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17496     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17497     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17498   }
17499
17500   case Intrinsic::x86_sse42_pcmpistria128:
17501   case Intrinsic::x86_sse42_pcmpestria128:
17502   case Intrinsic::x86_sse42_pcmpistric128:
17503   case Intrinsic::x86_sse42_pcmpestric128:
17504   case Intrinsic::x86_sse42_pcmpistrio128:
17505   case Intrinsic::x86_sse42_pcmpestrio128:
17506   case Intrinsic::x86_sse42_pcmpistris128:
17507   case Intrinsic::x86_sse42_pcmpestris128:
17508   case Intrinsic::x86_sse42_pcmpistriz128:
17509   case Intrinsic::x86_sse42_pcmpestriz128: {
17510     unsigned Opcode;
17511     unsigned X86CC;
17512     switch (IntNo) {
17513     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17514     case Intrinsic::x86_sse42_pcmpistria128:
17515       Opcode = X86ISD::PCMPISTRI;
17516       X86CC = X86::COND_A;
17517       break;
17518     case Intrinsic::x86_sse42_pcmpestria128:
17519       Opcode = X86ISD::PCMPESTRI;
17520       X86CC = X86::COND_A;
17521       break;
17522     case Intrinsic::x86_sse42_pcmpistric128:
17523       Opcode = X86ISD::PCMPISTRI;
17524       X86CC = X86::COND_B;
17525       break;
17526     case Intrinsic::x86_sse42_pcmpestric128:
17527       Opcode = X86ISD::PCMPESTRI;
17528       X86CC = X86::COND_B;
17529       break;
17530     case Intrinsic::x86_sse42_pcmpistrio128:
17531       Opcode = X86ISD::PCMPISTRI;
17532       X86CC = X86::COND_O;
17533       break;
17534     case Intrinsic::x86_sse42_pcmpestrio128:
17535       Opcode = X86ISD::PCMPESTRI;
17536       X86CC = X86::COND_O;
17537       break;
17538     case Intrinsic::x86_sse42_pcmpistris128:
17539       Opcode = X86ISD::PCMPISTRI;
17540       X86CC = X86::COND_S;
17541       break;
17542     case Intrinsic::x86_sse42_pcmpestris128:
17543       Opcode = X86ISD::PCMPESTRI;
17544       X86CC = X86::COND_S;
17545       break;
17546     case Intrinsic::x86_sse42_pcmpistriz128:
17547       Opcode = X86ISD::PCMPISTRI;
17548       X86CC = X86::COND_E;
17549       break;
17550     case Intrinsic::x86_sse42_pcmpestriz128:
17551       Opcode = X86ISD::PCMPESTRI;
17552       X86CC = X86::COND_E;
17553       break;
17554     }
17555     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17556     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17557     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17558     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17559                                 DAG.getConstant(X86CC, MVT::i8),
17560                                 SDValue(PCMP.getNode(), 1));
17561     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17562   }
17563
17564   case Intrinsic::x86_sse42_pcmpistri128:
17565   case Intrinsic::x86_sse42_pcmpestri128: {
17566     unsigned Opcode;
17567     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17568       Opcode = X86ISD::PCMPISTRI;
17569     else
17570       Opcode = X86ISD::PCMPESTRI;
17571
17572     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17573     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17574     return DAG.getNode(Opcode, dl, VTs, NewOps);
17575   }
17576   }
17577 }
17578
17579 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17580                               SDValue Src, SDValue Mask, SDValue Base,
17581                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17582                               const X86Subtarget * Subtarget) {
17583   SDLoc dl(Op);
17584   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17585   assert(C && "Invalid scale type");
17586   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17587   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17588                              Index.getSimpleValueType().getVectorNumElements());
17589   SDValue MaskInReg;
17590   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17591   if (MaskC)
17592     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17593   else
17594     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17595   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17596   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17597   SDValue Segment = DAG.getRegister(0, MVT::i32);
17598   if (Src.getOpcode() == ISD::UNDEF)
17599     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17600   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17601   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17602   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17603   return DAG.getMergeValues(RetOps, dl);
17604 }
17605
17606 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17607                                SDValue Src, SDValue Mask, SDValue Base,
17608                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17609   SDLoc dl(Op);
17610   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17611   assert(C && "Invalid scale type");
17612   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17613   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17614   SDValue Segment = DAG.getRegister(0, MVT::i32);
17615   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17616                              Index.getSimpleValueType().getVectorNumElements());
17617   SDValue MaskInReg;
17618   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17619   if (MaskC)
17620     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17621   else
17622     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17623   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17624   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17625   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17626   return SDValue(Res, 1);
17627 }
17628
17629 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17630                                SDValue Mask, SDValue Base, SDValue Index,
17631                                SDValue ScaleOp, SDValue Chain) {
17632   SDLoc dl(Op);
17633   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17634   assert(C && "Invalid scale type");
17635   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17636   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17637   SDValue Segment = DAG.getRegister(0, MVT::i32);
17638   EVT MaskVT =
17639     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17640   SDValue MaskInReg;
17641   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17642   if (MaskC)
17643     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17644   else
17645     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17646   //SDVTList VTs = DAG.getVTList(MVT::Other);
17647   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17648   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17649   return SDValue(Res, 0);
17650 }
17651
17652 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17653 // read performance monitor counters (x86_rdpmc).
17654 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17655                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17656                               SmallVectorImpl<SDValue> &Results) {
17657   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17658   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17659   SDValue LO, HI;
17660
17661   // The ECX register is used to select the index of the performance counter
17662   // to read.
17663   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17664                                    N->getOperand(2));
17665   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17666
17667   // Reads the content of a 64-bit performance counter and returns it in the
17668   // registers EDX:EAX.
17669   if (Subtarget->is64Bit()) {
17670     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17671     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17672                             LO.getValue(2));
17673   } else {
17674     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17675     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17676                             LO.getValue(2));
17677   }
17678   Chain = HI.getValue(1);
17679
17680   if (Subtarget->is64Bit()) {
17681     // The EAX register is loaded with the low-order 32 bits. The EDX register
17682     // is loaded with the supported high-order bits of the counter.
17683     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17684                               DAG.getConstant(32, MVT::i8));
17685     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17686     Results.push_back(Chain);
17687     return;
17688   }
17689
17690   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17691   SDValue Ops[] = { LO, HI };
17692   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17693   Results.push_back(Pair);
17694   Results.push_back(Chain);
17695 }
17696
17697 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17698 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17699 // also used to custom lower READCYCLECOUNTER nodes.
17700 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17701                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17702                               SmallVectorImpl<SDValue> &Results) {
17703   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17704   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17705   SDValue LO, HI;
17706
17707   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17708   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17709   // and the EAX register is loaded with the low-order 32 bits.
17710   if (Subtarget->is64Bit()) {
17711     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17712     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17713                             LO.getValue(2));
17714   } else {
17715     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17716     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17717                             LO.getValue(2));
17718   }
17719   SDValue Chain = HI.getValue(1);
17720
17721   if (Opcode == X86ISD::RDTSCP_DAG) {
17722     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17723
17724     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17725     // the ECX register. Add 'ecx' explicitly to the chain.
17726     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17727                                      HI.getValue(2));
17728     // Explicitly store the content of ECX at the location passed in input
17729     // to the 'rdtscp' intrinsic.
17730     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17731                          MachinePointerInfo(), false, false, 0);
17732   }
17733
17734   if (Subtarget->is64Bit()) {
17735     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17736     // the EAX register is loaded with the low-order 32 bits.
17737     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17738                               DAG.getConstant(32, MVT::i8));
17739     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17740     Results.push_back(Chain);
17741     return;
17742   }
17743
17744   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17745   SDValue Ops[] = { LO, HI };
17746   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17747   Results.push_back(Pair);
17748   Results.push_back(Chain);
17749 }
17750
17751 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17752                                      SelectionDAG &DAG) {
17753   SmallVector<SDValue, 2> Results;
17754   SDLoc DL(Op);
17755   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17756                           Results);
17757   return DAG.getMergeValues(Results, DL);
17758 }
17759
17760
17761 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17762                                       SelectionDAG &DAG) {
17763   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17764
17765   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17766   if (!IntrData)
17767     return SDValue();
17768
17769   SDLoc dl(Op);
17770   switch(IntrData->Type) {
17771   default:
17772     llvm_unreachable("Unknown Intrinsic Type");
17773     break;
17774   case RDSEED:
17775   case RDRAND: {
17776     // Emit the node with the right value type.
17777     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17778     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17779
17780     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17781     // Otherwise return the value from Rand, which is always 0, casted to i32.
17782     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17783                       DAG.getConstant(1, Op->getValueType(1)),
17784                       DAG.getConstant(X86::COND_B, MVT::i32),
17785                       SDValue(Result.getNode(), 1) };
17786     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17787                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17788                                   Ops);
17789
17790     // Return { result, isValid, chain }.
17791     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17792                        SDValue(Result.getNode(), 2));
17793   }
17794   case GATHER: {
17795   //gather(v1, mask, index, base, scale);
17796     SDValue Chain = Op.getOperand(0);
17797     SDValue Src   = Op.getOperand(2);
17798     SDValue Base  = Op.getOperand(3);
17799     SDValue Index = Op.getOperand(4);
17800     SDValue Mask  = Op.getOperand(5);
17801     SDValue Scale = Op.getOperand(6);
17802     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17803                           Subtarget);
17804   }
17805   case SCATTER: {
17806   //scatter(base, mask, index, v1, scale);
17807     SDValue Chain = Op.getOperand(0);
17808     SDValue Base  = Op.getOperand(2);
17809     SDValue Mask  = Op.getOperand(3);
17810     SDValue Index = Op.getOperand(4);
17811     SDValue Src   = Op.getOperand(5);
17812     SDValue Scale = Op.getOperand(6);
17813     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17814   }
17815   case PREFETCH: {
17816     SDValue Hint = Op.getOperand(6);
17817     unsigned HintVal;
17818     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17819         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17820       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17821     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17822     SDValue Chain = Op.getOperand(0);
17823     SDValue Mask  = Op.getOperand(2);
17824     SDValue Index = Op.getOperand(3);
17825     SDValue Base  = Op.getOperand(4);
17826     SDValue Scale = Op.getOperand(5);
17827     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17828   }
17829   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17830   case RDTSC: {
17831     SmallVector<SDValue, 2> Results;
17832     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17833     return DAG.getMergeValues(Results, dl);
17834   }
17835   // Read Performance Monitoring Counters.
17836   case RDPMC: {
17837     SmallVector<SDValue, 2> Results;
17838     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17839     return DAG.getMergeValues(Results, dl);
17840   }
17841   // XTEST intrinsics.
17842   case XTEST: {
17843     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17844     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17845     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17846                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17847                                 InTrans);
17848     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17849     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17850                        Ret, SDValue(InTrans.getNode(), 1));
17851   }
17852   // ADC/ADCX/SBB
17853   case ADX: {
17854     SmallVector<SDValue, 2> Results;
17855     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17856     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17857     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17858                                 DAG.getConstant(-1, MVT::i8));
17859     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17860                               Op.getOperand(4), GenCF.getValue(1));
17861     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17862                                  Op.getOperand(5), MachinePointerInfo(),
17863                                  false, false, 0);
17864     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17865                                 DAG.getConstant(X86::COND_B, MVT::i8),
17866                                 Res.getValue(1));
17867     Results.push_back(SetCC);
17868     Results.push_back(Store);
17869     return DAG.getMergeValues(Results, dl);
17870   }
17871   case COMPRESS_TO_MEM: {
17872     SDLoc dl(Op);
17873     SDValue Mask = Op.getOperand(4);
17874     SDValue DataToCompress = Op.getOperand(3);
17875     SDValue Addr = Op.getOperand(2);
17876     SDValue Chain = Op.getOperand(0);
17877
17878     if (isAllOnes(Mask)) // return just a store
17879       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17880                           MachinePointerInfo(), false, false, 0);
17881
17882     EVT VT = DataToCompress.getValueType();
17883     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17884                                   VT.getVectorNumElements());
17885     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17886                                      Mask.getValueType().getSizeInBits());
17887     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17888                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17889                                 DAG.getIntPtrConstant(0));
17890
17891     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17892                                       DataToCompress, DAG.getUNDEF(VT));
17893     return DAG.getStore(Chain, dl, Compressed, Addr,
17894                         MachinePointerInfo(), false, false, 0);
17895   }
17896   case EXPAND_FROM_MEM: {
17897     SDLoc dl(Op);
17898     SDValue Mask = Op.getOperand(4);
17899     SDValue PathThru = Op.getOperand(3);
17900     SDValue Addr = Op.getOperand(2);
17901     SDValue Chain = Op.getOperand(0);
17902     EVT VT = Op.getValueType();
17903
17904     if (isAllOnes(Mask)) // return just a load
17905       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17906                          false, 0);
17907     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17908                                   VT.getVectorNumElements());
17909     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17910                                      Mask.getValueType().getSizeInBits());
17911     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17912                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17913                                 DAG.getIntPtrConstant(0));
17914
17915     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17916                                    false, false, false, 0);
17917
17918     SmallVector<SDValue, 2> Results;
17919     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17920                                   PathThru));
17921     Results.push_back(Chain);
17922     return DAG.getMergeValues(Results, dl);
17923   }
17924   }
17925 }
17926
17927 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17928                                            SelectionDAG &DAG) const {
17929   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17930   MFI->setReturnAddressIsTaken(true);
17931
17932   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17933     return SDValue();
17934
17935   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17936   SDLoc dl(Op);
17937   EVT PtrVT = getPointerTy();
17938
17939   if (Depth > 0) {
17940     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17941     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17942     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17943     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17944                        DAG.getNode(ISD::ADD, dl, PtrVT,
17945                                    FrameAddr, Offset),
17946                        MachinePointerInfo(), false, false, false, 0);
17947   }
17948
17949   // Just load the return address.
17950   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17951   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17952                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17953 }
17954
17955 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17956   MachineFunction &MF = DAG.getMachineFunction();
17957   MachineFrameInfo *MFI = MF.getFrameInfo();
17958   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
17959   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17960   EVT VT = Op.getValueType();
17961
17962   MFI->setFrameAddressIsTaken(true);
17963
17964   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
17965     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
17966     // is not possible to crawl up the stack without looking at the unwind codes
17967     // simultaneously.
17968     int FrameAddrIndex = FuncInfo->getFAIndex();
17969     if (!FrameAddrIndex) {
17970       // Set up a frame object for the return address.
17971       unsigned SlotSize = RegInfo->getSlotSize();
17972       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
17973           SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
17974       FuncInfo->setFAIndex(FrameAddrIndex);
17975     }
17976     return DAG.getFrameIndex(FrameAddrIndex, VT);
17977   }
17978
17979   unsigned FrameReg =
17980       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
17981   SDLoc dl(Op);  // FIXME probably not meaningful
17982   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17983   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17984           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17985          "Invalid Frame Register!");
17986   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17987   while (Depth--)
17988     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17989                             MachinePointerInfo(),
17990                             false, false, false, 0);
17991   return FrameAddr;
17992 }
17993
17994 // FIXME? Maybe this could be a TableGen attribute on some registers and
17995 // this table could be generated automatically from RegInfo.
17996 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17997                                               EVT VT) const {
17998   unsigned Reg = StringSwitch<unsigned>(RegName)
17999                        .Case("esp", X86::ESP)
18000                        .Case("rsp", X86::RSP)
18001                        .Default(0);
18002   if (Reg)
18003     return Reg;
18004   report_fatal_error("Invalid register name global variable");
18005 }
18006
18007 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18008                                                      SelectionDAG &DAG) const {
18009   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18010   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
18011 }
18012
18013 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18014   SDValue Chain     = Op.getOperand(0);
18015   SDValue Offset    = Op.getOperand(1);
18016   SDValue Handler   = Op.getOperand(2);
18017   SDLoc dl      (Op);
18018
18019   EVT PtrVT = getPointerTy();
18020   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18021   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18022   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18023           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18024          "Invalid Frame Register!");
18025   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18026   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18027
18028   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18029                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18030   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18031   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18032                        false, false, 0);
18033   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18034
18035   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18036                      DAG.getRegister(StoreAddrReg, PtrVT));
18037 }
18038
18039 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18040                                                SelectionDAG &DAG) const {
18041   SDLoc DL(Op);
18042   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18043                      DAG.getVTList(MVT::i32, MVT::Other),
18044                      Op.getOperand(0), Op.getOperand(1));
18045 }
18046
18047 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18048                                                 SelectionDAG &DAG) const {
18049   SDLoc DL(Op);
18050   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18051                      Op.getOperand(0), Op.getOperand(1));
18052 }
18053
18054 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18055   return Op.getOperand(0);
18056 }
18057
18058 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18059                                                 SelectionDAG &DAG) const {
18060   SDValue Root = Op.getOperand(0);
18061   SDValue Trmp = Op.getOperand(1); // trampoline
18062   SDValue FPtr = Op.getOperand(2); // nested function
18063   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18064   SDLoc dl (Op);
18065
18066   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18067   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18068
18069   if (Subtarget->is64Bit()) {
18070     SDValue OutChains[6];
18071
18072     // Large code-model.
18073     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18074     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18075
18076     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18077     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18078
18079     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18080
18081     // Load the pointer to the nested function into R11.
18082     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18083     SDValue Addr = Trmp;
18084     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18085                                 Addr, MachinePointerInfo(TrmpAddr),
18086                                 false, false, 0);
18087
18088     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18089                        DAG.getConstant(2, MVT::i64));
18090     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18091                                 MachinePointerInfo(TrmpAddr, 2),
18092                                 false, false, 2);
18093
18094     // Load the 'nest' parameter value into R10.
18095     // R10 is specified in X86CallingConv.td
18096     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18097     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18098                        DAG.getConstant(10, MVT::i64));
18099     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18100                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18101                                 false, false, 0);
18102
18103     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18104                        DAG.getConstant(12, MVT::i64));
18105     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18106                                 MachinePointerInfo(TrmpAddr, 12),
18107                                 false, false, 2);
18108
18109     // Jump to the nested function.
18110     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18111     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18112                        DAG.getConstant(20, MVT::i64));
18113     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18114                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18115                                 false, false, 0);
18116
18117     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18118     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18119                        DAG.getConstant(22, MVT::i64));
18120     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18121                                 MachinePointerInfo(TrmpAddr, 22),
18122                                 false, false, 0);
18123
18124     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18125   } else {
18126     const Function *Func =
18127       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18128     CallingConv::ID CC = Func->getCallingConv();
18129     unsigned NestReg;
18130
18131     switch (CC) {
18132     default:
18133       llvm_unreachable("Unsupported calling convention");
18134     case CallingConv::C:
18135     case CallingConv::X86_StdCall: {
18136       // Pass 'nest' parameter in ECX.
18137       // Must be kept in sync with X86CallingConv.td
18138       NestReg = X86::ECX;
18139
18140       // Check that ECX wasn't needed by an 'inreg' parameter.
18141       FunctionType *FTy = Func->getFunctionType();
18142       const AttributeSet &Attrs = Func->getAttributes();
18143
18144       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18145         unsigned InRegCount = 0;
18146         unsigned Idx = 1;
18147
18148         for (FunctionType::param_iterator I = FTy->param_begin(),
18149              E = FTy->param_end(); I != E; ++I, ++Idx)
18150           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18151             // FIXME: should only count parameters that are lowered to integers.
18152             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18153
18154         if (InRegCount > 2) {
18155           report_fatal_error("Nest register in use - reduce number of inreg"
18156                              " parameters!");
18157         }
18158       }
18159       break;
18160     }
18161     case CallingConv::X86_FastCall:
18162     case CallingConv::X86_ThisCall:
18163     case CallingConv::Fast:
18164       // Pass 'nest' parameter in EAX.
18165       // Must be kept in sync with X86CallingConv.td
18166       NestReg = X86::EAX;
18167       break;
18168     }
18169
18170     SDValue OutChains[4];
18171     SDValue Addr, Disp;
18172
18173     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18174                        DAG.getConstant(10, MVT::i32));
18175     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18176
18177     // This is storing the opcode for MOV32ri.
18178     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18179     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18180     OutChains[0] = DAG.getStore(Root, dl,
18181                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18182                                 Trmp, MachinePointerInfo(TrmpAddr),
18183                                 false, false, 0);
18184
18185     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18186                        DAG.getConstant(1, MVT::i32));
18187     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18188                                 MachinePointerInfo(TrmpAddr, 1),
18189                                 false, false, 1);
18190
18191     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18192     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18193                        DAG.getConstant(5, MVT::i32));
18194     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18195                                 MachinePointerInfo(TrmpAddr, 5),
18196                                 false, false, 1);
18197
18198     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18199                        DAG.getConstant(6, MVT::i32));
18200     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18201                                 MachinePointerInfo(TrmpAddr, 6),
18202                                 false, false, 1);
18203
18204     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18205   }
18206 }
18207
18208 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18209                                             SelectionDAG &DAG) const {
18210   /*
18211    The rounding mode is in bits 11:10 of FPSR, and has the following
18212    settings:
18213      00 Round to nearest
18214      01 Round to -inf
18215      10 Round to +inf
18216      11 Round to 0
18217
18218   FLT_ROUNDS, on the other hand, expects the following:
18219     -1 Undefined
18220      0 Round to 0
18221      1 Round to nearest
18222      2 Round to +inf
18223      3 Round to -inf
18224
18225   To perform the conversion, we do:
18226     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18227   */
18228
18229   MachineFunction &MF = DAG.getMachineFunction();
18230   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18231   unsigned StackAlignment = TFI.getStackAlignment();
18232   MVT VT = Op.getSimpleValueType();
18233   SDLoc DL(Op);
18234
18235   // Save FP Control Word to stack slot
18236   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18237   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18238
18239   MachineMemOperand *MMO =
18240    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18241                            MachineMemOperand::MOStore, 2, 2);
18242
18243   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18244   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18245                                           DAG.getVTList(MVT::Other),
18246                                           Ops, MVT::i16, MMO);
18247
18248   // Load FP Control Word from stack slot
18249   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18250                             MachinePointerInfo(), false, false, false, 0);
18251
18252   // Transform as necessary
18253   SDValue CWD1 =
18254     DAG.getNode(ISD::SRL, DL, MVT::i16,
18255                 DAG.getNode(ISD::AND, DL, MVT::i16,
18256                             CWD, DAG.getConstant(0x800, MVT::i16)),
18257                 DAG.getConstant(11, MVT::i8));
18258   SDValue CWD2 =
18259     DAG.getNode(ISD::SRL, DL, MVT::i16,
18260                 DAG.getNode(ISD::AND, DL, MVT::i16,
18261                             CWD, DAG.getConstant(0x400, MVT::i16)),
18262                 DAG.getConstant(9, MVT::i8));
18263
18264   SDValue RetVal =
18265     DAG.getNode(ISD::AND, DL, MVT::i16,
18266                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18267                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18268                             DAG.getConstant(1, MVT::i16)),
18269                 DAG.getConstant(3, MVT::i16));
18270
18271   return DAG.getNode((VT.getSizeInBits() < 16 ?
18272                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18273 }
18274
18275 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18276   MVT VT = Op.getSimpleValueType();
18277   EVT OpVT = VT;
18278   unsigned NumBits = VT.getSizeInBits();
18279   SDLoc dl(Op);
18280
18281   Op = Op.getOperand(0);
18282   if (VT == MVT::i8) {
18283     // Zero extend to i32 since there is not an i8 bsr.
18284     OpVT = MVT::i32;
18285     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18286   }
18287
18288   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18289   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18290   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18291
18292   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18293   SDValue Ops[] = {
18294     Op,
18295     DAG.getConstant(NumBits+NumBits-1, OpVT),
18296     DAG.getConstant(X86::COND_E, MVT::i8),
18297     Op.getValue(1)
18298   };
18299   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18300
18301   // Finally xor with NumBits-1.
18302   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18303
18304   if (VT == MVT::i8)
18305     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18306   return Op;
18307 }
18308
18309 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18310   MVT VT = Op.getSimpleValueType();
18311   EVT OpVT = VT;
18312   unsigned NumBits = VT.getSizeInBits();
18313   SDLoc dl(Op);
18314
18315   Op = Op.getOperand(0);
18316   if (VT == MVT::i8) {
18317     // Zero extend to i32 since there is not an i8 bsr.
18318     OpVT = MVT::i32;
18319     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18320   }
18321
18322   // Issue a bsr (scan bits in reverse).
18323   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18324   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18325
18326   // And xor with NumBits-1.
18327   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18328
18329   if (VT == MVT::i8)
18330     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18331   return Op;
18332 }
18333
18334 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18335   MVT VT = Op.getSimpleValueType();
18336   unsigned NumBits = VT.getSizeInBits();
18337   SDLoc dl(Op);
18338   Op = Op.getOperand(0);
18339
18340   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18341   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18342   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18343
18344   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18345   SDValue Ops[] = {
18346     Op,
18347     DAG.getConstant(NumBits, VT),
18348     DAG.getConstant(X86::COND_E, MVT::i8),
18349     Op.getValue(1)
18350   };
18351   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18352 }
18353
18354 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18355 // ones, and then concatenate the result back.
18356 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18357   MVT VT = Op.getSimpleValueType();
18358
18359   assert(VT.is256BitVector() && VT.isInteger() &&
18360          "Unsupported value type for operation");
18361
18362   unsigned NumElems = VT.getVectorNumElements();
18363   SDLoc dl(Op);
18364
18365   // Extract the LHS vectors
18366   SDValue LHS = Op.getOperand(0);
18367   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18368   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18369
18370   // Extract the RHS vectors
18371   SDValue RHS = Op.getOperand(1);
18372   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18373   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18374
18375   MVT EltVT = VT.getVectorElementType();
18376   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18377
18378   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18379                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18380                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18381 }
18382
18383 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18384   assert(Op.getSimpleValueType().is256BitVector() &&
18385          Op.getSimpleValueType().isInteger() &&
18386          "Only handle AVX 256-bit vector integer operation");
18387   return Lower256IntArith(Op, DAG);
18388 }
18389
18390 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18391   assert(Op.getSimpleValueType().is256BitVector() &&
18392          Op.getSimpleValueType().isInteger() &&
18393          "Only handle AVX 256-bit vector integer operation");
18394   return Lower256IntArith(Op, DAG);
18395 }
18396
18397 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18398                         SelectionDAG &DAG) {
18399   SDLoc dl(Op);
18400   MVT VT = Op.getSimpleValueType();
18401
18402   // Decompose 256-bit ops into smaller 128-bit ops.
18403   if (VT.is256BitVector() && !Subtarget->hasInt256())
18404     return Lower256IntArith(Op, DAG);
18405
18406   SDValue A = Op.getOperand(0);
18407   SDValue B = Op.getOperand(1);
18408
18409   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18410   if (VT == MVT::v4i32) {
18411     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18412            "Should not custom lower when pmuldq is available!");
18413
18414     // Extract the odd parts.
18415     static const int UnpackMask[] = { 1, -1, 3, -1 };
18416     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18417     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18418
18419     // Multiply the even parts.
18420     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18421     // Now multiply odd parts.
18422     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18423
18424     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18425     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18426
18427     // Merge the two vectors back together with a shuffle. This expands into 2
18428     // shuffles.
18429     static const int ShufMask[] = { 0, 4, 2, 6 };
18430     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18431   }
18432
18433   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18434          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18435
18436   //  Ahi = psrlqi(a, 32);
18437   //  Bhi = psrlqi(b, 32);
18438   //
18439   //  AloBlo = pmuludq(a, b);
18440   //  AloBhi = pmuludq(a, Bhi);
18441   //  AhiBlo = pmuludq(Ahi, b);
18442
18443   //  AloBhi = psllqi(AloBhi, 32);
18444   //  AhiBlo = psllqi(AhiBlo, 32);
18445   //  return AloBlo + AloBhi + AhiBlo;
18446
18447   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18448   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18449
18450   // Bit cast to 32-bit vectors for MULUDQ
18451   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18452                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18453   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18454   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18455   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18456   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18457
18458   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18459   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18460   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18461
18462   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18463   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18464
18465   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18466   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18467 }
18468
18469 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18470   assert(Subtarget->isTargetWin64() && "Unexpected target");
18471   EVT VT = Op.getValueType();
18472   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18473          "Unexpected return type for lowering");
18474
18475   RTLIB::Libcall LC;
18476   bool isSigned;
18477   switch (Op->getOpcode()) {
18478   default: llvm_unreachable("Unexpected request for libcall!");
18479   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18480   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18481   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18482   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18483   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18484   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18485   }
18486
18487   SDLoc dl(Op);
18488   SDValue InChain = DAG.getEntryNode();
18489
18490   TargetLowering::ArgListTy Args;
18491   TargetLowering::ArgListEntry Entry;
18492   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18493     EVT ArgVT = Op->getOperand(i).getValueType();
18494     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18495            "Unexpected argument type for lowering");
18496     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18497     Entry.Node = StackPtr;
18498     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18499                            false, false, 16);
18500     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18501     Entry.Ty = PointerType::get(ArgTy,0);
18502     Entry.isSExt = false;
18503     Entry.isZExt = false;
18504     Args.push_back(Entry);
18505   }
18506
18507   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18508                                          getPointerTy());
18509
18510   TargetLowering::CallLoweringInfo CLI(DAG);
18511   CLI.setDebugLoc(dl).setChain(InChain)
18512     .setCallee(getLibcallCallingConv(LC),
18513                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18514                Callee, std::move(Args), 0)
18515     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18516
18517   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18518   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18519 }
18520
18521 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18522                              SelectionDAG &DAG) {
18523   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18524   EVT VT = Op0.getValueType();
18525   SDLoc dl(Op);
18526
18527   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18528          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18529
18530   // PMULxD operations multiply each even value (starting at 0) of LHS with
18531   // the related value of RHS and produce a widen result.
18532   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18533   // => <2 x i64> <ae|cg>
18534   //
18535   // In other word, to have all the results, we need to perform two PMULxD:
18536   // 1. one with the even values.
18537   // 2. one with the odd values.
18538   // To achieve #2, with need to place the odd values at an even position.
18539   //
18540   // Place the odd value at an even position (basically, shift all values 1
18541   // step to the left):
18542   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18543   // <a|b|c|d> => <b|undef|d|undef>
18544   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18545   // <e|f|g|h> => <f|undef|h|undef>
18546   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18547
18548   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18549   // ints.
18550   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18551   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18552   unsigned Opcode =
18553       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18554   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18555   // => <2 x i64> <ae|cg>
18556   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18557                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18558   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18559   // => <2 x i64> <bf|dh>
18560   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18561                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18562
18563   // Shuffle it back into the right order.
18564   SDValue Highs, Lows;
18565   if (VT == MVT::v8i32) {
18566     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18567     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18568     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18569     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18570   } else {
18571     const int HighMask[] = {1, 5, 3, 7};
18572     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18573     const int LowMask[] = {0, 4, 2, 6};
18574     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18575   }
18576
18577   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18578   // unsigned multiply.
18579   if (IsSigned && !Subtarget->hasSSE41()) {
18580     SDValue ShAmt =
18581         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18582     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18583                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18584     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18585                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18586
18587     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18588     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18589   }
18590
18591   // The first result of MUL_LOHI is actually the low value, followed by the
18592   // high value.
18593   SDValue Ops[] = {Lows, Highs};
18594   return DAG.getMergeValues(Ops, dl);
18595 }
18596
18597 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18598                                          const X86Subtarget *Subtarget) {
18599   MVT VT = Op.getSimpleValueType();
18600   SDLoc dl(Op);
18601   SDValue R = Op.getOperand(0);
18602   SDValue Amt = Op.getOperand(1);
18603
18604   // Optimize shl/srl/sra with constant shift amount.
18605   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18606     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18607       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18608
18609       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18610           (Subtarget->hasInt256() &&
18611            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18612           (Subtarget->hasAVX512() &&
18613            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18614         if (Op.getOpcode() == ISD::SHL)
18615           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18616                                             DAG);
18617         if (Op.getOpcode() == ISD::SRL)
18618           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18619                                             DAG);
18620         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18621           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18622                                             DAG);
18623       }
18624
18625       if (VT == MVT::v16i8) {
18626         if (Op.getOpcode() == ISD::SHL) {
18627           // Make a large shift.
18628           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18629                                                    MVT::v8i16, R, ShiftAmt,
18630                                                    DAG);
18631           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18632           // Zero out the rightmost bits.
18633           SmallVector<SDValue, 16> V(16,
18634                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18635                                                      MVT::i8));
18636           return DAG.getNode(ISD::AND, dl, VT, SHL,
18637                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18638         }
18639         if (Op.getOpcode() == ISD::SRL) {
18640           // Make a large shift.
18641           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18642                                                    MVT::v8i16, R, ShiftAmt,
18643                                                    DAG);
18644           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18645           // Zero out the leftmost bits.
18646           SmallVector<SDValue, 16> V(16,
18647                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18648                                                      MVT::i8));
18649           return DAG.getNode(ISD::AND, dl, VT, SRL,
18650                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18651         }
18652         if (Op.getOpcode() == ISD::SRA) {
18653           if (ShiftAmt == 7) {
18654             // R s>> 7  ===  R s< 0
18655             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18656             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18657           }
18658
18659           // R s>> a === ((R u>> a) ^ m) - m
18660           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18661           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18662                                                          MVT::i8));
18663           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18664           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18665           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18666           return Res;
18667         }
18668         llvm_unreachable("Unknown shift opcode.");
18669       }
18670
18671       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18672         if (Op.getOpcode() == ISD::SHL) {
18673           // Make a large shift.
18674           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18675                                                    MVT::v16i16, R, ShiftAmt,
18676                                                    DAG);
18677           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18678           // Zero out the rightmost bits.
18679           SmallVector<SDValue, 32> V(32,
18680                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18681                                                      MVT::i8));
18682           return DAG.getNode(ISD::AND, dl, VT, SHL,
18683                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18684         }
18685         if (Op.getOpcode() == ISD::SRL) {
18686           // Make a large shift.
18687           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18688                                                    MVT::v16i16, R, ShiftAmt,
18689                                                    DAG);
18690           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18691           // Zero out the leftmost bits.
18692           SmallVector<SDValue, 32> V(32,
18693                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18694                                                      MVT::i8));
18695           return DAG.getNode(ISD::AND, dl, VT, SRL,
18696                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18697         }
18698         if (Op.getOpcode() == ISD::SRA) {
18699           if (ShiftAmt == 7) {
18700             // R s>> 7  ===  R s< 0
18701             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18702             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18703           }
18704
18705           // R s>> a === ((R u>> a) ^ m) - m
18706           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18707           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18708                                                          MVT::i8));
18709           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18710           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18711           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18712           return Res;
18713         }
18714         llvm_unreachable("Unknown shift opcode.");
18715       }
18716     }
18717   }
18718
18719   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18720   if (!Subtarget->is64Bit() &&
18721       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18722       Amt.getOpcode() == ISD::BITCAST &&
18723       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18724     Amt = Amt.getOperand(0);
18725     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18726                      VT.getVectorNumElements();
18727     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18728     uint64_t ShiftAmt = 0;
18729     for (unsigned i = 0; i != Ratio; ++i) {
18730       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18731       if (!C)
18732         return SDValue();
18733       // 6 == Log2(64)
18734       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18735     }
18736     // Check remaining shift amounts.
18737     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18738       uint64_t ShAmt = 0;
18739       for (unsigned j = 0; j != Ratio; ++j) {
18740         ConstantSDNode *C =
18741           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18742         if (!C)
18743           return SDValue();
18744         // 6 == Log2(64)
18745         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18746       }
18747       if (ShAmt != ShiftAmt)
18748         return SDValue();
18749     }
18750     switch (Op.getOpcode()) {
18751     default:
18752       llvm_unreachable("Unknown shift opcode!");
18753     case ISD::SHL:
18754       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18755                                         DAG);
18756     case ISD::SRL:
18757       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18758                                         DAG);
18759     case ISD::SRA:
18760       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18761                                         DAG);
18762     }
18763   }
18764
18765   return SDValue();
18766 }
18767
18768 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18769                                         const X86Subtarget* Subtarget) {
18770   MVT VT = Op.getSimpleValueType();
18771   SDLoc dl(Op);
18772   SDValue R = Op.getOperand(0);
18773   SDValue Amt = Op.getOperand(1);
18774
18775   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18776       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18777       (Subtarget->hasInt256() &&
18778        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18779         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18780        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18781     SDValue BaseShAmt;
18782     EVT EltVT = VT.getVectorElementType();
18783
18784     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18785       // Check if this build_vector node is doing a splat.
18786       // If so, then set BaseShAmt equal to the splat value.
18787       BaseShAmt = BV->getSplatValue();
18788       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18789         BaseShAmt = SDValue();
18790     } else {
18791       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18792         Amt = Amt.getOperand(0);
18793
18794       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18795       if (SVN && SVN->isSplat()) {
18796         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18797         SDValue InVec = Amt.getOperand(0);
18798         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18799           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18800                  "Unexpected shuffle index found!");
18801           BaseShAmt = InVec.getOperand(SplatIdx);
18802         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18803            if (ConstantSDNode *C =
18804                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18805              if (C->getZExtValue() == SplatIdx)
18806                BaseShAmt = InVec.getOperand(1);
18807            }
18808         }
18809
18810         if (!BaseShAmt)
18811           // Avoid introducing an extract element from a shuffle.
18812           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18813                                     DAG.getIntPtrConstant(SplatIdx));
18814       }
18815     }
18816
18817     if (BaseShAmt.getNode()) {
18818       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18819       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18820         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18821       else if (EltVT.bitsLT(MVT::i32))
18822         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18823
18824       switch (Op.getOpcode()) {
18825       default:
18826         llvm_unreachable("Unknown shift opcode!");
18827       case ISD::SHL:
18828         switch (VT.SimpleTy) {
18829         default: return SDValue();
18830         case MVT::v2i64:
18831         case MVT::v4i32:
18832         case MVT::v8i16:
18833         case MVT::v4i64:
18834         case MVT::v8i32:
18835         case MVT::v16i16:
18836         case MVT::v16i32:
18837         case MVT::v8i64:
18838           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18839         }
18840       case ISD::SRA:
18841         switch (VT.SimpleTy) {
18842         default: return SDValue();
18843         case MVT::v4i32:
18844         case MVT::v8i16:
18845         case MVT::v8i32:
18846         case MVT::v16i16:
18847         case MVT::v16i32:
18848         case MVT::v8i64:
18849           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18850         }
18851       case ISD::SRL:
18852         switch (VT.SimpleTy) {
18853         default: return SDValue();
18854         case MVT::v2i64:
18855         case MVT::v4i32:
18856         case MVT::v8i16:
18857         case MVT::v4i64:
18858         case MVT::v8i32:
18859         case MVT::v16i16:
18860         case MVT::v16i32:
18861         case MVT::v8i64:
18862           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18863         }
18864       }
18865     }
18866   }
18867
18868   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18869   if (!Subtarget->is64Bit() &&
18870       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18871       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18872       Amt.getOpcode() == ISD::BITCAST &&
18873       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18874     Amt = Amt.getOperand(0);
18875     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18876                      VT.getVectorNumElements();
18877     std::vector<SDValue> Vals(Ratio);
18878     for (unsigned i = 0; i != Ratio; ++i)
18879       Vals[i] = Amt.getOperand(i);
18880     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18881       for (unsigned j = 0; j != Ratio; ++j)
18882         if (Vals[j] != Amt.getOperand(i + j))
18883           return SDValue();
18884     }
18885     switch (Op.getOpcode()) {
18886     default:
18887       llvm_unreachable("Unknown shift opcode!");
18888     case ISD::SHL:
18889       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18890     case ISD::SRL:
18891       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18892     case ISD::SRA:
18893       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18894     }
18895   }
18896
18897   return SDValue();
18898 }
18899
18900 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18901                           SelectionDAG &DAG) {
18902   MVT VT = Op.getSimpleValueType();
18903   SDLoc dl(Op);
18904   SDValue R = Op.getOperand(0);
18905   SDValue Amt = Op.getOperand(1);
18906   SDValue V;
18907
18908   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18909   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18910
18911   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18912   if (V.getNode())
18913     return V;
18914
18915   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18916   if (V.getNode())
18917       return V;
18918
18919   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18920     return Op;
18921   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18922   if (Subtarget->hasInt256()) {
18923     if (Op.getOpcode() == ISD::SRL &&
18924         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18925          VT == MVT::v4i64 || VT == MVT::v8i32))
18926       return Op;
18927     if (Op.getOpcode() == ISD::SHL &&
18928         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18929          VT == MVT::v4i64 || VT == MVT::v8i32))
18930       return Op;
18931     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18932       return Op;
18933   }
18934
18935   // If possible, lower this packed shift into a vector multiply instead of
18936   // expanding it into a sequence of scalar shifts.
18937   // Do this only if the vector shift count is a constant build_vector.
18938   if (Op.getOpcode() == ISD::SHL &&
18939       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18940        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18941       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18942     SmallVector<SDValue, 8> Elts;
18943     EVT SVT = VT.getScalarType();
18944     unsigned SVTBits = SVT.getSizeInBits();
18945     const APInt &One = APInt(SVTBits, 1);
18946     unsigned NumElems = VT.getVectorNumElements();
18947
18948     for (unsigned i=0; i !=NumElems; ++i) {
18949       SDValue Op = Amt->getOperand(i);
18950       if (Op->getOpcode() == ISD::UNDEF) {
18951         Elts.push_back(Op);
18952         continue;
18953       }
18954
18955       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18956       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18957       uint64_t ShAmt = C.getZExtValue();
18958       if (ShAmt >= SVTBits) {
18959         Elts.push_back(DAG.getUNDEF(SVT));
18960         continue;
18961       }
18962       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18963     }
18964     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18965     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18966   }
18967
18968   // Lower SHL with variable shift amount.
18969   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18970     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18971
18972     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18973     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18974     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18975     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18976   }
18977
18978   // If possible, lower this shift as a sequence of two shifts by
18979   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18980   // Example:
18981   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18982   //
18983   // Could be rewritten as:
18984   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18985   //
18986   // The advantage is that the two shifts from the example would be
18987   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18988   // the vector shift into four scalar shifts plus four pairs of vector
18989   // insert/extract.
18990   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18991       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18992     unsigned TargetOpcode = X86ISD::MOVSS;
18993     bool CanBeSimplified;
18994     // The splat value for the first packed shift (the 'X' from the example).
18995     SDValue Amt1 = Amt->getOperand(0);
18996     // The splat value for the second packed shift (the 'Y' from the example).
18997     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18998                                         Amt->getOperand(2);
18999
19000     // See if it is possible to replace this node with a sequence of
19001     // two shifts followed by a MOVSS/MOVSD
19002     if (VT == MVT::v4i32) {
19003       // Check if it is legal to use a MOVSS.
19004       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
19005                         Amt2 == Amt->getOperand(3);
19006       if (!CanBeSimplified) {
19007         // Otherwise, check if we can still simplify this node using a MOVSD.
19008         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
19009                           Amt->getOperand(2) == Amt->getOperand(3);
19010         TargetOpcode = X86ISD::MOVSD;
19011         Amt2 = Amt->getOperand(2);
19012       }
19013     } else {
19014       // Do similar checks for the case where the machine value type
19015       // is MVT::v8i16.
19016       CanBeSimplified = Amt1 == Amt->getOperand(1);
19017       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
19018         CanBeSimplified = Amt2 == Amt->getOperand(i);
19019
19020       if (!CanBeSimplified) {
19021         TargetOpcode = X86ISD::MOVSD;
19022         CanBeSimplified = true;
19023         Amt2 = Amt->getOperand(4);
19024         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19025           CanBeSimplified = Amt1 == Amt->getOperand(i);
19026         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19027           CanBeSimplified = Amt2 == Amt->getOperand(j);
19028       }
19029     }
19030
19031     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19032         isa<ConstantSDNode>(Amt2)) {
19033       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19034       EVT CastVT = MVT::v4i32;
19035       SDValue Splat1 =
19036         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19037       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19038       SDValue Splat2 =
19039         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19040       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19041       if (TargetOpcode == X86ISD::MOVSD)
19042         CastVT = MVT::v2i64;
19043       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19044       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19045       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19046                                             BitCast1, DAG);
19047       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19048     }
19049   }
19050
19051   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19052     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19053
19054     // a = a << 5;
19055     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
19056     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19057
19058     // Turn 'a' into a mask suitable for VSELECT
19059     SDValue VSelM = DAG.getConstant(0x80, VT);
19060     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19061     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19062
19063     SDValue CM1 = DAG.getConstant(0x0f, VT);
19064     SDValue CM2 = DAG.getConstant(0x3f, VT);
19065
19066     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19067     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19068     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19069     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19070     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19071
19072     // a += a
19073     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19074     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19075     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19076
19077     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19078     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19079     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19080     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19081     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19082
19083     // a += a
19084     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19085     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19086     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19087
19088     // return VSELECT(r, r+r, a);
19089     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19090                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19091     return R;
19092   }
19093
19094   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19095   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19096   // solution better.
19097   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
19098     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
19099     unsigned ExtOpc =
19100         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19101     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19102     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19103     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19104                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19105     }
19106
19107   // Decompose 256-bit shifts into smaller 128-bit shifts.
19108   if (VT.is256BitVector()) {
19109     unsigned NumElems = VT.getVectorNumElements();
19110     MVT EltVT = VT.getVectorElementType();
19111     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19112
19113     // Extract the two vectors
19114     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19115     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19116
19117     // Recreate the shift amount vectors
19118     SDValue Amt1, Amt2;
19119     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19120       // Constant shift amount
19121       SmallVector<SDValue, 4> Amt1Csts;
19122       SmallVector<SDValue, 4> Amt2Csts;
19123       for (unsigned i = 0; i != NumElems/2; ++i)
19124         Amt1Csts.push_back(Amt->getOperand(i));
19125       for (unsigned i = NumElems/2; i != NumElems; ++i)
19126         Amt2Csts.push_back(Amt->getOperand(i));
19127
19128       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19129       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19130     } else {
19131       // Variable shift amount
19132       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19133       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19134     }
19135
19136     // Issue new vector shifts for the smaller types
19137     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19138     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19139
19140     // Concatenate the result back
19141     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19142   }
19143
19144   return SDValue();
19145 }
19146
19147 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19148   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19149   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19150   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19151   // has only one use.
19152   SDNode *N = Op.getNode();
19153   SDValue LHS = N->getOperand(0);
19154   SDValue RHS = N->getOperand(1);
19155   unsigned BaseOp = 0;
19156   unsigned Cond = 0;
19157   SDLoc DL(Op);
19158   switch (Op.getOpcode()) {
19159   default: llvm_unreachable("Unknown ovf instruction!");
19160   case ISD::SADDO:
19161     // A subtract of one will be selected as a INC. Note that INC doesn't
19162     // set CF, so we can't do this for UADDO.
19163     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19164       if (C->isOne()) {
19165         BaseOp = X86ISD::INC;
19166         Cond = X86::COND_O;
19167         break;
19168       }
19169     BaseOp = X86ISD::ADD;
19170     Cond = X86::COND_O;
19171     break;
19172   case ISD::UADDO:
19173     BaseOp = X86ISD::ADD;
19174     Cond = X86::COND_B;
19175     break;
19176   case ISD::SSUBO:
19177     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19178     // set CF, so we can't do this for USUBO.
19179     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19180       if (C->isOne()) {
19181         BaseOp = X86ISD::DEC;
19182         Cond = X86::COND_O;
19183         break;
19184       }
19185     BaseOp = X86ISD::SUB;
19186     Cond = X86::COND_O;
19187     break;
19188   case ISD::USUBO:
19189     BaseOp = X86ISD::SUB;
19190     Cond = X86::COND_B;
19191     break;
19192   case ISD::SMULO:
19193     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19194     Cond = X86::COND_O;
19195     break;
19196   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19197     if (N->getValueType(0) == MVT::i8) {
19198       BaseOp = X86ISD::UMUL8;
19199       Cond = X86::COND_O;
19200       break;
19201     }
19202     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19203                                  MVT::i32);
19204     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19205
19206     SDValue SetCC =
19207       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19208                   DAG.getConstant(X86::COND_O, MVT::i32),
19209                   SDValue(Sum.getNode(), 2));
19210
19211     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19212   }
19213   }
19214
19215   // Also sets EFLAGS.
19216   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19217   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19218
19219   SDValue SetCC =
19220     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19221                 DAG.getConstant(Cond, MVT::i32),
19222                 SDValue(Sum.getNode(), 1));
19223
19224   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19225 }
19226
19227 // Sign extension of the low part of vector elements. This may be used either
19228 // when sign extend instructions are not available or if the vector element
19229 // sizes already match the sign-extended size. If the vector elements are in
19230 // their pre-extended size and sign extend instructions are available, that will
19231 // be handled by LowerSIGN_EXTEND.
19232 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19233                                                   SelectionDAG &DAG) const {
19234   SDLoc dl(Op);
19235   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19236   MVT VT = Op.getSimpleValueType();
19237
19238   if (!Subtarget->hasSSE2() || !VT.isVector())
19239     return SDValue();
19240
19241   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19242                       ExtraVT.getScalarType().getSizeInBits();
19243
19244   switch (VT.SimpleTy) {
19245     default: return SDValue();
19246     case MVT::v8i32:
19247     case MVT::v16i16:
19248       if (!Subtarget->hasFp256())
19249         return SDValue();
19250       if (!Subtarget->hasInt256()) {
19251         // needs to be split
19252         unsigned NumElems = VT.getVectorNumElements();
19253
19254         // Extract the LHS vectors
19255         SDValue LHS = Op.getOperand(0);
19256         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19257         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19258
19259         MVT EltVT = VT.getVectorElementType();
19260         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19261
19262         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19263         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19264         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19265                                    ExtraNumElems/2);
19266         SDValue Extra = DAG.getValueType(ExtraVT);
19267
19268         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19269         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19270
19271         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19272       }
19273       // fall through
19274     case MVT::v4i32:
19275     case MVT::v8i16: {
19276       SDValue Op0 = Op.getOperand(0);
19277
19278       // This is a sign extension of some low part of vector elements without
19279       // changing the size of the vector elements themselves:
19280       // Shift-Left + Shift-Right-Algebraic.
19281       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19282                                                BitsDiff, DAG);
19283       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19284                                         DAG);
19285     }
19286   }
19287 }
19288
19289 /// Returns true if the operand type is exactly twice the native width, and
19290 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19291 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19292 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19293 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19294   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19295
19296   if (OpWidth == 64)
19297     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19298   else if (OpWidth == 128)
19299     return Subtarget->hasCmpxchg16b();
19300   else
19301     return false;
19302 }
19303
19304 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19305   return needsCmpXchgNb(SI->getValueOperand()->getType());
19306 }
19307
19308 // Note: this turns large loads into lock cmpxchg8b/16b.
19309 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19310 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19311   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19312   return needsCmpXchgNb(PTy->getElementType());
19313 }
19314
19315 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19316   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19317   const Type *MemType = AI->getType();
19318
19319   // If the operand is too big, we must see if cmpxchg8/16b is available
19320   // and default to library calls otherwise.
19321   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19322     return needsCmpXchgNb(MemType);
19323
19324   AtomicRMWInst::BinOp Op = AI->getOperation();
19325   switch (Op) {
19326   default:
19327     llvm_unreachable("Unknown atomic operation");
19328   case AtomicRMWInst::Xchg:
19329   case AtomicRMWInst::Add:
19330   case AtomicRMWInst::Sub:
19331     // It's better to use xadd, xsub or xchg for these in all cases.
19332     return false;
19333   case AtomicRMWInst::Or:
19334   case AtomicRMWInst::And:
19335   case AtomicRMWInst::Xor:
19336     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19337     // prefix to a normal instruction for these operations.
19338     return !AI->use_empty();
19339   case AtomicRMWInst::Nand:
19340   case AtomicRMWInst::Max:
19341   case AtomicRMWInst::Min:
19342   case AtomicRMWInst::UMax:
19343   case AtomicRMWInst::UMin:
19344     // These always require a non-trivial set of data operations on x86. We must
19345     // use a cmpxchg loop.
19346     return true;
19347   }
19348 }
19349
19350 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19351   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19352   // no-sse2). There isn't any reason to disable it if the target processor
19353   // supports it.
19354   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19355 }
19356
19357 LoadInst *
19358 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19359   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19360   const Type *MemType = AI->getType();
19361   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19362   // there is no benefit in turning such RMWs into loads, and it is actually
19363   // harmful as it introduces a mfence.
19364   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19365     return nullptr;
19366
19367   auto Builder = IRBuilder<>(AI);
19368   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19369   auto SynchScope = AI->getSynchScope();
19370   // We must restrict the ordering to avoid generating loads with Release or
19371   // ReleaseAcquire orderings.
19372   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19373   auto Ptr = AI->getPointerOperand();
19374
19375   // Before the load we need a fence. Here is an example lifted from
19376   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19377   // is required:
19378   // Thread 0:
19379   //   x.store(1, relaxed);
19380   //   r1 = y.fetch_add(0, release);
19381   // Thread 1:
19382   //   y.fetch_add(42, acquire);
19383   //   r2 = x.load(relaxed);
19384   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19385   // lowered to just a load without a fence. A mfence flushes the store buffer,
19386   // making the optimization clearly correct.
19387   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19388   // otherwise, we might be able to be more agressive on relaxed idempotent
19389   // rmw. In practice, they do not look useful, so we don't try to be
19390   // especially clever.
19391   if (SynchScope == SingleThread) {
19392     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19393     // the IR level, so we must wrap it in an intrinsic.
19394     return nullptr;
19395   } else if (hasMFENCE(*Subtarget)) {
19396     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19397             Intrinsic::x86_sse2_mfence);
19398     Builder.CreateCall(MFence);
19399   } else {
19400     // FIXME: it might make sense to use a locked operation here but on a
19401     // different cache-line to prevent cache-line bouncing. In practice it
19402     // is probably a small win, and x86 processors without mfence are rare
19403     // enough that we do not bother.
19404     return nullptr;
19405   }
19406
19407   // Finally we can emit the atomic load.
19408   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19409           AI->getType()->getPrimitiveSizeInBits());
19410   Loaded->setAtomic(Order, SynchScope);
19411   AI->replaceAllUsesWith(Loaded);
19412   AI->eraseFromParent();
19413   return Loaded;
19414 }
19415
19416 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19417                                  SelectionDAG &DAG) {
19418   SDLoc dl(Op);
19419   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19420     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19421   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19422     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19423
19424   // The only fence that needs an instruction is a sequentially-consistent
19425   // cross-thread fence.
19426   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19427     if (hasMFENCE(*Subtarget))
19428       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19429
19430     SDValue Chain = Op.getOperand(0);
19431     SDValue Zero = DAG.getConstant(0, MVT::i32);
19432     SDValue Ops[] = {
19433       DAG.getRegister(X86::ESP, MVT::i32), // Base
19434       DAG.getTargetConstant(1, MVT::i8),   // Scale
19435       DAG.getRegister(0, MVT::i32),        // Index
19436       DAG.getTargetConstant(0, MVT::i32),  // Disp
19437       DAG.getRegister(0, MVT::i32),        // Segment.
19438       Zero,
19439       Chain
19440     };
19441     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19442     return SDValue(Res, 0);
19443   }
19444
19445   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19446   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19447 }
19448
19449 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19450                              SelectionDAG &DAG) {
19451   MVT T = Op.getSimpleValueType();
19452   SDLoc DL(Op);
19453   unsigned Reg = 0;
19454   unsigned size = 0;
19455   switch(T.SimpleTy) {
19456   default: llvm_unreachable("Invalid value type!");
19457   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19458   case MVT::i16: Reg = X86::AX;  size = 2; break;
19459   case MVT::i32: Reg = X86::EAX; size = 4; break;
19460   case MVT::i64:
19461     assert(Subtarget->is64Bit() && "Node not type legal!");
19462     Reg = X86::RAX; size = 8;
19463     break;
19464   }
19465   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19466                                   Op.getOperand(2), SDValue());
19467   SDValue Ops[] = { cpIn.getValue(0),
19468                     Op.getOperand(1),
19469                     Op.getOperand(3),
19470                     DAG.getTargetConstant(size, MVT::i8),
19471                     cpIn.getValue(1) };
19472   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19473   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19474   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19475                                            Ops, T, MMO);
19476
19477   SDValue cpOut =
19478     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19479   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19480                                       MVT::i32, cpOut.getValue(2));
19481   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19482                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19483
19484   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19485   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19486   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19487   return SDValue();
19488 }
19489
19490 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19491                             SelectionDAG &DAG) {
19492   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19493   MVT DstVT = Op.getSimpleValueType();
19494
19495   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19496     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19497     if (DstVT != MVT::f64)
19498       // This conversion needs to be expanded.
19499       return SDValue();
19500
19501     SDValue InVec = Op->getOperand(0);
19502     SDLoc dl(Op);
19503     unsigned NumElts = SrcVT.getVectorNumElements();
19504     EVT SVT = SrcVT.getVectorElementType();
19505
19506     // Widen the vector in input in the case of MVT::v2i32.
19507     // Example: from MVT::v2i32 to MVT::v4i32.
19508     SmallVector<SDValue, 16> Elts;
19509     for (unsigned i = 0, e = NumElts; i != e; ++i)
19510       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19511                                  DAG.getIntPtrConstant(i)));
19512
19513     // Explicitly mark the extra elements as Undef.
19514     SDValue Undef = DAG.getUNDEF(SVT);
19515     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19516       Elts.push_back(Undef);
19517
19518     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19519     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19520     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19521     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19522                        DAG.getIntPtrConstant(0));
19523   }
19524
19525   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19526          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19527   assert((DstVT == MVT::i64 ||
19528           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19529          "Unexpected custom BITCAST");
19530   // i64 <=> MMX conversions are Legal.
19531   if (SrcVT==MVT::i64 && DstVT.isVector())
19532     return Op;
19533   if (DstVT==MVT::i64 && SrcVT.isVector())
19534     return Op;
19535   // MMX <=> MMX conversions are Legal.
19536   if (SrcVT.isVector() && DstVT.isVector())
19537     return Op;
19538   // All other conversions need to be expanded.
19539   return SDValue();
19540 }
19541
19542 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19543                           SelectionDAG &DAG) {
19544   SDNode *Node = Op.getNode();
19545   SDLoc dl(Node);
19546
19547   Op = Op.getOperand(0);
19548   EVT VT = Op.getValueType();
19549   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19550          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19551
19552   unsigned NumElts = VT.getVectorNumElements();
19553   EVT EltVT = VT.getVectorElementType();
19554   unsigned Len = EltVT.getSizeInBits();
19555
19556   // This is the vectorized version of the "best" algorithm from
19557   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19558   // with a minor tweak to use a series of adds + shifts instead of vector
19559   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19560   //
19561   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19562   //  v8i32 => Always profitable
19563   //
19564   // FIXME: There a couple of possible improvements:
19565   //
19566   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19567   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19568   //
19569   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19570          "CTPOP not implemented for this vector element type.");
19571
19572   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19573   // extra legalization.
19574   bool NeedsBitcast = EltVT == MVT::i32;
19575   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19576
19577   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19578   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19579   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19580
19581   // v = v - ((v >> 1) & 0x55555555...)
19582   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19583   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19584   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19585   if (NeedsBitcast)
19586     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19587
19588   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19589   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19590   if (NeedsBitcast)
19591     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19592
19593   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19594   if (VT != And.getValueType())
19595     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19596   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19597
19598   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19599   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19600   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19601   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19602   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19603
19604   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19605   if (NeedsBitcast) {
19606     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19607     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19608     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19609   }
19610
19611   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19612   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19613   if (VT != AndRHS.getValueType()) {
19614     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19615     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19616   }
19617   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19618
19619   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19620   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19621   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19622   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19623   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19624
19625   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19626   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19627   if (NeedsBitcast) {
19628     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19629     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19630   }
19631   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19632   if (VT != And.getValueType())
19633     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19634
19635   // The algorithm mentioned above uses:
19636   //    v = (v * 0x01010101...) >> (Len - 8)
19637   //
19638   // Change it to use vector adds + vector shifts which yield faster results on
19639   // Haswell than using vector integer multiplication.
19640   //
19641   // For i32 elements:
19642   //    v = v + (v >> 8)
19643   //    v = v + (v >> 16)
19644   //
19645   // For i64 elements:
19646   //    v = v + (v >> 8)
19647   //    v = v + (v >> 16)
19648   //    v = v + (v >> 32)
19649   //
19650   Add = And;
19651   SmallVector<SDValue, 8> Csts;
19652   for (unsigned i = 8; i <= Len/2; i *= 2) {
19653     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19654     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19655     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19656     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19657     Csts.clear();
19658   }
19659
19660   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19661   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19662   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19663   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19664   if (NeedsBitcast) {
19665     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19666     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19667   }
19668   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19669   if (VT != And.getValueType())
19670     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19671
19672   return And;
19673 }
19674
19675 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19676   SDNode *Node = Op.getNode();
19677   SDLoc dl(Node);
19678   EVT T = Node->getValueType(0);
19679   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19680                               DAG.getConstant(0, T), Node->getOperand(2));
19681   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19682                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19683                        Node->getOperand(0),
19684                        Node->getOperand(1), negOp,
19685                        cast<AtomicSDNode>(Node)->getMemOperand(),
19686                        cast<AtomicSDNode>(Node)->getOrdering(),
19687                        cast<AtomicSDNode>(Node)->getSynchScope());
19688 }
19689
19690 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19691   SDNode *Node = Op.getNode();
19692   SDLoc dl(Node);
19693   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19694
19695   // Convert seq_cst store -> xchg
19696   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19697   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19698   //        (The only way to get a 16-byte store is cmpxchg16b)
19699   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19700   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19701       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19702     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19703                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19704                                  Node->getOperand(0),
19705                                  Node->getOperand(1), Node->getOperand(2),
19706                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19707                                  cast<AtomicSDNode>(Node)->getOrdering(),
19708                                  cast<AtomicSDNode>(Node)->getSynchScope());
19709     return Swap.getValue(1);
19710   }
19711   // Other atomic stores have a simple pattern.
19712   return Op;
19713 }
19714
19715 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19716   EVT VT = Op.getNode()->getSimpleValueType(0);
19717
19718   // Let legalize expand this if it isn't a legal type yet.
19719   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19720     return SDValue();
19721
19722   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19723
19724   unsigned Opc;
19725   bool ExtraOp = false;
19726   switch (Op.getOpcode()) {
19727   default: llvm_unreachable("Invalid code");
19728   case ISD::ADDC: Opc = X86ISD::ADD; break;
19729   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19730   case ISD::SUBC: Opc = X86ISD::SUB; break;
19731   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19732   }
19733
19734   if (!ExtraOp)
19735     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19736                        Op.getOperand(1));
19737   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19738                      Op.getOperand(1), Op.getOperand(2));
19739 }
19740
19741 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19742                             SelectionDAG &DAG) {
19743   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19744
19745   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19746   // which returns the values as { float, float } (in XMM0) or
19747   // { double, double } (which is returned in XMM0, XMM1).
19748   SDLoc dl(Op);
19749   SDValue Arg = Op.getOperand(0);
19750   EVT ArgVT = Arg.getValueType();
19751   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19752
19753   TargetLowering::ArgListTy Args;
19754   TargetLowering::ArgListEntry Entry;
19755
19756   Entry.Node = Arg;
19757   Entry.Ty = ArgTy;
19758   Entry.isSExt = false;
19759   Entry.isZExt = false;
19760   Args.push_back(Entry);
19761
19762   bool isF64 = ArgVT == MVT::f64;
19763   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19764   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19765   // the results are returned via SRet in memory.
19766   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19767   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19768   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19769
19770   Type *RetTy = isF64
19771     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19772     : (Type*)VectorType::get(ArgTy, 4);
19773
19774   TargetLowering::CallLoweringInfo CLI(DAG);
19775   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19776     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19777
19778   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19779
19780   if (isF64)
19781     // Returned in xmm0 and xmm1.
19782     return CallResult.first;
19783
19784   // Returned in bits 0:31 and 32:64 xmm0.
19785   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19786                                CallResult.first, DAG.getIntPtrConstant(0));
19787   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19788                                CallResult.first, DAG.getIntPtrConstant(1));
19789   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19790   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19791 }
19792
19793 /// LowerOperation - Provide custom lowering hooks for some operations.
19794 ///
19795 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19796   switch (Op.getOpcode()) {
19797   default: llvm_unreachable("Should not custom lower this!");
19798   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19799   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19800   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19801     return LowerCMP_SWAP(Op, Subtarget, DAG);
19802   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19803   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19804   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19805   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19806   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19807   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19808   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19809   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19810   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19811   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19812   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19813   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19814   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19815   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19816   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19817   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19818   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19819   case ISD::SHL_PARTS:
19820   case ISD::SRA_PARTS:
19821   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19822   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19823   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19824   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19825   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19826   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19827   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19828   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19829   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19830   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19831   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19832   case ISD::FABS:
19833   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19834   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19835   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19836   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19837   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19838   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19839   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19840   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19841   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19842   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19843   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19844   case ISD::INTRINSIC_VOID:
19845   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19846   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19847   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19848   case ISD::FRAME_TO_ARGS_OFFSET:
19849                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19850   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19851   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19852   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19853   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19854   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19855   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19856   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19857   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19858   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19859   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19860   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19861   case ISD::UMUL_LOHI:
19862   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19863   case ISD::SRA:
19864   case ISD::SRL:
19865   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19866   case ISD::SADDO:
19867   case ISD::UADDO:
19868   case ISD::SSUBO:
19869   case ISD::USUBO:
19870   case ISD::SMULO:
19871   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19872   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19873   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19874   case ISD::ADDC:
19875   case ISD::ADDE:
19876   case ISD::SUBC:
19877   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19878   case ISD::ADD:                return LowerADD(Op, DAG);
19879   case ISD::SUB:                return LowerSUB(Op, DAG);
19880   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19881   }
19882 }
19883
19884 /// ReplaceNodeResults - Replace a node with an illegal result type
19885 /// with a new node built out of custom code.
19886 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19887                                            SmallVectorImpl<SDValue>&Results,
19888                                            SelectionDAG &DAG) const {
19889   SDLoc dl(N);
19890   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19891   switch (N->getOpcode()) {
19892   default:
19893     llvm_unreachable("Do not know how to custom type legalize this operation!");
19894   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19895   case X86ISD::FMINC:
19896   case X86ISD::FMIN:
19897   case X86ISD::FMAXC:
19898   case X86ISD::FMAX: {
19899     EVT VT = N->getValueType(0);
19900     if (VT != MVT::v2f32)
19901       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19902     SDValue UNDEF = DAG.getUNDEF(VT);
19903     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19904                               N->getOperand(0), UNDEF);
19905     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19906                               N->getOperand(1), UNDEF);
19907     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19908     return;
19909   }
19910   case ISD::SIGN_EXTEND_INREG:
19911   case ISD::ADDC:
19912   case ISD::ADDE:
19913   case ISD::SUBC:
19914   case ISD::SUBE:
19915     // We don't want to expand or promote these.
19916     return;
19917   case ISD::SDIV:
19918   case ISD::UDIV:
19919   case ISD::SREM:
19920   case ISD::UREM:
19921   case ISD::SDIVREM:
19922   case ISD::UDIVREM: {
19923     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19924     Results.push_back(V);
19925     return;
19926   }
19927   case ISD::FP_TO_SINT:
19928   case ISD::FP_TO_UINT: {
19929     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19930
19931     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19932       return;
19933
19934     std::pair<SDValue,SDValue> Vals =
19935         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19936     SDValue FIST = Vals.first, StackSlot = Vals.second;
19937     if (FIST.getNode()) {
19938       EVT VT = N->getValueType(0);
19939       // Return a load from the stack slot.
19940       if (StackSlot.getNode())
19941         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19942                                       MachinePointerInfo(),
19943                                       false, false, false, 0));
19944       else
19945         Results.push_back(FIST);
19946     }
19947     return;
19948   }
19949   case ISD::UINT_TO_FP: {
19950     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19951     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19952         N->getValueType(0) != MVT::v2f32)
19953       return;
19954     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19955                                  N->getOperand(0));
19956     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19957                                      MVT::f64);
19958     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19959     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19960                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19961     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19962     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19963     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19964     return;
19965   }
19966   case ISD::FP_ROUND: {
19967     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19968         return;
19969     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19970     Results.push_back(V);
19971     return;
19972   }
19973   case ISD::INTRINSIC_W_CHAIN: {
19974     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19975     switch (IntNo) {
19976     default : llvm_unreachable("Do not know how to custom type "
19977                                "legalize this intrinsic operation!");
19978     case Intrinsic::x86_rdtsc:
19979       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19980                                      Results);
19981     case Intrinsic::x86_rdtscp:
19982       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19983                                      Results);
19984     case Intrinsic::x86_rdpmc:
19985       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19986     }
19987   }
19988   case ISD::READCYCLECOUNTER: {
19989     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19990                                    Results);
19991   }
19992   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19993     EVT T = N->getValueType(0);
19994     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19995     bool Regs64bit = T == MVT::i128;
19996     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19997     SDValue cpInL, cpInH;
19998     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19999                         DAG.getConstant(0, HalfT));
20000     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
20001                         DAG.getConstant(1, HalfT));
20002     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
20003                              Regs64bit ? X86::RAX : X86::EAX,
20004                              cpInL, SDValue());
20005     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
20006                              Regs64bit ? X86::RDX : X86::EDX,
20007                              cpInH, cpInL.getValue(1));
20008     SDValue swapInL, swapInH;
20009     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20010                           DAG.getConstant(0, HalfT));
20011     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
20012                           DAG.getConstant(1, HalfT));
20013     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
20014                                Regs64bit ? X86::RBX : X86::EBX,
20015                                swapInL, cpInH.getValue(1));
20016     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
20017                                Regs64bit ? X86::RCX : X86::ECX,
20018                                swapInH, swapInL.getValue(1));
20019     SDValue Ops[] = { swapInH.getValue(0),
20020                       N->getOperand(1),
20021                       swapInH.getValue(1) };
20022     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20023     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20024     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20025                                   X86ISD::LCMPXCHG8_DAG;
20026     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
20027     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20028                                         Regs64bit ? X86::RAX : X86::EAX,
20029                                         HalfT, Result.getValue(1));
20030     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20031                                         Regs64bit ? X86::RDX : X86::EDX,
20032                                         HalfT, cpOutL.getValue(2));
20033     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20034
20035     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20036                                         MVT::i32, cpOutH.getValue(2));
20037     SDValue Success =
20038         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20039                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20040     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20041
20042     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20043     Results.push_back(Success);
20044     Results.push_back(EFLAGS.getValue(1));
20045     return;
20046   }
20047   case ISD::ATOMIC_SWAP:
20048   case ISD::ATOMIC_LOAD_ADD:
20049   case ISD::ATOMIC_LOAD_SUB:
20050   case ISD::ATOMIC_LOAD_AND:
20051   case ISD::ATOMIC_LOAD_OR:
20052   case ISD::ATOMIC_LOAD_XOR:
20053   case ISD::ATOMIC_LOAD_NAND:
20054   case ISD::ATOMIC_LOAD_MIN:
20055   case ISD::ATOMIC_LOAD_MAX:
20056   case ISD::ATOMIC_LOAD_UMIN:
20057   case ISD::ATOMIC_LOAD_UMAX:
20058   case ISD::ATOMIC_LOAD: {
20059     // Delegate to generic TypeLegalization. Situations we can really handle
20060     // should have already been dealt with by AtomicExpandPass.cpp.
20061     break;
20062   }
20063   case ISD::BITCAST: {
20064     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20065     EVT DstVT = N->getValueType(0);
20066     EVT SrcVT = N->getOperand(0)->getValueType(0);
20067
20068     if (SrcVT != MVT::f64 ||
20069         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20070       return;
20071
20072     unsigned NumElts = DstVT.getVectorNumElements();
20073     EVT SVT = DstVT.getVectorElementType();
20074     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20075     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20076                                    MVT::v2f64, N->getOperand(0));
20077     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20078
20079     if (ExperimentalVectorWideningLegalization) {
20080       // If we are legalizing vectors by widening, we already have the desired
20081       // legal vector type, just return it.
20082       Results.push_back(ToVecInt);
20083       return;
20084     }
20085
20086     SmallVector<SDValue, 8> Elts;
20087     for (unsigned i = 0, e = NumElts; i != e; ++i)
20088       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20089                                    ToVecInt, DAG.getIntPtrConstant(i)));
20090
20091     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20092   }
20093   }
20094 }
20095
20096 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20097   switch (Opcode) {
20098   default: return nullptr;
20099   case X86ISD::BSF:                return "X86ISD::BSF";
20100   case X86ISD::BSR:                return "X86ISD::BSR";
20101   case X86ISD::SHLD:               return "X86ISD::SHLD";
20102   case X86ISD::SHRD:               return "X86ISD::SHRD";
20103   case X86ISD::FAND:               return "X86ISD::FAND";
20104   case X86ISD::FANDN:              return "X86ISD::FANDN";
20105   case X86ISD::FOR:                return "X86ISD::FOR";
20106   case X86ISD::FXOR:               return "X86ISD::FXOR";
20107   case X86ISD::FSRL:               return "X86ISD::FSRL";
20108   case X86ISD::FILD:               return "X86ISD::FILD";
20109   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20110   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20111   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20112   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20113   case X86ISD::FLD:                return "X86ISD::FLD";
20114   case X86ISD::FST:                return "X86ISD::FST";
20115   case X86ISD::CALL:               return "X86ISD::CALL";
20116   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20117   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20118   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20119   case X86ISD::BT:                 return "X86ISD::BT";
20120   case X86ISD::CMP:                return "X86ISD::CMP";
20121   case X86ISD::COMI:               return "X86ISD::COMI";
20122   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20123   case X86ISD::CMPM:               return "X86ISD::CMPM";
20124   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20125   case X86ISD::SETCC:              return "X86ISD::SETCC";
20126   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20127   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20128   case X86ISD::CMOV:               return "X86ISD::CMOV";
20129   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20130   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20131   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20132   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20133   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20134   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20135   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20136   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20137   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20138   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20139   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20140   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20141   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20142   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20143   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20144   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20145   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20146   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20147   case X86ISD::HADD:               return "X86ISD::HADD";
20148   case X86ISD::HSUB:               return "X86ISD::HSUB";
20149   case X86ISD::FHADD:              return "X86ISD::FHADD";
20150   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20151   case X86ISD::UMAX:               return "X86ISD::UMAX";
20152   case X86ISD::UMIN:               return "X86ISD::UMIN";
20153   case X86ISD::SMAX:               return "X86ISD::SMAX";
20154   case X86ISD::SMIN:               return "X86ISD::SMIN";
20155   case X86ISD::FMAX:               return "X86ISD::FMAX";
20156   case X86ISD::FMIN:               return "X86ISD::FMIN";
20157   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20158   case X86ISD::FMINC:              return "X86ISD::FMINC";
20159   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20160   case X86ISD::FRCP:               return "X86ISD::FRCP";
20161   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20162   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20163   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20164   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20165   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20166   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20167   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20168   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20169   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20170   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20171   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20172   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20173   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20174   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20175   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20176   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20177   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20178   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20179   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20180   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20181   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20182   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20183   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20184   case X86ISD::VSHL:               return "X86ISD::VSHL";
20185   case X86ISD::VSRL:               return "X86ISD::VSRL";
20186   case X86ISD::VSRA:               return "X86ISD::VSRA";
20187   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20188   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20189   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20190   case X86ISD::CMPP:               return "X86ISD::CMPP";
20191   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20192   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20193   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20194   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20195   case X86ISD::ADD:                return "X86ISD::ADD";
20196   case X86ISD::SUB:                return "X86ISD::SUB";
20197   case X86ISD::ADC:                return "X86ISD::ADC";
20198   case X86ISD::SBB:                return "X86ISD::SBB";
20199   case X86ISD::SMUL:               return "X86ISD::SMUL";
20200   case X86ISD::UMUL:               return "X86ISD::UMUL";
20201   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20202   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20203   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20204   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20205   case X86ISD::INC:                return "X86ISD::INC";
20206   case X86ISD::DEC:                return "X86ISD::DEC";
20207   case X86ISD::OR:                 return "X86ISD::OR";
20208   case X86ISD::XOR:                return "X86ISD::XOR";
20209   case X86ISD::AND:                return "X86ISD::AND";
20210   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20211   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20212   case X86ISD::PTEST:              return "X86ISD::PTEST";
20213   case X86ISD::TESTP:              return "X86ISD::TESTP";
20214   case X86ISD::TESTM:              return "X86ISD::TESTM";
20215   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20216   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20217   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20218   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20219   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20220   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20221   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20222   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20223   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20224   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20225   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20226   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20227   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20228   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20229   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20230   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20231   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20232   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20233   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20234   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20235   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20236   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20237   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20238   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20239   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20240   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20241   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20242   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20243   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20244   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20245   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20246   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20247   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20248   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20249   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20250   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20251   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20252   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20253   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20254   case X86ISD::SAHF:               return "X86ISD::SAHF";
20255   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20256   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20257   case X86ISD::FMADD:              return "X86ISD::FMADD";
20258   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20259   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20260   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20261   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20262   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20263   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20264   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20265   case X86ISD::XTEST:              return "X86ISD::XTEST";
20266   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20267   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20268   case X86ISD::SELECT:             return "X86ISD::SELECT";
20269   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20270   case X86ISD::RCP28:              return "X86ISD::RCP28";
20271   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20272   }
20273 }
20274
20275 // isLegalAddressingMode - Return true if the addressing mode represented
20276 // by AM is legal for this target, for a load/store of the specified type.
20277 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20278                                               Type *Ty) const {
20279   // X86 supports extremely general addressing modes.
20280   CodeModel::Model M = getTargetMachine().getCodeModel();
20281   Reloc::Model R = getTargetMachine().getRelocationModel();
20282
20283   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20284   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20285     return false;
20286
20287   if (AM.BaseGV) {
20288     unsigned GVFlags =
20289       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20290
20291     // If a reference to this global requires an extra load, we can't fold it.
20292     if (isGlobalStubReference(GVFlags))
20293       return false;
20294
20295     // If BaseGV requires a register for the PIC base, we cannot also have a
20296     // BaseReg specified.
20297     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20298       return false;
20299
20300     // If lower 4G is not available, then we must use rip-relative addressing.
20301     if ((M != CodeModel::Small || R != Reloc::Static) &&
20302         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20303       return false;
20304   }
20305
20306   switch (AM.Scale) {
20307   case 0:
20308   case 1:
20309   case 2:
20310   case 4:
20311   case 8:
20312     // These scales always work.
20313     break;
20314   case 3:
20315   case 5:
20316   case 9:
20317     // These scales are formed with basereg+scalereg.  Only accept if there is
20318     // no basereg yet.
20319     if (AM.HasBaseReg)
20320       return false;
20321     break;
20322   default:  // Other stuff never works.
20323     return false;
20324   }
20325
20326   return true;
20327 }
20328
20329 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20330   unsigned Bits = Ty->getScalarSizeInBits();
20331
20332   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20333   // particularly cheaper than those without.
20334   if (Bits == 8)
20335     return false;
20336
20337   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20338   // variable shifts just as cheap as scalar ones.
20339   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20340     return false;
20341
20342   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20343   // fully general vector.
20344   return true;
20345 }
20346
20347 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20348   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20349     return false;
20350   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20351   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20352   return NumBits1 > NumBits2;
20353 }
20354
20355 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20356   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20357     return false;
20358
20359   if (!isTypeLegal(EVT::getEVT(Ty1)))
20360     return false;
20361
20362   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20363
20364   // Assuming the caller doesn't have a zeroext or signext return parameter,
20365   // truncation all the way down to i1 is valid.
20366   return true;
20367 }
20368
20369 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20370   return isInt<32>(Imm);
20371 }
20372
20373 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20374   // Can also use sub to handle negated immediates.
20375   return isInt<32>(Imm);
20376 }
20377
20378 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20379   if (!VT1.isInteger() || !VT2.isInteger())
20380     return false;
20381   unsigned NumBits1 = VT1.getSizeInBits();
20382   unsigned NumBits2 = VT2.getSizeInBits();
20383   return NumBits1 > NumBits2;
20384 }
20385
20386 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20387   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20388   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20389 }
20390
20391 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20392   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20393   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20394 }
20395
20396 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20397   EVT VT1 = Val.getValueType();
20398   if (isZExtFree(VT1, VT2))
20399     return true;
20400
20401   if (Val.getOpcode() != ISD::LOAD)
20402     return false;
20403
20404   if (!VT1.isSimple() || !VT1.isInteger() ||
20405       !VT2.isSimple() || !VT2.isInteger())
20406     return false;
20407
20408   switch (VT1.getSimpleVT().SimpleTy) {
20409   default: break;
20410   case MVT::i8:
20411   case MVT::i16:
20412   case MVT::i32:
20413     // X86 has 8, 16, and 32-bit zero-extending loads.
20414     return true;
20415   }
20416
20417   return false;
20418 }
20419
20420 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
20421
20422 bool
20423 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20424   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20425     return false;
20426
20427   VT = VT.getScalarType();
20428
20429   if (!VT.isSimple())
20430     return false;
20431
20432   switch (VT.getSimpleVT().SimpleTy) {
20433   case MVT::f32:
20434   case MVT::f64:
20435     return true;
20436   default:
20437     break;
20438   }
20439
20440   return false;
20441 }
20442
20443 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20444   // i16 instructions are longer (0x66 prefix) and potentially slower.
20445   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20446 }
20447
20448 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20449 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20450 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20451 /// are assumed to be legal.
20452 bool
20453 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20454                                       EVT VT) const {
20455   if (!VT.isSimple())
20456     return false;
20457
20458   MVT SVT = VT.getSimpleVT();
20459
20460   // Very little shuffling can be done for 64-bit vectors right now.
20461   if (VT.getSizeInBits() == 64)
20462     return false;
20463
20464   // This is an experimental legality test that is tailored to match the
20465   // legality test of the experimental lowering more closely. They are gated
20466   // separately to ease testing of performance differences.
20467   if (ExperimentalVectorShuffleLegality)
20468     // We only care that the types being shuffled are legal. The lowering can
20469     // handle any possible shuffle mask that results.
20470     return isTypeLegal(SVT);
20471
20472   // If this is a single-input shuffle with no 128 bit lane crossings we can
20473   // lower it into pshufb.
20474   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20475       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20476     bool isLegal = true;
20477     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20478       if (M[I] >= (int)SVT.getVectorNumElements() ||
20479           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20480         isLegal = false;
20481         break;
20482       }
20483     }
20484     if (isLegal)
20485       return true;
20486   }
20487
20488   // FIXME: blends, shifts.
20489   return (SVT.getVectorNumElements() == 2 ||
20490           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20491           isMOVLMask(M, SVT) ||
20492           isCommutedMOVLMask(M, SVT) ||
20493           isMOVHLPSMask(M, SVT) ||
20494           isSHUFPMask(M, SVT) ||
20495           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20496           isPSHUFDMask(M, SVT) ||
20497           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20498           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20499           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20500           isPALIGNRMask(M, SVT, Subtarget) ||
20501           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20502           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20503           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20504           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20505           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20506           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20507 }
20508
20509 bool
20510 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20511                                           EVT VT) const {
20512   if (!VT.isSimple())
20513     return false;
20514
20515   MVT SVT = VT.getSimpleVT();
20516
20517   // This is an experimental legality test that is tailored to match the
20518   // legality test of the experimental lowering more closely. They are gated
20519   // separately to ease testing of performance differences.
20520   if (ExperimentalVectorShuffleLegality)
20521     // The new vector shuffle lowering is very good at managing zero-inputs.
20522     return isShuffleMaskLegal(Mask, VT);
20523
20524   unsigned NumElts = SVT.getVectorNumElements();
20525   // FIXME: This collection of masks seems suspect.
20526   if (NumElts == 2)
20527     return true;
20528   if (NumElts == 4 && SVT.is128BitVector()) {
20529     return (isMOVLMask(Mask, SVT)  ||
20530             isCommutedMOVLMask(Mask, SVT, true) ||
20531             isSHUFPMask(Mask, SVT) ||
20532             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20533             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20534                         Subtarget->hasInt256()));
20535   }
20536   return false;
20537 }
20538
20539 //===----------------------------------------------------------------------===//
20540 //                           X86 Scheduler Hooks
20541 //===----------------------------------------------------------------------===//
20542
20543 /// Utility function to emit xbegin specifying the start of an RTM region.
20544 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20545                                      const TargetInstrInfo *TII) {
20546   DebugLoc DL = MI->getDebugLoc();
20547
20548   const BasicBlock *BB = MBB->getBasicBlock();
20549   MachineFunction::iterator I = MBB;
20550   ++I;
20551
20552   // For the v = xbegin(), we generate
20553   //
20554   // thisMBB:
20555   //  xbegin sinkMBB
20556   //
20557   // mainMBB:
20558   //  eax = -1
20559   //
20560   // sinkMBB:
20561   //  v = eax
20562
20563   MachineBasicBlock *thisMBB = MBB;
20564   MachineFunction *MF = MBB->getParent();
20565   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20566   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20567   MF->insert(I, mainMBB);
20568   MF->insert(I, sinkMBB);
20569
20570   // Transfer the remainder of BB and its successor edges to sinkMBB.
20571   sinkMBB->splice(sinkMBB->begin(), MBB,
20572                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20573   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20574
20575   // thisMBB:
20576   //  xbegin sinkMBB
20577   //  # fallthrough to mainMBB
20578   //  # abortion to sinkMBB
20579   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20580   thisMBB->addSuccessor(mainMBB);
20581   thisMBB->addSuccessor(sinkMBB);
20582
20583   // mainMBB:
20584   //  EAX = -1
20585   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20586   mainMBB->addSuccessor(sinkMBB);
20587
20588   // sinkMBB:
20589   // EAX is live into the sinkMBB
20590   sinkMBB->addLiveIn(X86::EAX);
20591   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20592           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20593     .addReg(X86::EAX);
20594
20595   MI->eraseFromParent();
20596   return sinkMBB;
20597 }
20598
20599 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20600 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20601 // in the .td file.
20602 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20603                                        const TargetInstrInfo *TII) {
20604   unsigned Opc;
20605   switch (MI->getOpcode()) {
20606   default: llvm_unreachable("illegal opcode!");
20607   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20608   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20609   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20610   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20611   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20612   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20613   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20614   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20615   }
20616
20617   DebugLoc dl = MI->getDebugLoc();
20618   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20619
20620   unsigned NumArgs = MI->getNumOperands();
20621   for (unsigned i = 1; i < NumArgs; ++i) {
20622     MachineOperand &Op = MI->getOperand(i);
20623     if (!(Op.isReg() && Op.isImplicit()))
20624       MIB.addOperand(Op);
20625   }
20626   if (MI->hasOneMemOperand())
20627     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20628
20629   BuildMI(*BB, MI, dl,
20630     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20631     .addReg(X86::XMM0);
20632
20633   MI->eraseFromParent();
20634   return BB;
20635 }
20636
20637 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20638 // defs in an instruction pattern
20639 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20640                                        const TargetInstrInfo *TII) {
20641   unsigned Opc;
20642   switch (MI->getOpcode()) {
20643   default: llvm_unreachable("illegal opcode!");
20644   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20645   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20646   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20647   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20648   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20649   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20650   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20651   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20652   }
20653
20654   DebugLoc dl = MI->getDebugLoc();
20655   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20656
20657   unsigned NumArgs = MI->getNumOperands(); // remove the results
20658   for (unsigned i = 1; i < NumArgs; ++i) {
20659     MachineOperand &Op = MI->getOperand(i);
20660     if (!(Op.isReg() && Op.isImplicit()))
20661       MIB.addOperand(Op);
20662   }
20663   if (MI->hasOneMemOperand())
20664     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20665
20666   BuildMI(*BB, MI, dl,
20667     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20668     .addReg(X86::ECX);
20669
20670   MI->eraseFromParent();
20671   return BB;
20672 }
20673
20674 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20675                                       const X86Subtarget *Subtarget) {
20676   DebugLoc dl = MI->getDebugLoc();
20677   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20678   // Address into RAX/EAX, other two args into ECX, EDX.
20679   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20680   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20681   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20682   for (int i = 0; i < X86::AddrNumOperands; ++i)
20683     MIB.addOperand(MI->getOperand(i));
20684
20685   unsigned ValOps = X86::AddrNumOperands;
20686   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20687     .addReg(MI->getOperand(ValOps).getReg());
20688   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20689     .addReg(MI->getOperand(ValOps+1).getReg());
20690
20691   // The instruction doesn't actually take any operands though.
20692   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20693
20694   MI->eraseFromParent(); // The pseudo is gone now.
20695   return BB;
20696 }
20697
20698 MachineBasicBlock *
20699 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20700                                                  MachineBasicBlock *MBB) const {
20701   // Emit va_arg instruction on X86-64.
20702
20703   // Operands to this pseudo-instruction:
20704   // 0  ) Output        : destination address (reg)
20705   // 1-5) Input         : va_list address (addr, i64mem)
20706   // 6  ) ArgSize       : Size (in bytes) of vararg type
20707   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20708   // 8  ) Align         : Alignment of type
20709   // 9  ) EFLAGS (implicit-def)
20710
20711   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20712   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20713
20714   unsigned DestReg = MI->getOperand(0).getReg();
20715   MachineOperand &Base = MI->getOperand(1);
20716   MachineOperand &Scale = MI->getOperand(2);
20717   MachineOperand &Index = MI->getOperand(3);
20718   MachineOperand &Disp = MI->getOperand(4);
20719   MachineOperand &Segment = MI->getOperand(5);
20720   unsigned ArgSize = MI->getOperand(6).getImm();
20721   unsigned ArgMode = MI->getOperand(7).getImm();
20722   unsigned Align = MI->getOperand(8).getImm();
20723
20724   // Memory Reference
20725   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20726   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20727   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20728
20729   // Machine Information
20730   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20731   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20732   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20733   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20734   DebugLoc DL = MI->getDebugLoc();
20735
20736   // struct va_list {
20737   //   i32   gp_offset
20738   //   i32   fp_offset
20739   //   i64   overflow_area (address)
20740   //   i64   reg_save_area (address)
20741   // }
20742   // sizeof(va_list) = 24
20743   // alignment(va_list) = 8
20744
20745   unsigned TotalNumIntRegs = 6;
20746   unsigned TotalNumXMMRegs = 8;
20747   bool UseGPOffset = (ArgMode == 1);
20748   bool UseFPOffset = (ArgMode == 2);
20749   unsigned MaxOffset = TotalNumIntRegs * 8 +
20750                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20751
20752   /* Align ArgSize to a multiple of 8 */
20753   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20754   bool NeedsAlign = (Align > 8);
20755
20756   MachineBasicBlock *thisMBB = MBB;
20757   MachineBasicBlock *overflowMBB;
20758   MachineBasicBlock *offsetMBB;
20759   MachineBasicBlock *endMBB;
20760
20761   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20762   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20763   unsigned OffsetReg = 0;
20764
20765   if (!UseGPOffset && !UseFPOffset) {
20766     // If we only pull from the overflow region, we don't create a branch.
20767     // We don't need to alter control flow.
20768     OffsetDestReg = 0; // unused
20769     OverflowDestReg = DestReg;
20770
20771     offsetMBB = nullptr;
20772     overflowMBB = thisMBB;
20773     endMBB = thisMBB;
20774   } else {
20775     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20776     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20777     // If not, pull from overflow_area. (branch to overflowMBB)
20778     //
20779     //       thisMBB
20780     //         |     .
20781     //         |        .
20782     //     offsetMBB   overflowMBB
20783     //         |        .
20784     //         |     .
20785     //        endMBB
20786
20787     // Registers for the PHI in endMBB
20788     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20789     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20790
20791     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20792     MachineFunction *MF = MBB->getParent();
20793     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20794     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20795     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20796
20797     MachineFunction::iterator MBBIter = MBB;
20798     ++MBBIter;
20799
20800     // Insert the new basic blocks
20801     MF->insert(MBBIter, offsetMBB);
20802     MF->insert(MBBIter, overflowMBB);
20803     MF->insert(MBBIter, endMBB);
20804
20805     // Transfer the remainder of MBB and its successor edges to endMBB.
20806     endMBB->splice(endMBB->begin(), thisMBB,
20807                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20808     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20809
20810     // Make offsetMBB and overflowMBB successors of thisMBB
20811     thisMBB->addSuccessor(offsetMBB);
20812     thisMBB->addSuccessor(overflowMBB);
20813
20814     // endMBB is a successor of both offsetMBB and overflowMBB
20815     offsetMBB->addSuccessor(endMBB);
20816     overflowMBB->addSuccessor(endMBB);
20817
20818     // Load the offset value into a register
20819     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20820     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20821       .addOperand(Base)
20822       .addOperand(Scale)
20823       .addOperand(Index)
20824       .addDisp(Disp, UseFPOffset ? 4 : 0)
20825       .addOperand(Segment)
20826       .setMemRefs(MMOBegin, MMOEnd);
20827
20828     // Check if there is enough room left to pull this argument.
20829     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20830       .addReg(OffsetReg)
20831       .addImm(MaxOffset + 8 - ArgSizeA8);
20832
20833     // Branch to "overflowMBB" if offset >= max
20834     // Fall through to "offsetMBB" otherwise
20835     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20836       .addMBB(overflowMBB);
20837   }
20838
20839   // In offsetMBB, emit code to use the reg_save_area.
20840   if (offsetMBB) {
20841     assert(OffsetReg != 0);
20842
20843     // Read the reg_save_area address.
20844     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20845     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20846       .addOperand(Base)
20847       .addOperand(Scale)
20848       .addOperand(Index)
20849       .addDisp(Disp, 16)
20850       .addOperand(Segment)
20851       .setMemRefs(MMOBegin, MMOEnd);
20852
20853     // Zero-extend the offset
20854     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20855       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20856         .addImm(0)
20857         .addReg(OffsetReg)
20858         .addImm(X86::sub_32bit);
20859
20860     // Add the offset to the reg_save_area to get the final address.
20861     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20862       .addReg(OffsetReg64)
20863       .addReg(RegSaveReg);
20864
20865     // Compute the offset for the next argument
20866     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20867     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20868       .addReg(OffsetReg)
20869       .addImm(UseFPOffset ? 16 : 8);
20870
20871     // Store it back into the va_list.
20872     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20873       .addOperand(Base)
20874       .addOperand(Scale)
20875       .addOperand(Index)
20876       .addDisp(Disp, UseFPOffset ? 4 : 0)
20877       .addOperand(Segment)
20878       .addReg(NextOffsetReg)
20879       .setMemRefs(MMOBegin, MMOEnd);
20880
20881     // Jump to endMBB
20882     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20883       .addMBB(endMBB);
20884   }
20885
20886   //
20887   // Emit code to use overflow area
20888   //
20889
20890   // Load the overflow_area address into a register.
20891   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20892   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20893     .addOperand(Base)
20894     .addOperand(Scale)
20895     .addOperand(Index)
20896     .addDisp(Disp, 8)
20897     .addOperand(Segment)
20898     .setMemRefs(MMOBegin, MMOEnd);
20899
20900   // If we need to align it, do so. Otherwise, just copy the address
20901   // to OverflowDestReg.
20902   if (NeedsAlign) {
20903     // Align the overflow address
20904     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20905     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20906
20907     // aligned_addr = (addr + (align-1)) & ~(align-1)
20908     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20909       .addReg(OverflowAddrReg)
20910       .addImm(Align-1);
20911
20912     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20913       .addReg(TmpReg)
20914       .addImm(~(uint64_t)(Align-1));
20915   } else {
20916     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20917       .addReg(OverflowAddrReg);
20918   }
20919
20920   // Compute the next overflow address after this argument.
20921   // (the overflow address should be kept 8-byte aligned)
20922   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20923   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20924     .addReg(OverflowDestReg)
20925     .addImm(ArgSizeA8);
20926
20927   // Store the new overflow address.
20928   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20929     .addOperand(Base)
20930     .addOperand(Scale)
20931     .addOperand(Index)
20932     .addDisp(Disp, 8)
20933     .addOperand(Segment)
20934     .addReg(NextAddrReg)
20935     .setMemRefs(MMOBegin, MMOEnd);
20936
20937   // If we branched, emit the PHI to the front of endMBB.
20938   if (offsetMBB) {
20939     BuildMI(*endMBB, endMBB->begin(), DL,
20940             TII->get(X86::PHI), DestReg)
20941       .addReg(OffsetDestReg).addMBB(offsetMBB)
20942       .addReg(OverflowDestReg).addMBB(overflowMBB);
20943   }
20944
20945   // Erase the pseudo instruction
20946   MI->eraseFromParent();
20947
20948   return endMBB;
20949 }
20950
20951 MachineBasicBlock *
20952 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20953                                                  MachineInstr *MI,
20954                                                  MachineBasicBlock *MBB) const {
20955   // Emit code to save XMM registers to the stack. The ABI says that the
20956   // number of registers to save is given in %al, so it's theoretically
20957   // possible to do an indirect jump trick to avoid saving all of them,
20958   // however this code takes a simpler approach and just executes all
20959   // of the stores if %al is non-zero. It's less code, and it's probably
20960   // easier on the hardware branch predictor, and stores aren't all that
20961   // expensive anyway.
20962
20963   // Create the new basic blocks. One block contains all the XMM stores,
20964   // and one block is the final destination regardless of whether any
20965   // stores were performed.
20966   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20967   MachineFunction *F = MBB->getParent();
20968   MachineFunction::iterator MBBIter = MBB;
20969   ++MBBIter;
20970   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20971   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20972   F->insert(MBBIter, XMMSaveMBB);
20973   F->insert(MBBIter, EndMBB);
20974
20975   // Transfer the remainder of MBB and its successor edges to EndMBB.
20976   EndMBB->splice(EndMBB->begin(), MBB,
20977                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20978   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20979
20980   // The original block will now fall through to the XMM save block.
20981   MBB->addSuccessor(XMMSaveMBB);
20982   // The XMMSaveMBB will fall through to the end block.
20983   XMMSaveMBB->addSuccessor(EndMBB);
20984
20985   // Now add the instructions.
20986   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20987   DebugLoc DL = MI->getDebugLoc();
20988
20989   unsigned CountReg = MI->getOperand(0).getReg();
20990   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20991   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20992
20993   if (!Subtarget->isTargetWin64()) {
20994     // If %al is 0, branch around the XMM save block.
20995     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20996     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20997     MBB->addSuccessor(EndMBB);
20998   }
20999
21000   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
21001   // that was just emitted, but clearly shouldn't be "saved".
21002   assert((MI->getNumOperands() <= 3 ||
21003           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
21004           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
21005          && "Expected last argument to be EFLAGS");
21006   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
21007   // In the XMM save block, save all the XMM argument registers.
21008   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
21009     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
21010     MachineMemOperand *MMO =
21011       F->getMachineMemOperand(
21012           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
21013         MachineMemOperand::MOStore,
21014         /*Size=*/16, /*Align=*/16);
21015     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
21016       .addFrameIndex(RegSaveFrameIndex)
21017       .addImm(/*Scale=*/1)
21018       .addReg(/*IndexReg=*/0)
21019       .addImm(/*Disp=*/Offset)
21020       .addReg(/*Segment=*/0)
21021       .addReg(MI->getOperand(i).getReg())
21022       .addMemOperand(MMO);
21023   }
21024
21025   MI->eraseFromParent();   // The pseudo instruction is gone now.
21026
21027   return EndMBB;
21028 }
21029
21030 // The EFLAGS operand of SelectItr might be missing a kill marker
21031 // because there were multiple uses of EFLAGS, and ISel didn't know
21032 // which to mark. Figure out whether SelectItr should have had a
21033 // kill marker, and set it if it should. Returns the correct kill
21034 // marker value.
21035 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21036                                      MachineBasicBlock* BB,
21037                                      const TargetRegisterInfo* TRI) {
21038   // Scan forward through BB for a use/def of EFLAGS.
21039   MachineBasicBlock::iterator miI(std::next(SelectItr));
21040   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21041     const MachineInstr& mi = *miI;
21042     if (mi.readsRegister(X86::EFLAGS))
21043       return false;
21044     if (mi.definesRegister(X86::EFLAGS))
21045       break; // Should have kill-flag - update below.
21046   }
21047
21048   // If we hit the end of the block, check whether EFLAGS is live into a
21049   // successor.
21050   if (miI == BB->end()) {
21051     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21052                                           sEnd = BB->succ_end();
21053          sItr != sEnd; ++sItr) {
21054       MachineBasicBlock* succ = *sItr;
21055       if (succ->isLiveIn(X86::EFLAGS))
21056         return false;
21057     }
21058   }
21059
21060   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21061   // out. SelectMI should have a kill flag on EFLAGS.
21062   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21063   return true;
21064 }
21065
21066 MachineBasicBlock *
21067 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21068                                      MachineBasicBlock *BB) const {
21069   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21070   DebugLoc DL = MI->getDebugLoc();
21071
21072   // To "insert" a SELECT_CC instruction, we actually have to insert the
21073   // diamond control-flow pattern.  The incoming instruction knows the
21074   // destination vreg to set, the condition code register to branch on, the
21075   // true/false values to select between, and a branch opcode to use.
21076   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21077   MachineFunction::iterator It = BB;
21078   ++It;
21079
21080   //  thisMBB:
21081   //  ...
21082   //   TrueVal = ...
21083   //   cmpTY ccX, r1, r2
21084   //   bCC copy1MBB
21085   //   fallthrough --> copy0MBB
21086   MachineBasicBlock *thisMBB = BB;
21087   MachineFunction *F = BB->getParent();
21088   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21089   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21090   F->insert(It, copy0MBB);
21091   F->insert(It, sinkMBB);
21092
21093   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21094   // live into the sink and copy blocks.
21095   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21096   if (!MI->killsRegister(X86::EFLAGS) &&
21097       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21098     copy0MBB->addLiveIn(X86::EFLAGS);
21099     sinkMBB->addLiveIn(X86::EFLAGS);
21100   }
21101
21102   // Transfer the remainder of BB and its successor edges to sinkMBB.
21103   sinkMBB->splice(sinkMBB->begin(), BB,
21104                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21105   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21106
21107   // Add the true and fallthrough blocks as its successors.
21108   BB->addSuccessor(copy0MBB);
21109   BB->addSuccessor(sinkMBB);
21110
21111   // Create the conditional branch instruction.
21112   unsigned Opc =
21113     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21114   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21115
21116   //  copy0MBB:
21117   //   %FalseValue = ...
21118   //   # fallthrough to sinkMBB
21119   copy0MBB->addSuccessor(sinkMBB);
21120
21121   //  sinkMBB:
21122   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21123   //  ...
21124   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21125           TII->get(X86::PHI), MI->getOperand(0).getReg())
21126     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21127     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21128
21129   MI->eraseFromParent();   // The pseudo instruction is gone now.
21130   return sinkMBB;
21131 }
21132
21133 MachineBasicBlock *
21134 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21135                                         MachineBasicBlock *BB) const {
21136   MachineFunction *MF = BB->getParent();
21137   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21138   DebugLoc DL = MI->getDebugLoc();
21139   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21140
21141   assert(MF->shouldSplitStack());
21142
21143   const bool Is64Bit = Subtarget->is64Bit();
21144   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21145
21146   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21147   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21148
21149   // BB:
21150   //  ... [Till the alloca]
21151   // If stacklet is not large enough, jump to mallocMBB
21152   //
21153   // bumpMBB:
21154   //  Allocate by subtracting from RSP
21155   //  Jump to continueMBB
21156   //
21157   // mallocMBB:
21158   //  Allocate by call to runtime
21159   //
21160   // continueMBB:
21161   //  ...
21162   //  [rest of original BB]
21163   //
21164
21165   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21166   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21167   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21168
21169   MachineRegisterInfo &MRI = MF->getRegInfo();
21170   const TargetRegisterClass *AddrRegClass =
21171     getRegClassFor(getPointerTy());
21172
21173   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21174     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21175     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21176     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21177     sizeVReg = MI->getOperand(1).getReg(),
21178     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21179
21180   MachineFunction::iterator MBBIter = BB;
21181   ++MBBIter;
21182
21183   MF->insert(MBBIter, bumpMBB);
21184   MF->insert(MBBIter, mallocMBB);
21185   MF->insert(MBBIter, continueMBB);
21186
21187   continueMBB->splice(continueMBB->begin(), BB,
21188                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21189   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21190
21191   // Add code to the main basic block to check if the stack limit has been hit,
21192   // and if so, jump to mallocMBB otherwise to bumpMBB.
21193   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21194   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21195     .addReg(tmpSPVReg).addReg(sizeVReg);
21196   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21197     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21198     .addReg(SPLimitVReg);
21199   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21200
21201   // bumpMBB simply decreases the stack pointer, since we know the current
21202   // stacklet has enough space.
21203   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21204     .addReg(SPLimitVReg);
21205   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21206     .addReg(SPLimitVReg);
21207   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21208
21209   // Calls into a routine in libgcc to allocate more space from the heap.
21210   const uint32_t *RegMask =
21211       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21212   if (IsLP64) {
21213     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21214       .addReg(sizeVReg);
21215     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21216       .addExternalSymbol("__morestack_allocate_stack_space")
21217       .addRegMask(RegMask)
21218       .addReg(X86::RDI, RegState::Implicit)
21219       .addReg(X86::RAX, RegState::ImplicitDefine);
21220   } else if (Is64Bit) {
21221     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21222       .addReg(sizeVReg);
21223     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21224       .addExternalSymbol("__morestack_allocate_stack_space")
21225       .addRegMask(RegMask)
21226       .addReg(X86::EDI, RegState::Implicit)
21227       .addReg(X86::EAX, RegState::ImplicitDefine);
21228   } else {
21229     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21230       .addImm(12);
21231     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21232     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21233       .addExternalSymbol("__morestack_allocate_stack_space")
21234       .addRegMask(RegMask)
21235       .addReg(X86::EAX, RegState::ImplicitDefine);
21236   }
21237
21238   if (!Is64Bit)
21239     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21240       .addImm(16);
21241
21242   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21243     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21244   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21245
21246   // Set up the CFG correctly.
21247   BB->addSuccessor(bumpMBB);
21248   BB->addSuccessor(mallocMBB);
21249   mallocMBB->addSuccessor(continueMBB);
21250   bumpMBB->addSuccessor(continueMBB);
21251
21252   // Take care of the PHI nodes.
21253   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21254           MI->getOperand(0).getReg())
21255     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21256     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21257
21258   // Delete the original pseudo instruction.
21259   MI->eraseFromParent();
21260
21261   // And we're done.
21262   return continueMBB;
21263 }
21264
21265 MachineBasicBlock *
21266 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21267                                         MachineBasicBlock *BB) const {
21268   DebugLoc DL = MI->getDebugLoc();
21269
21270   assert(!Subtarget->isTargetMachO());
21271
21272   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21273
21274   MI->eraseFromParent();   // The pseudo instruction is gone now.
21275   return BB;
21276 }
21277
21278 MachineBasicBlock *
21279 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21280                                       MachineBasicBlock *BB) const {
21281   // This is pretty easy.  We're taking the value that we received from
21282   // our load from the relocation, sticking it in either RDI (x86-64)
21283   // or EAX and doing an indirect call.  The return value will then
21284   // be in the normal return register.
21285   MachineFunction *F = BB->getParent();
21286   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21287   DebugLoc DL = MI->getDebugLoc();
21288
21289   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21290   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21291
21292   // Get a register mask for the lowered call.
21293   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21294   // proper register mask.
21295   const uint32_t *RegMask =
21296       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21297   if (Subtarget->is64Bit()) {
21298     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21299                                       TII->get(X86::MOV64rm), X86::RDI)
21300     .addReg(X86::RIP)
21301     .addImm(0).addReg(0)
21302     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21303                       MI->getOperand(3).getTargetFlags())
21304     .addReg(0);
21305     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21306     addDirectMem(MIB, X86::RDI);
21307     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21308   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21309     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21310                                       TII->get(X86::MOV32rm), X86::EAX)
21311     .addReg(0)
21312     .addImm(0).addReg(0)
21313     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21314                       MI->getOperand(3).getTargetFlags())
21315     .addReg(0);
21316     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21317     addDirectMem(MIB, X86::EAX);
21318     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21319   } else {
21320     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21321                                       TII->get(X86::MOV32rm), X86::EAX)
21322     .addReg(TII->getGlobalBaseReg(F))
21323     .addImm(0).addReg(0)
21324     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21325                       MI->getOperand(3).getTargetFlags())
21326     .addReg(0);
21327     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21328     addDirectMem(MIB, X86::EAX);
21329     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21330   }
21331
21332   MI->eraseFromParent(); // The pseudo instruction is gone now.
21333   return BB;
21334 }
21335
21336 MachineBasicBlock *
21337 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21338                                     MachineBasicBlock *MBB) const {
21339   DebugLoc DL = MI->getDebugLoc();
21340   MachineFunction *MF = MBB->getParent();
21341   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21342   MachineRegisterInfo &MRI = MF->getRegInfo();
21343
21344   const BasicBlock *BB = MBB->getBasicBlock();
21345   MachineFunction::iterator I = MBB;
21346   ++I;
21347
21348   // Memory Reference
21349   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21350   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21351
21352   unsigned DstReg;
21353   unsigned MemOpndSlot = 0;
21354
21355   unsigned CurOp = 0;
21356
21357   DstReg = MI->getOperand(CurOp++).getReg();
21358   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21359   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21360   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21361   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21362
21363   MemOpndSlot = CurOp;
21364
21365   MVT PVT = getPointerTy();
21366   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21367          "Invalid Pointer Size!");
21368
21369   // For v = setjmp(buf), we generate
21370   //
21371   // thisMBB:
21372   //  buf[LabelOffset] = restoreMBB
21373   //  SjLjSetup restoreMBB
21374   //
21375   // mainMBB:
21376   //  v_main = 0
21377   //
21378   // sinkMBB:
21379   //  v = phi(main, restore)
21380   //
21381   // restoreMBB:
21382   //  if base pointer being used, load it from frame
21383   //  v_restore = 1
21384
21385   MachineBasicBlock *thisMBB = MBB;
21386   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21387   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21388   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21389   MF->insert(I, mainMBB);
21390   MF->insert(I, sinkMBB);
21391   MF->push_back(restoreMBB);
21392
21393   MachineInstrBuilder MIB;
21394
21395   // Transfer the remainder of BB and its successor edges to sinkMBB.
21396   sinkMBB->splice(sinkMBB->begin(), MBB,
21397                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21398   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21399
21400   // thisMBB:
21401   unsigned PtrStoreOpc = 0;
21402   unsigned LabelReg = 0;
21403   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21404   Reloc::Model RM = MF->getTarget().getRelocationModel();
21405   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21406                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21407
21408   // Prepare IP either in reg or imm.
21409   if (!UseImmLabel) {
21410     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21411     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21412     LabelReg = MRI.createVirtualRegister(PtrRC);
21413     if (Subtarget->is64Bit()) {
21414       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21415               .addReg(X86::RIP)
21416               .addImm(0)
21417               .addReg(0)
21418               .addMBB(restoreMBB)
21419               .addReg(0);
21420     } else {
21421       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21422       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21423               .addReg(XII->getGlobalBaseReg(MF))
21424               .addImm(0)
21425               .addReg(0)
21426               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21427               .addReg(0);
21428     }
21429   } else
21430     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21431   // Store IP
21432   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21433   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21434     if (i == X86::AddrDisp)
21435       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21436     else
21437       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21438   }
21439   if (!UseImmLabel)
21440     MIB.addReg(LabelReg);
21441   else
21442     MIB.addMBB(restoreMBB);
21443   MIB.setMemRefs(MMOBegin, MMOEnd);
21444   // Setup
21445   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21446           .addMBB(restoreMBB);
21447
21448   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21449   MIB.addRegMask(RegInfo->getNoPreservedMask());
21450   thisMBB->addSuccessor(mainMBB);
21451   thisMBB->addSuccessor(restoreMBB);
21452
21453   // mainMBB:
21454   //  EAX = 0
21455   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21456   mainMBB->addSuccessor(sinkMBB);
21457
21458   // sinkMBB:
21459   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21460           TII->get(X86::PHI), DstReg)
21461     .addReg(mainDstReg).addMBB(mainMBB)
21462     .addReg(restoreDstReg).addMBB(restoreMBB);
21463
21464   // restoreMBB:
21465   if (RegInfo->hasBasePointer(*MF)) {
21466     const bool Uses64BitFramePtr =
21467         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21468     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21469     X86FI->setRestoreBasePointer(MF);
21470     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21471     unsigned BasePtr = RegInfo->getBaseRegister();
21472     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21473     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21474                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21475       .setMIFlag(MachineInstr::FrameSetup);
21476   }
21477   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21478   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21479   restoreMBB->addSuccessor(sinkMBB);
21480
21481   MI->eraseFromParent();
21482   return sinkMBB;
21483 }
21484
21485 MachineBasicBlock *
21486 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21487                                      MachineBasicBlock *MBB) const {
21488   DebugLoc DL = MI->getDebugLoc();
21489   MachineFunction *MF = MBB->getParent();
21490   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21491   MachineRegisterInfo &MRI = MF->getRegInfo();
21492
21493   // Memory Reference
21494   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21495   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21496
21497   MVT PVT = getPointerTy();
21498   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21499          "Invalid Pointer Size!");
21500
21501   const TargetRegisterClass *RC =
21502     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21503   unsigned Tmp = MRI.createVirtualRegister(RC);
21504   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21505   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21506   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21507   unsigned SP = RegInfo->getStackRegister();
21508
21509   MachineInstrBuilder MIB;
21510
21511   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21512   const int64_t SPOffset = 2 * PVT.getStoreSize();
21513
21514   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21515   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21516
21517   // Reload FP
21518   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21519   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21520     MIB.addOperand(MI->getOperand(i));
21521   MIB.setMemRefs(MMOBegin, MMOEnd);
21522   // Reload IP
21523   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21524   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21525     if (i == X86::AddrDisp)
21526       MIB.addDisp(MI->getOperand(i), LabelOffset);
21527     else
21528       MIB.addOperand(MI->getOperand(i));
21529   }
21530   MIB.setMemRefs(MMOBegin, MMOEnd);
21531   // Reload SP
21532   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21533   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21534     if (i == X86::AddrDisp)
21535       MIB.addDisp(MI->getOperand(i), SPOffset);
21536     else
21537       MIB.addOperand(MI->getOperand(i));
21538   }
21539   MIB.setMemRefs(MMOBegin, MMOEnd);
21540   // Jump
21541   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21542
21543   MI->eraseFromParent();
21544   return MBB;
21545 }
21546
21547 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21548 // accumulator loops. Writing back to the accumulator allows the coalescer
21549 // to remove extra copies in the loop.
21550 MachineBasicBlock *
21551 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21552                                  MachineBasicBlock *MBB) const {
21553   MachineOperand &AddendOp = MI->getOperand(3);
21554
21555   // Bail out early if the addend isn't a register - we can't switch these.
21556   if (!AddendOp.isReg())
21557     return MBB;
21558
21559   MachineFunction &MF = *MBB->getParent();
21560   MachineRegisterInfo &MRI = MF.getRegInfo();
21561
21562   // Check whether the addend is defined by a PHI:
21563   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21564   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21565   if (!AddendDef.isPHI())
21566     return MBB;
21567
21568   // Look for the following pattern:
21569   // loop:
21570   //   %addend = phi [%entry, 0], [%loop, %result]
21571   //   ...
21572   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21573
21574   // Replace with:
21575   //   loop:
21576   //   %addend = phi [%entry, 0], [%loop, %result]
21577   //   ...
21578   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21579
21580   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21581     assert(AddendDef.getOperand(i).isReg());
21582     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21583     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21584     if (&PHISrcInst == MI) {
21585       // Found a matching instruction.
21586       unsigned NewFMAOpc = 0;
21587       switch (MI->getOpcode()) {
21588         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21589         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21590         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21591         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21592         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21593         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21594         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21595         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21596         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21597         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21598         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21599         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21600         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21601         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21602         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21603         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21604         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21605         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21606         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21607         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21608
21609         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21610         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21611         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21612         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21613         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21614         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21615         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21616         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21617         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21618         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21619         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21620         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21621         default: llvm_unreachable("Unrecognized FMA variant.");
21622       }
21623
21624       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21625       MachineInstrBuilder MIB =
21626         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21627         .addOperand(MI->getOperand(0))
21628         .addOperand(MI->getOperand(3))
21629         .addOperand(MI->getOperand(2))
21630         .addOperand(MI->getOperand(1));
21631       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21632       MI->eraseFromParent();
21633     }
21634   }
21635
21636   return MBB;
21637 }
21638
21639 MachineBasicBlock *
21640 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21641                                                MachineBasicBlock *BB) const {
21642   switch (MI->getOpcode()) {
21643   default: llvm_unreachable("Unexpected instr type to insert");
21644   case X86::TAILJMPd64:
21645   case X86::TAILJMPr64:
21646   case X86::TAILJMPm64:
21647   case X86::TAILJMPd64_REX:
21648   case X86::TAILJMPr64_REX:
21649   case X86::TAILJMPm64_REX:
21650     llvm_unreachable("TAILJMP64 would not be touched here.");
21651   case X86::TCRETURNdi64:
21652   case X86::TCRETURNri64:
21653   case X86::TCRETURNmi64:
21654     return BB;
21655   case X86::WIN_ALLOCA:
21656     return EmitLoweredWinAlloca(MI, BB);
21657   case X86::SEG_ALLOCA_32:
21658   case X86::SEG_ALLOCA_64:
21659     return EmitLoweredSegAlloca(MI, BB);
21660   case X86::TLSCall_32:
21661   case X86::TLSCall_64:
21662     return EmitLoweredTLSCall(MI, BB);
21663   case X86::CMOV_GR8:
21664   case X86::CMOV_FR32:
21665   case X86::CMOV_FR64:
21666   case X86::CMOV_V4F32:
21667   case X86::CMOV_V2F64:
21668   case X86::CMOV_V2I64:
21669   case X86::CMOV_V8F32:
21670   case X86::CMOV_V4F64:
21671   case X86::CMOV_V4I64:
21672   case X86::CMOV_V16F32:
21673   case X86::CMOV_V8F64:
21674   case X86::CMOV_V8I64:
21675   case X86::CMOV_GR16:
21676   case X86::CMOV_GR32:
21677   case X86::CMOV_RFP32:
21678   case X86::CMOV_RFP64:
21679   case X86::CMOV_RFP80:
21680     return EmitLoweredSelect(MI, BB);
21681
21682   case X86::FP32_TO_INT16_IN_MEM:
21683   case X86::FP32_TO_INT32_IN_MEM:
21684   case X86::FP32_TO_INT64_IN_MEM:
21685   case X86::FP64_TO_INT16_IN_MEM:
21686   case X86::FP64_TO_INT32_IN_MEM:
21687   case X86::FP64_TO_INT64_IN_MEM:
21688   case X86::FP80_TO_INT16_IN_MEM:
21689   case X86::FP80_TO_INT32_IN_MEM:
21690   case X86::FP80_TO_INT64_IN_MEM: {
21691     MachineFunction *F = BB->getParent();
21692     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21693     DebugLoc DL = MI->getDebugLoc();
21694
21695     // Change the floating point control register to use "round towards zero"
21696     // mode when truncating to an integer value.
21697     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21698     addFrameReference(BuildMI(*BB, MI, DL,
21699                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21700
21701     // Load the old value of the high byte of the control word...
21702     unsigned OldCW =
21703       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21704     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21705                       CWFrameIdx);
21706
21707     // Set the high part to be round to zero...
21708     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21709       .addImm(0xC7F);
21710
21711     // Reload the modified control word now...
21712     addFrameReference(BuildMI(*BB, MI, DL,
21713                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21714
21715     // Restore the memory image of control word to original value
21716     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21717       .addReg(OldCW);
21718
21719     // Get the X86 opcode to use.
21720     unsigned Opc;
21721     switch (MI->getOpcode()) {
21722     default: llvm_unreachable("illegal opcode!");
21723     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21724     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21725     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21726     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21727     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21728     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21729     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21730     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21731     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21732     }
21733
21734     X86AddressMode AM;
21735     MachineOperand &Op = MI->getOperand(0);
21736     if (Op.isReg()) {
21737       AM.BaseType = X86AddressMode::RegBase;
21738       AM.Base.Reg = Op.getReg();
21739     } else {
21740       AM.BaseType = X86AddressMode::FrameIndexBase;
21741       AM.Base.FrameIndex = Op.getIndex();
21742     }
21743     Op = MI->getOperand(1);
21744     if (Op.isImm())
21745       AM.Scale = Op.getImm();
21746     Op = MI->getOperand(2);
21747     if (Op.isImm())
21748       AM.IndexReg = Op.getImm();
21749     Op = MI->getOperand(3);
21750     if (Op.isGlobal()) {
21751       AM.GV = Op.getGlobal();
21752     } else {
21753       AM.Disp = Op.getImm();
21754     }
21755     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21756                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21757
21758     // Reload the original control word now.
21759     addFrameReference(BuildMI(*BB, MI, DL,
21760                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21761
21762     MI->eraseFromParent();   // The pseudo instruction is gone now.
21763     return BB;
21764   }
21765     // String/text processing lowering.
21766   case X86::PCMPISTRM128REG:
21767   case X86::VPCMPISTRM128REG:
21768   case X86::PCMPISTRM128MEM:
21769   case X86::VPCMPISTRM128MEM:
21770   case X86::PCMPESTRM128REG:
21771   case X86::VPCMPESTRM128REG:
21772   case X86::PCMPESTRM128MEM:
21773   case X86::VPCMPESTRM128MEM:
21774     assert(Subtarget->hasSSE42() &&
21775            "Target must have SSE4.2 or AVX features enabled");
21776     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21777
21778   // String/text processing lowering.
21779   case X86::PCMPISTRIREG:
21780   case X86::VPCMPISTRIREG:
21781   case X86::PCMPISTRIMEM:
21782   case X86::VPCMPISTRIMEM:
21783   case X86::PCMPESTRIREG:
21784   case X86::VPCMPESTRIREG:
21785   case X86::PCMPESTRIMEM:
21786   case X86::VPCMPESTRIMEM:
21787     assert(Subtarget->hasSSE42() &&
21788            "Target must have SSE4.2 or AVX features enabled");
21789     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21790
21791   // Thread synchronization.
21792   case X86::MONITOR:
21793     return EmitMonitor(MI, BB, Subtarget);
21794
21795   // xbegin
21796   case X86::XBEGIN:
21797     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21798
21799   case X86::VASTART_SAVE_XMM_REGS:
21800     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21801
21802   case X86::VAARG_64:
21803     return EmitVAARG64WithCustomInserter(MI, BB);
21804
21805   case X86::EH_SjLj_SetJmp32:
21806   case X86::EH_SjLj_SetJmp64:
21807     return emitEHSjLjSetJmp(MI, BB);
21808
21809   case X86::EH_SjLj_LongJmp32:
21810   case X86::EH_SjLj_LongJmp64:
21811     return emitEHSjLjLongJmp(MI, BB);
21812
21813   case TargetOpcode::STATEPOINT:
21814     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21815     // this point in the process.  We diverge later.
21816     return emitPatchPoint(MI, BB);
21817
21818   case TargetOpcode::STACKMAP:
21819   case TargetOpcode::PATCHPOINT:
21820     return emitPatchPoint(MI, BB);
21821
21822   case X86::VFMADDPDr213r:
21823   case X86::VFMADDPSr213r:
21824   case X86::VFMADDSDr213r:
21825   case X86::VFMADDSSr213r:
21826   case X86::VFMSUBPDr213r:
21827   case X86::VFMSUBPSr213r:
21828   case X86::VFMSUBSDr213r:
21829   case X86::VFMSUBSSr213r:
21830   case X86::VFNMADDPDr213r:
21831   case X86::VFNMADDPSr213r:
21832   case X86::VFNMADDSDr213r:
21833   case X86::VFNMADDSSr213r:
21834   case X86::VFNMSUBPDr213r:
21835   case X86::VFNMSUBPSr213r:
21836   case X86::VFNMSUBSDr213r:
21837   case X86::VFNMSUBSSr213r:
21838   case X86::VFMADDSUBPDr213r:
21839   case X86::VFMADDSUBPSr213r:
21840   case X86::VFMSUBADDPDr213r:
21841   case X86::VFMSUBADDPSr213r:
21842   case X86::VFMADDPDr213rY:
21843   case X86::VFMADDPSr213rY:
21844   case X86::VFMSUBPDr213rY:
21845   case X86::VFMSUBPSr213rY:
21846   case X86::VFNMADDPDr213rY:
21847   case X86::VFNMADDPSr213rY:
21848   case X86::VFNMSUBPDr213rY:
21849   case X86::VFNMSUBPSr213rY:
21850   case X86::VFMADDSUBPDr213rY:
21851   case X86::VFMADDSUBPSr213rY:
21852   case X86::VFMSUBADDPDr213rY:
21853   case X86::VFMSUBADDPSr213rY:
21854     return emitFMA3Instr(MI, BB);
21855   }
21856 }
21857
21858 //===----------------------------------------------------------------------===//
21859 //                           X86 Optimization Hooks
21860 //===----------------------------------------------------------------------===//
21861
21862 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21863                                                       APInt &KnownZero,
21864                                                       APInt &KnownOne,
21865                                                       const SelectionDAG &DAG,
21866                                                       unsigned Depth) const {
21867   unsigned BitWidth = KnownZero.getBitWidth();
21868   unsigned Opc = Op.getOpcode();
21869   assert((Opc >= ISD::BUILTIN_OP_END ||
21870           Opc == ISD::INTRINSIC_WO_CHAIN ||
21871           Opc == ISD::INTRINSIC_W_CHAIN ||
21872           Opc == ISD::INTRINSIC_VOID) &&
21873          "Should use MaskedValueIsZero if you don't know whether Op"
21874          " is a target node!");
21875
21876   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21877   switch (Opc) {
21878   default: break;
21879   case X86ISD::ADD:
21880   case X86ISD::SUB:
21881   case X86ISD::ADC:
21882   case X86ISD::SBB:
21883   case X86ISD::SMUL:
21884   case X86ISD::UMUL:
21885   case X86ISD::INC:
21886   case X86ISD::DEC:
21887   case X86ISD::OR:
21888   case X86ISD::XOR:
21889   case X86ISD::AND:
21890     // These nodes' second result is a boolean.
21891     if (Op.getResNo() == 0)
21892       break;
21893     // Fallthrough
21894   case X86ISD::SETCC:
21895     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21896     break;
21897   case ISD::INTRINSIC_WO_CHAIN: {
21898     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21899     unsigned NumLoBits = 0;
21900     switch (IntId) {
21901     default: break;
21902     case Intrinsic::x86_sse_movmsk_ps:
21903     case Intrinsic::x86_avx_movmsk_ps_256:
21904     case Intrinsic::x86_sse2_movmsk_pd:
21905     case Intrinsic::x86_avx_movmsk_pd_256:
21906     case Intrinsic::x86_mmx_pmovmskb:
21907     case Intrinsic::x86_sse2_pmovmskb_128:
21908     case Intrinsic::x86_avx2_pmovmskb: {
21909       // High bits of movmskp{s|d}, pmovmskb are known zero.
21910       switch (IntId) {
21911         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21912         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21913         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21914         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21915         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21916         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21917         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21918         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21919       }
21920       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21921       break;
21922     }
21923     }
21924     break;
21925   }
21926   }
21927 }
21928
21929 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21930   SDValue Op,
21931   const SelectionDAG &,
21932   unsigned Depth) const {
21933   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21934   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21935     return Op.getValueType().getScalarType().getSizeInBits();
21936
21937   // Fallback case.
21938   return 1;
21939 }
21940
21941 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21942 /// node is a GlobalAddress + offset.
21943 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21944                                        const GlobalValue* &GA,
21945                                        int64_t &Offset) const {
21946   if (N->getOpcode() == X86ISD::Wrapper) {
21947     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21948       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21949       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21950       return true;
21951     }
21952   }
21953   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21954 }
21955
21956 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21957 /// same as extracting the high 128-bit part of 256-bit vector and then
21958 /// inserting the result into the low part of a new 256-bit vector
21959 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21960   EVT VT = SVOp->getValueType(0);
21961   unsigned NumElems = VT.getVectorNumElements();
21962
21963   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21964   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21965     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21966         SVOp->getMaskElt(j) >= 0)
21967       return false;
21968
21969   return true;
21970 }
21971
21972 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21973 /// same as extracting the low 128-bit part of 256-bit vector and then
21974 /// inserting the result into the high part of a new 256-bit vector
21975 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21976   EVT VT = SVOp->getValueType(0);
21977   unsigned NumElems = VT.getVectorNumElements();
21978
21979   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21980   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21981     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21982         SVOp->getMaskElt(j) >= 0)
21983       return false;
21984
21985   return true;
21986 }
21987
21988 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21989 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21990                                         TargetLowering::DAGCombinerInfo &DCI,
21991                                         const X86Subtarget* Subtarget) {
21992   SDLoc dl(N);
21993   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21994   SDValue V1 = SVOp->getOperand(0);
21995   SDValue V2 = SVOp->getOperand(1);
21996   EVT VT = SVOp->getValueType(0);
21997   unsigned NumElems = VT.getVectorNumElements();
21998
21999   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
22000       V2.getOpcode() == ISD::CONCAT_VECTORS) {
22001     //
22002     //                   0,0,0,...
22003     //                      |
22004     //    V      UNDEF    BUILD_VECTOR    UNDEF
22005     //     \      /           \           /
22006     //  CONCAT_VECTOR         CONCAT_VECTOR
22007     //         \                  /
22008     //          \                /
22009     //          RESULT: V + zero extended
22010     //
22011     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
22012         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
22013         V1.getOperand(1).getOpcode() != ISD::UNDEF)
22014       return SDValue();
22015
22016     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
22017       return SDValue();
22018
22019     // To match the shuffle mask, the first half of the mask should
22020     // be exactly the first vector, and all the rest a splat with the
22021     // first element of the second one.
22022     for (unsigned i = 0; i != NumElems/2; ++i)
22023       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22024           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22025         return SDValue();
22026
22027     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22028     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22029       if (Ld->hasNUsesOfValue(1, 0)) {
22030         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22031         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22032         SDValue ResNode =
22033           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22034                                   Ld->getMemoryVT(),
22035                                   Ld->getPointerInfo(),
22036                                   Ld->getAlignment(),
22037                                   false/*isVolatile*/, true/*ReadMem*/,
22038                                   false/*WriteMem*/);
22039
22040         // Make sure the newly-created LOAD is in the same position as Ld in
22041         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22042         // and update uses of Ld's output chain to use the TokenFactor.
22043         if (Ld->hasAnyUseOfValue(1)) {
22044           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22045                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22046           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22047           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22048                                  SDValue(ResNode.getNode(), 1));
22049         }
22050
22051         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22052       }
22053     }
22054
22055     // Emit a zeroed vector and insert the desired subvector on its
22056     // first half.
22057     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22058     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22059     return DCI.CombineTo(N, InsV);
22060   }
22061
22062   //===--------------------------------------------------------------------===//
22063   // Combine some shuffles into subvector extracts and inserts:
22064   //
22065
22066   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22067   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22068     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22069     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22070     return DCI.CombineTo(N, InsV);
22071   }
22072
22073   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22074   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22075     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22076     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22077     return DCI.CombineTo(N, InsV);
22078   }
22079
22080   return SDValue();
22081 }
22082
22083 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22084 /// possible.
22085 ///
22086 /// This is the leaf of the recursive combinine below. When we have found some
22087 /// chain of single-use x86 shuffle instructions and accumulated the combined
22088 /// shuffle mask represented by them, this will try to pattern match that mask
22089 /// into either a single instruction if there is a special purpose instruction
22090 /// for this operation, or into a PSHUFB instruction which is a fully general
22091 /// instruction but should only be used to replace chains over a certain depth.
22092 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22093                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22094                                    TargetLowering::DAGCombinerInfo &DCI,
22095                                    const X86Subtarget *Subtarget) {
22096   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22097
22098   // Find the operand that enters the chain. Note that multiple uses are OK
22099   // here, we're not going to remove the operand we find.
22100   SDValue Input = Op.getOperand(0);
22101   while (Input.getOpcode() == ISD::BITCAST)
22102     Input = Input.getOperand(0);
22103
22104   MVT VT = Input.getSimpleValueType();
22105   MVT RootVT = Root.getSimpleValueType();
22106   SDLoc DL(Root);
22107
22108   // Just remove no-op shuffle masks.
22109   if (Mask.size() == 1) {
22110     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22111                   /*AddTo*/ true);
22112     return true;
22113   }
22114
22115   // Use the float domain if the operand type is a floating point type.
22116   bool FloatDomain = VT.isFloatingPoint();
22117
22118   // For floating point shuffles, we don't have free copies in the shuffle
22119   // instructions or the ability to load as part of the instruction, so
22120   // canonicalize their shuffles to UNPCK or MOV variants.
22121   //
22122   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22123   // vectors because it can have a load folded into it that UNPCK cannot. This
22124   // doesn't preclude something switching to the shorter encoding post-RA.
22125   if (FloatDomain) {
22126     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22127       bool Lo = Mask.equals(0, 0);
22128       unsigned Shuffle;
22129       MVT ShuffleVT;
22130       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22131       // is no slower than UNPCKLPD but has the option to fold the input operand
22132       // into even an unaligned memory load.
22133       if (Lo && Subtarget->hasSSE3()) {
22134         Shuffle = X86ISD::MOVDDUP;
22135         ShuffleVT = MVT::v2f64;
22136       } else {
22137         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22138         // than the UNPCK variants.
22139         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22140         ShuffleVT = MVT::v4f32;
22141       }
22142       if (Depth == 1 && Root->getOpcode() == Shuffle)
22143         return false; // Nothing to do!
22144       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22145       DCI.AddToWorklist(Op.getNode());
22146       if (Shuffle == X86ISD::MOVDDUP)
22147         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22148       else
22149         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22150       DCI.AddToWorklist(Op.getNode());
22151       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22152                     /*AddTo*/ true);
22153       return true;
22154     }
22155     if (Subtarget->hasSSE3() &&
22156         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22157       bool Lo = Mask.equals(0, 0, 2, 2);
22158       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22159       MVT ShuffleVT = MVT::v4f32;
22160       if (Depth == 1 && Root->getOpcode() == Shuffle)
22161         return false; // Nothing to do!
22162       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22163       DCI.AddToWorklist(Op.getNode());
22164       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22165       DCI.AddToWorklist(Op.getNode());
22166       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22167                     /*AddTo*/ true);
22168       return true;
22169     }
22170     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22171       bool Lo = Mask.equals(0, 0, 1, 1);
22172       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22173       MVT ShuffleVT = MVT::v4f32;
22174       if (Depth == 1 && Root->getOpcode() == Shuffle)
22175         return false; // Nothing to do!
22176       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22177       DCI.AddToWorklist(Op.getNode());
22178       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22179       DCI.AddToWorklist(Op.getNode());
22180       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22181                     /*AddTo*/ true);
22182       return true;
22183     }
22184   }
22185
22186   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22187   // variants as none of these have single-instruction variants that are
22188   // superior to the UNPCK formulation.
22189   if (!FloatDomain &&
22190       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22191        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22192        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22193        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22194                    15))) {
22195     bool Lo = Mask[0] == 0;
22196     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22197     if (Depth == 1 && Root->getOpcode() == Shuffle)
22198       return false; // Nothing to do!
22199     MVT ShuffleVT;
22200     switch (Mask.size()) {
22201     case 8:
22202       ShuffleVT = MVT::v8i16;
22203       break;
22204     case 16:
22205       ShuffleVT = MVT::v16i8;
22206       break;
22207     default:
22208       llvm_unreachable("Impossible mask size!");
22209     };
22210     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22211     DCI.AddToWorklist(Op.getNode());
22212     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22213     DCI.AddToWorklist(Op.getNode());
22214     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22215                   /*AddTo*/ true);
22216     return true;
22217   }
22218
22219   // Don't try to re-form single instruction chains under any circumstances now
22220   // that we've done encoding canonicalization for them.
22221   if (Depth < 2)
22222     return false;
22223
22224   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22225   // can replace them with a single PSHUFB instruction profitably. Intel's
22226   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22227   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22228   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22229     SmallVector<SDValue, 16> PSHUFBMask;
22230     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22231     int Ratio = 16 / Mask.size();
22232     for (unsigned i = 0; i < 16; ++i) {
22233       if (Mask[i / Ratio] == SM_SentinelUndef) {
22234         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22235         continue;
22236       }
22237       int M = Mask[i / Ratio] != SM_SentinelZero
22238                   ? Ratio * Mask[i / Ratio] + i % Ratio
22239                   : 255;
22240       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22241     }
22242     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22243     DCI.AddToWorklist(Op.getNode());
22244     SDValue PSHUFBMaskOp =
22245         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22246     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22247     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22248     DCI.AddToWorklist(Op.getNode());
22249     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22250                   /*AddTo*/ true);
22251     return true;
22252   }
22253
22254   // Failed to find any combines.
22255   return false;
22256 }
22257
22258 /// \brief Fully generic combining of x86 shuffle instructions.
22259 ///
22260 /// This should be the last combine run over the x86 shuffle instructions. Once
22261 /// they have been fully optimized, this will recursively consider all chains
22262 /// of single-use shuffle instructions, build a generic model of the cumulative
22263 /// shuffle operation, and check for simpler instructions which implement this
22264 /// operation. We use this primarily for two purposes:
22265 ///
22266 /// 1) Collapse generic shuffles to specialized single instructions when
22267 ///    equivalent. In most cases, this is just an encoding size win, but
22268 ///    sometimes we will collapse multiple generic shuffles into a single
22269 ///    special-purpose shuffle.
22270 /// 2) Look for sequences of shuffle instructions with 3 or more total
22271 ///    instructions, and replace them with the slightly more expensive SSSE3
22272 ///    PSHUFB instruction if available. We do this as the last combining step
22273 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22274 ///    a suitable short sequence of other instructions. The PSHUFB will either
22275 ///    use a register or have to read from memory and so is slightly (but only
22276 ///    slightly) more expensive than the other shuffle instructions.
22277 ///
22278 /// Because this is inherently a quadratic operation (for each shuffle in
22279 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22280 /// This should never be an issue in practice as the shuffle lowering doesn't
22281 /// produce sequences of more than 8 instructions.
22282 ///
22283 /// FIXME: We will currently miss some cases where the redundant shuffling
22284 /// would simplify under the threshold for PSHUFB formation because of
22285 /// combine-ordering. To fix this, we should do the redundant instruction
22286 /// combining in this recursive walk.
22287 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22288                                           ArrayRef<int> RootMask,
22289                                           int Depth, bool HasPSHUFB,
22290                                           SelectionDAG &DAG,
22291                                           TargetLowering::DAGCombinerInfo &DCI,
22292                                           const X86Subtarget *Subtarget) {
22293   // Bound the depth of our recursive combine because this is ultimately
22294   // quadratic in nature.
22295   if (Depth > 8)
22296     return false;
22297
22298   // Directly rip through bitcasts to find the underlying operand.
22299   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22300     Op = Op.getOperand(0);
22301
22302   MVT VT = Op.getSimpleValueType();
22303   if (!VT.isVector())
22304     return false; // Bail if we hit a non-vector.
22305   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22306   // version should be added.
22307   if (VT.getSizeInBits() != 128)
22308     return false;
22309
22310   assert(Root.getSimpleValueType().isVector() &&
22311          "Shuffles operate on vector types!");
22312   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22313          "Can only combine shuffles of the same vector register size.");
22314
22315   if (!isTargetShuffle(Op.getOpcode()))
22316     return false;
22317   SmallVector<int, 16> OpMask;
22318   bool IsUnary;
22319   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22320   // We only can combine unary shuffles which we can decode the mask for.
22321   if (!HaveMask || !IsUnary)
22322     return false;
22323
22324   assert(VT.getVectorNumElements() == OpMask.size() &&
22325          "Different mask size from vector size!");
22326   assert(((RootMask.size() > OpMask.size() &&
22327            RootMask.size() % OpMask.size() == 0) ||
22328           (OpMask.size() > RootMask.size() &&
22329            OpMask.size() % RootMask.size() == 0) ||
22330           OpMask.size() == RootMask.size()) &&
22331          "The smaller number of elements must divide the larger.");
22332   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22333   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22334   assert(((RootRatio == 1 && OpRatio == 1) ||
22335           (RootRatio == 1) != (OpRatio == 1)) &&
22336          "Must not have a ratio for both incoming and op masks!");
22337
22338   SmallVector<int, 16> Mask;
22339   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22340
22341   // Merge this shuffle operation's mask into our accumulated mask. Note that
22342   // this shuffle's mask will be the first applied to the input, followed by the
22343   // root mask to get us all the way to the root value arrangement. The reason
22344   // for this order is that we are recursing up the operation chain.
22345   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22346     int RootIdx = i / RootRatio;
22347     if (RootMask[RootIdx] < 0) {
22348       // This is a zero or undef lane, we're done.
22349       Mask.push_back(RootMask[RootIdx]);
22350       continue;
22351     }
22352
22353     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22354     int OpIdx = RootMaskedIdx / OpRatio;
22355     if (OpMask[OpIdx] < 0) {
22356       // The incoming lanes are zero or undef, it doesn't matter which ones we
22357       // are using.
22358       Mask.push_back(OpMask[OpIdx]);
22359       continue;
22360     }
22361
22362     // Ok, we have non-zero lanes, map them through.
22363     Mask.push_back(OpMask[OpIdx] * OpRatio +
22364                    RootMaskedIdx % OpRatio);
22365   }
22366
22367   // See if we can recurse into the operand to combine more things.
22368   switch (Op.getOpcode()) {
22369     case X86ISD::PSHUFB:
22370       HasPSHUFB = true;
22371     case X86ISD::PSHUFD:
22372     case X86ISD::PSHUFHW:
22373     case X86ISD::PSHUFLW:
22374       if (Op.getOperand(0).hasOneUse() &&
22375           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22376                                         HasPSHUFB, DAG, DCI, Subtarget))
22377         return true;
22378       break;
22379
22380     case X86ISD::UNPCKL:
22381     case X86ISD::UNPCKH:
22382       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22383       // We can't check for single use, we have to check that this shuffle is the only user.
22384       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22385           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22386                                         HasPSHUFB, DAG, DCI, Subtarget))
22387           return true;
22388       break;
22389   }
22390
22391   // Minor canonicalization of the accumulated shuffle mask to make it easier
22392   // to match below. All this does is detect masks with squential pairs of
22393   // elements, and shrink them to the half-width mask. It does this in a loop
22394   // so it will reduce the size of the mask to the minimal width mask which
22395   // performs an equivalent shuffle.
22396   SmallVector<int, 16> WidenedMask;
22397   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22398     Mask = std::move(WidenedMask);
22399     WidenedMask.clear();
22400   }
22401
22402   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22403                                 Subtarget);
22404 }
22405
22406 /// \brief Get the PSHUF-style mask from PSHUF node.
22407 ///
22408 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22409 /// PSHUF-style masks that can be reused with such instructions.
22410 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22411   SmallVector<int, 4> Mask;
22412   bool IsUnary;
22413   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22414   (void)HaveMask;
22415   assert(HaveMask);
22416
22417   switch (N.getOpcode()) {
22418   case X86ISD::PSHUFD:
22419     return Mask;
22420   case X86ISD::PSHUFLW:
22421     Mask.resize(4);
22422     return Mask;
22423   case X86ISD::PSHUFHW:
22424     Mask.erase(Mask.begin(), Mask.begin() + 4);
22425     for (int &M : Mask)
22426       M -= 4;
22427     return Mask;
22428   default:
22429     llvm_unreachable("No valid shuffle instruction found!");
22430   }
22431 }
22432
22433 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22434 ///
22435 /// We walk up the chain and look for a combinable shuffle, skipping over
22436 /// shuffles that we could hoist this shuffle's transformation past without
22437 /// altering anything.
22438 static SDValue
22439 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22440                              SelectionDAG &DAG,
22441                              TargetLowering::DAGCombinerInfo &DCI) {
22442   assert(N.getOpcode() == X86ISD::PSHUFD &&
22443          "Called with something other than an x86 128-bit half shuffle!");
22444   SDLoc DL(N);
22445
22446   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22447   // of the shuffles in the chain so that we can form a fresh chain to replace
22448   // this one.
22449   SmallVector<SDValue, 8> Chain;
22450   SDValue V = N.getOperand(0);
22451   for (; V.hasOneUse(); V = V.getOperand(0)) {
22452     switch (V.getOpcode()) {
22453     default:
22454       return SDValue(); // Nothing combined!
22455
22456     case ISD::BITCAST:
22457       // Skip bitcasts as we always know the type for the target specific
22458       // instructions.
22459       continue;
22460
22461     case X86ISD::PSHUFD:
22462       // Found another dword shuffle.
22463       break;
22464
22465     case X86ISD::PSHUFLW:
22466       // Check that the low words (being shuffled) are the identity in the
22467       // dword shuffle, and the high words are self-contained.
22468       if (Mask[0] != 0 || Mask[1] != 1 ||
22469           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22470         return SDValue();
22471
22472       Chain.push_back(V);
22473       continue;
22474
22475     case X86ISD::PSHUFHW:
22476       // Check that the high words (being shuffled) are the identity in the
22477       // dword shuffle, and the low words are self-contained.
22478       if (Mask[2] != 2 || Mask[3] != 3 ||
22479           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22480         return SDValue();
22481
22482       Chain.push_back(V);
22483       continue;
22484
22485     case X86ISD::UNPCKL:
22486     case X86ISD::UNPCKH:
22487       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22488       // shuffle into a preceding word shuffle.
22489       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22490         return SDValue();
22491
22492       // Search for a half-shuffle which we can combine with.
22493       unsigned CombineOp =
22494           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22495       if (V.getOperand(0) != V.getOperand(1) ||
22496           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22497         return SDValue();
22498       Chain.push_back(V);
22499       V = V.getOperand(0);
22500       do {
22501         switch (V.getOpcode()) {
22502         default:
22503           return SDValue(); // Nothing to combine.
22504
22505         case X86ISD::PSHUFLW:
22506         case X86ISD::PSHUFHW:
22507           if (V.getOpcode() == CombineOp)
22508             break;
22509
22510           Chain.push_back(V);
22511
22512           // Fallthrough!
22513         case ISD::BITCAST:
22514           V = V.getOperand(0);
22515           continue;
22516         }
22517         break;
22518       } while (V.hasOneUse());
22519       break;
22520     }
22521     // Break out of the loop if we break out of the switch.
22522     break;
22523   }
22524
22525   if (!V.hasOneUse())
22526     // We fell out of the loop without finding a viable combining instruction.
22527     return SDValue();
22528
22529   // Merge this node's mask and our incoming mask.
22530   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22531   for (int &M : Mask)
22532     M = VMask[M];
22533   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22534                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22535
22536   // Rebuild the chain around this new shuffle.
22537   while (!Chain.empty()) {
22538     SDValue W = Chain.pop_back_val();
22539
22540     if (V.getValueType() != W.getOperand(0).getValueType())
22541       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22542
22543     switch (W.getOpcode()) {
22544     default:
22545       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22546
22547     case X86ISD::UNPCKL:
22548     case X86ISD::UNPCKH:
22549       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22550       break;
22551
22552     case X86ISD::PSHUFD:
22553     case X86ISD::PSHUFLW:
22554     case X86ISD::PSHUFHW:
22555       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22556       break;
22557     }
22558   }
22559   if (V.getValueType() != N.getValueType())
22560     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22561
22562   // Return the new chain to replace N.
22563   return V;
22564 }
22565
22566 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22567 ///
22568 /// We walk up the chain, skipping shuffles of the other half and looking
22569 /// through shuffles which switch halves trying to find a shuffle of the same
22570 /// pair of dwords.
22571 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22572                                         SelectionDAG &DAG,
22573                                         TargetLowering::DAGCombinerInfo &DCI) {
22574   assert(
22575       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22576       "Called with something other than an x86 128-bit half shuffle!");
22577   SDLoc DL(N);
22578   unsigned CombineOpcode = N.getOpcode();
22579
22580   // Walk up a single-use chain looking for a combinable shuffle.
22581   SDValue V = N.getOperand(0);
22582   for (; V.hasOneUse(); V = V.getOperand(0)) {
22583     switch (V.getOpcode()) {
22584     default:
22585       return false; // Nothing combined!
22586
22587     case ISD::BITCAST:
22588       // Skip bitcasts as we always know the type for the target specific
22589       // instructions.
22590       continue;
22591
22592     case X86ISD::PSHUFLW:
22593     case X86ISD::PSHUFHW:
22594       if (V.getOpcode() == CombineOpcode)
22595         break;
22596
22597       // Other-half shuffles are no-ops.
22598       continue;
22599     }
22600     // Break out of the loop if we break out of the switch.
22601     break;
22602   }
22603
22604   if (!V.hasOneUse())
22605     // We fell out of the loop without finding a viable combining instruction.
22606     return false;
22607
22608   // Combine away the bottom node as its shuffle will be accumulated into
22609   // a preceding shuffle.
22610   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22611
22612   // Record the old value.
22613   SDValue Old = V;
22614
22615   // Merge this node's mask and our incoming mask (adjusted to account for all
22616   // the pshufd instructions encountered).
22617   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22618   for (int &M : Mask)
22619     M = VMask[M];
22620   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22621                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22622
22623   // Check that the shuffles didn't cancel each other out. If not, we need to
22624   // combine to the new one.
22625   if (Old != V)
22626     // Replace the combinable shuffle with the combined one, updating all users
22627     // so that we re-evaluate the chain here.
22628     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22629
22630   return true;
22631 }
22632
22633 /// \brief Try to combine x86 target specific shuffles.
22634 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22635                                            TargetLowering::DAGCombinerInfo &DCI,
22636                                            const X86Subtarget *Subtarget) {
22637   SDLoc DL(N);
22638   MVT VT = N.getSimpleValueType();
22639   SmallVector<int, 4> Mask;
22640
22641   switch (N.getOpcode()) {
22642   case X86ISD::PSHUFD:
22643   case X86ISD::PSHUFLW:
22644   case X86ISD::PSHUFHW:
22645     Mask = getPSHUFShuffleMask(N);
22646     assert(Mask.size() == 4);
22647     break;
22648   default:
22649     return SDValue();
22650   }
22651
22652   // Nuke no-op shuffles that show up after combining.
22653   if (isNoopShuffleMask(Mask))
22654     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22655
22656   // Look for simplifications involving one or two shuffle instructions.
22657   SDValue V = N.getOperand(0);
22658   switch (N.getOpcode()) {
22659   default:
22660     break;
22661   case X86ISD::PSHUFLW:
22662   case X86ISD::PSHUFHW:
22663     assert(VT == MVT::v8i16);
22664     (void)VT;
22665
22666     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22667       return SDValue(); // We combined away this shuffle, so we're done.
22668
22669     // See if this reduces to a PSHUFD which is no more expensive and can
22670     // combine with more operations. Note that it has to at least flip the
22671     // dwords as otherwise it would have been removed as a no-op.
22672     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22673       int DMask[] = {0, 1, 2, 3};
22674       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22675       DMask[DOffset + 0] = DOffset + 1;
22676       DMask[DOffset + 1] = DOffset + 0;
22677       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22678       DCI.AddToWorklist(V.getNode());
22679       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22680                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22681       DCI.AddToWorklist(V.getNode());
22682       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22683     }
22684
22685     // Look for shuffle patterns which can be implemented as a single unpack.
22686     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22687     // only works when we have a PSHUFD followed by two half-shuffles.
22688     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22689         (V.getOpcode() == X86ISD::PSHUFLW ||
22690          V.getOpcode() == X86ISD::PSHUFHW) &&
22691         V.getOpcode() != N.getOpcode() &&
22692         V.hasOneUse()) {
22693       SDValue D = V.getOperand(0);
22694       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22695         D = D.getOperand(0);
22696       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22697         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22698         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22699         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22700         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22701         int WordMask[8];
22702         for (int i = 0; i < 4; ++i) {
22703           WordMask[i + NOffset] = Mask[i] + NOffset;
22704           WordMask[i + VOffset] = VMask[i] + VOffset;
22705         }
22706         // Map the word mask through the DWord mask.
22707         int MappedMask[8];
22708         for (int i = 0; i < 8; ++i)
22709           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22710         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22711         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22712         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22713                        std::begin(UnpackLoMask)) ||
22714             std::equal(std::begin(MappedMask), std::end(MappedMask),
22715                        std::begin(UnpackHiMask))) {
22716           // We can replace all three shuffles with an unpack.
22717           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22718           DCI.AddToWorklist(V.getNode());
22719           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22720                                                 : X86ISD::UNPCKH,
22721                              DL, MVT::v8i16, V, V);
22722         }
22723       }
22724     }
22725
22726     break;
22727
22728   case X86ISD::PSHUFD:
22729     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22730       return NewN;
22731
22732     break;
22733   }
22734
22735   return SDValue();
22736 }
22737
22738 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22739 ///
22740 /// We combine this directly on the abstract vector shuffle nodes so it is
22741 /// easier to generically match. We also insert dummy vector shuffle nodes for
22742 /// the operands which explicitly discard the lanes which are unused by this
22743 /// operation to try to flow through the rest of the combiner the fact that
22744 /// they're unused.
22745 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22746   SDLoc DL(N);
22747   EVT VT = N->getValueType(0);
22748
22749   // We only handle target-independent shuffles.
22750   // FIXME: It would be easy and harmless to use the target shuffle mask
22751   // extraction tool to support more.
22752   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22753     return SDValue();
22754
22755   auto *SVN = cast<ShuffleVectorSDNode>(N);
22756   ArrayRef<int> Mask = SVN->getMask();
22757   SDValue V1 = N->getOperand(0);
22758   SDValue V2 = N->getOperand(1);
22759
22760   // We require the first shuffle operand to be the SUB node, and the second to
22761   // be the ADD node.
22762   // FIXME: We should support the commuted patterns.
22763   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22764     return SDValue();
22765
22766   // If there are other uses of these operations we can't fold them.
22767   if (!V1->hasOneUse() || !V2->hasOneUse())
22768     return SDValue();
22769
22770   // Ensure that both operations have the same operands. Note that we can
22771   // commute the FADD operands.
22772   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22773   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22774       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22775     return SDValue();
22776
22777   // We're looking for blends between FADD and FSUB nodes. We insist on these
22778   // nodes being lined up in a specific expected pattern.
22779   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22780         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22781         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22782     return SDValue();
22783
22784   // Only specific types are legal at this point, assert so we notice if and
22785   // when these change.
22786   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22787           VT == MVT::v4f64) &&
22788          "Unknown vector type encountered!");
22789
22790   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22791 }
22792
22793 /// PerformShuffleCombine - Performs several different shuffle combines.
22794 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22795                                      TargetLowering::DAGCombinerInfo &DCI,
22796                                      const X86Subtarget *Subtarget) {
22797   SDLoc dl(N);
22798   SDValue N0 = N->getOperand(0);
22799   SDValue N1 = N->getOperand(1);
22800   EVT VT = N->getValueType(0);
22801
22802   // Don't create instructions with illegal types after legalize types has run.
22803   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22804   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22805     return SDValue();
22806
22807   // If we have legalized the vector types, look for blends of FADD and FSUB
22808   // nodes that we can fuse into an ADDSUB node.
22809   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22810     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22811       return AddSub;
22812
22813   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22814   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22815       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22816     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22817
22818   // During Type Legalization, when promoting illegal vector types,
22819   // the backend might introduce new shuffle dag nodes and bitcasts.
22820   //
22821   // This code performs the following transformation:
22822   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22823   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22824   //
22825   // We do this only if both the bitcast and the BINOP dag nodes have
22826   // one use. Also, perform this transformation only if the new binary
22827   // operation is legal. This is to avoid introducing dag nodes that
22828   // potentially need to be further expanded (or custom lowered) into a
22829   // less optimal sequence of dag nodes.
22830   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22831       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22832       N0.getOpcode() == ISD::BITCAST) {
22833     SDValue BC0 = N0.getOperand(0);
22834     EVT SVT = BC0.getValueType();
22835     unsigned Opcode = BC0.getOpcode();
22836     unsigned NumElts = VT.getVectorNumElements();
22837
22838     if (BC0.hasOneUse() && SVT.isVector() &&
22839         SVT.getVectorNumElements() * 2 == NumElts &&
22840         TLI.isOperationLegal(Opcode, VT)) {
22841       bool CanFold = false;
22842       switch (Opcode) {
22843       default : break;
22844       case ISD::ADD :
22845       case ISD::FADD :
22846       case ISD::SUB :
22847       case ISD::FSUB :
22848       case ISD::MUL :
22849       case ISD::FMUL :
22850         CanFold = true;
22851       }
22852
22853       unsigned SVTNumElts = SVT.getVectorNumElements();
22854       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22855       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22856         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22857       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22858         CanFold = SVOp->getMaskElt(i) < 0;
22859
22860       if (CanFold) {
22861         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22862         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22863         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22864         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22865       }
22866     }
22867   }
22868
22869   // Only handle 128 wide vector from here on.
22870   if (!VT.is128BitVector())
22871     return SDValue();
22872
22873   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22874   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22875   // consecutive, non-overlapping, and in the right order.
22876   SmallVector<SDValue, 16> Elts;
22877   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22878     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22879
22880   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22881   if (LD.getNode())
22882     return LD;
22883
22884   if (isTargetShuffle(N->getOpcode())) {
22885     SDValue Shuffle =
22886         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22887     if (Shuffle.getNode())
22888       return Shuffle;
22889
22890     // Try recursively combining arbitrary sequences of x86 shuffle
22891     // instructions into higher-order shuffles. We do this after combining
22892     // specific PSHUF instruction sequences into their minimal form so that we
22893     // can evaluate how many specialized shuffle instructions are involved in
22894     // a particular chain.
22895     SmallVector<int, 1> NonceMask; // Just a placeholder.
22896     NonceMask.push_back(0);
22897     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22898                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22899                                       DCI, Subtarget))
22900       return SDValue(); // This routine will use CombineTo to replace N.
22901   }
22902
22903   return SDValue();
22904 }
22905
/// PerformTruncateCombine - Placeholder hook for combining truncate
/// operations.
///
/// This was documented as converting a truncate of a 256-bit vector to a
/// 128-bit vector into a sequence of vector shuffle operations. The current
/// body performs no transformation: it ignores all of its arguments and
/// always returns an empty SDValue.
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget)  {
  return SDValue();
}
22914
22915 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22916 /// specific shuffle of a load can be folded into a single element load.
22917 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22918 /// shuffles have been custom lowered so we need to handle those here.
22919 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22920                                          TargetLowering::DAGCombinerInfo &DCI) {
22921   if (DCI.isBeforeLegalizeOps())
22922     return SDValue();
22923
22924   SDValue InVec = N->getOperand(0);
22925   SDValue EltNo = N->getOperand(1);
22926
22927   if (!isa<ConstantSDNode>(EltNo))
22928     return SDValue();
22929
22930   EVT OriginalVT = InVec.getValueType();
22931
22932   if (InVec.getOpcode() == ISD::BITCAST) {
22933     // Don't duplicate a load with other uses.
22934     if (!InVec.hasOneUse())
22935       return SDValue();
22936     EVT BCVT = InVec.getOperand(0).getValueType();
22937     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22938       return SDValue();
22939     InVec = InVec.getOperand(0);
22940   }
22941
22942   EVT CurrentVT = InVec.getValueType();
22943
22944   if (!isTargetShuffle(InVec.getOpcode()))
22945     return SDValue();
22946
22947   // Don't duplicate a load with other uses.
22948   if (!InVec.hasOneUse())
22949     return SDValue();
22950
22951   SmallVector<int, 16> ShuffleMask;
22952   bool UnaryShuffle;
22953   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22954                             ShuffleMask, UnaryShuffle))
22955     return SDValue();
22956
22957   // Select the input vector, guarding against out of range extract vector.
22958   unsigned NumElems = CurrentVT.getVectorNumElements();
22959   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22960   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22961   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22962                                          : InVec.getOperand(1);
22963
22964   // If inputs to shuffle are the same for both ops, then allow 2 uses
22965   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22966                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22967
22968   if (LdNode.getOpcode() == ISD::BITCAST) {
22969     // Don't duplicate a load with other uses.
22970     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22971       return SDValue();
22972
22973     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22974     LdNode = LdNode.getOperand(0);
22975   }
22976
22977   if (!ISD::isNormalLoad(LdNode.getNode()))
22978     return SDValue();
22979
22980   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22981
22982   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22983     return SDValue();
22984
22985   EVT EltVT = N->getValueType(0);
22986   // If there's a bitcast before the shuffle, check if the load type and
22987   // alignment is valid.
22988   unsigned Align = LN0->getAlignment();
22989   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22990   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22991       EltVT.getTypeForEVT(*DAG.getContext()));
22992
22993   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22994     return SDValue();
22995
22996   // All checks match so transform back to vector_shuffle so that DAG combiner
22997   // can finish the job
22998   SDLoc dl(N);
22999
23000   // Create shuffle node taking into account the case that its a unary shuffle
23001   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
23002                                    : InVec.getOperand(1);
23003   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
23004                                  InVec.getOperand(0), Shuffle,
23005                                  &ShuffleMask[0]);
23006   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
23007   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
23008                      EltNo);
23009 }
23010
23011 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
23012 /// special and don't usually play with other vector types, it's better to
23013 /// handle them early to be sure we emit efficient code by avoiding
23014 /// store-load conversions.
23015 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
23016   if (N->getValueType(0) != MVT::x86mmx ||
23017       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
23018       N->getOperand(0)->getValueType(0) != MVT::v2i32)
23019     return SDValue();
23020
23021   SDValue V = N->getOperand(0);
23022   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23023   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23024     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23025                        N->getValueType(0), V.getOperand(0));
23026
23027   return SDValue();
23028 }
23029
23030 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23031 /// generation and convert it from being a bunch of shuffles and extracts
23032 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23033 /// storing the value and loading scalars back, while for x64 we should
23034 /// use 64-bit extracts and shifts.
23035 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23036                                          TargetLowering::DAGCombinerInfo &DCI) {
23037   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23038   if (NewOp.getNode())
23039     return NewOp;
23040
23041   SDValue InputVector = N->getOperand(0);
23042
23043   // Detect mmx to i32 conversion through a v2i32 elt extract.
23044   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23045       N->getValueType(0) == MVT::i32 &&
23046       InputVector.getValueType() == MVT::v2i32) {
23047
23048     // The bitcast source is a direct mmx result.
23049     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23050     if (MMXSrc.getValueType() == MVT::x86mmx)
23051       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23052                          N->getValueType(0),
23053                          InputVector.getNode()->getOperand(0));
23054
23055     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23056     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23057     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23058         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23059         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23060         MMXSrcOp.getValueType() == MVT::v1i64 &&
23061         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23062       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23063                          N->getValueType(0),
23064                          MMXSrcOp.getOperand(0));
23065   }
23066
23067   // Only operate on vectors of 4 elements, where the alternative shuffling
23068   // gets to be more expensive.
23069   if (InputVector.getValueType() != MVT::v4i32)
23070     return SDValue();
23071
23072   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23073   // single use which is a sign-extend or zero-extend, and all elements are
23074   // used.
23075   SmallVector<SDNode *, 4> Uses;
23076   unsigned ExtractedElements = 0;
23077   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23078        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23079     if (UI.getUse().getResNo() != InputVector.getResNo())
23080       return SDValue();
23081
23082     SDNode *Extract = *UI;
23083     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23084       return SDValue();
23085
23086     if (Extract->getValueType(0) != MVT::i32)
23087       return SDValue();
23088     if (!Extract->hasOneUse())
23089       return SDValue();
23090     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23091         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23092       return SDValue();
23093     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23094       return SDValue();
23095
23096     // Record which element was extracted.
23097     ExtractedElements |=
23098       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23099
23100     Uses.push_back(Extract);
23101   }
23102
23103   // If not all the elements were used, this may not be worthwhile.
23104   if (ExtractedElements != 15)
23105     return SDValue();
23106
23107   // Ok, we've now decided to do the transformation.
23108   // If 64-bit shifts are legal, use the extract-shift sequence,
23109   // otherwise bounce the vector off the cache.
23110   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23111   SDValue Vals[4];
23112   SDLoc dl(InputVector);
23113
23114   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23115     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23116     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23117     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23118       DAG.getConstant(0, VecIdxTy));
23119     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23120       DAG.getConstant(1, VecIdxTy));
23121
23122     SDValue ShAmt = DAG.getConstant(32,
23123       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23124     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23125     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23126       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23127     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23128     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23129       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23130   } else {
23131     // Store the value to a temporary stack slot.
23132     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23133     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23134       MachinePointerInfo(), false, false, 0);
23135
23136     EVT ElementType = InputVector.getValueType().getVectorElementType();
23137     unsigned EltSize = ElementType.getSizeInBits() / 8;
23138
23139     // Replace each use (extract) with a load of the appropriate element.
23140     for (unsigned i = 0; i < 4; ++i) {
23141       uint64_t Offset = EltSize * i;
23142       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23143
23144       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23145                                        StackPtr, OffsetVal);
23146
23147       // Load the scalar.
23148       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23149                             ScalarAddr, MachinePointerInfo(),
23150                             false, false, false, 0);
23151
23152     }
23153   }
23154
23155   // Replace the extracts
23156   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23157     UE = Uses.end(); UI != UE; ++UI) {
23158     SDNode *Extract = *UI;
23159
23160     SDValue Idx = Extract->getOperand(1);
23161     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23162     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23163   }
23164
23165   // The replacement was made in place; don't return anything.
23166   return SDValue();
23167 }
23168
23169 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23170 static std::pair<unsigned, bool>
23171 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23172                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23173   if (!VT.isVector())
23174     return std::make_pair(0, false);
23175
23176   bool NeedSplit = false;
23177   switch (VT.getSimpleVT().SimpleTy) {
23178   default: return std::make_pair(0, false);
23179   case MVT::v4i64:
23180   case MVT::v2i64:
23181     if (!Subtarget->hasVLX())
23182       return std::make_pair(0, false);
23183     break;
23184   case MVT::v64i8:
23185   case MVT::v32i16:
23186     if (!Subtarget->hasBWI())
23187       return std::make_pair(0, false);
23188     break;
23189   case MVT::v16i32:
23190   case MVT::v8i64:
23191     if (!Subtarget->hasAVX512())
23192       return std::make_pair(0, false);
23193     break;
23194   case MVT::v32i8:
23195   case MVT::v16i16:
23196   case MVT::v8i32:
23197     if (!Subtarget->hasAVX2())
23198       NeedSplit = true;
23199     if (!Subtarget->hasAVX())
23200       return std::make_pair(0, false);
23201     break;
23202   case MVT::v16i8:
23203   case MVT::v8i16:
23204   case MVT::v4i32:
23205     if (!Subtarget->hasSSE2())
23206       return std::make_pair(0, false);
23207   }
23208
23209   // SSE2 has only a small subset of the operations.
23210   bool hasUnsigned = Subtarget->hasSSE41() ||
23211                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23212   bool hasSigned = Subtarget->hasSSE41() ||
23213                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23214
23215   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23216
23217   unsigned Opc = 0;
23218   // Check for x CC y ? x : y.
23219   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23220       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23221     switch (CC) {
23222     default: break;
23223     case ISD::SETULT:
23224     case ISD::SETULE:
23225       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23226     case ISD::SETUGT:
23227     case ISD::SETUGE:
23228       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23229     case ISD::SETLT:
23230     case ISD::SETLE:
23231       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23232     case ISD::SETGT:
23233     case ISD::SETGE:
23234       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23235     }
23236   // Check for x CC y ? y : x -- a min/max with reversed arms.
23237   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23238              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23239     switch (CC) {
23240     default: break;
23241     case ISD::SETULT:
23242     case ISD::SETULE:
23243       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23244     case ISD::SETUGT:
23245     case ISD::SETUGE:
23246       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23247     case ISD::SETLT:
23248     case ISD::SETLE:
23249       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23250     case ISD::SETGT:
23251     case ISD::SETGE:
23252       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23253     }
23254   }
23255
23256   return std::make_pair(Opc, NeedSplit);
23257 }
23258
23259 static SDValue
23260 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23261                                       const X86Subtarget *Subtarget) {
23262   SDLoc dl(N);
23263   SDValue Cond = N->getOperand(0);
23264   SDValue LHS = N->getOperand(1);
23265   SDValue RHS = N->getOperand(2);
23266
23267   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23268     SDValue CondSrc = Cond->getOperand(0);
23269     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23270       Cond = CondSrc->getOperand(0);
23271   }
23272
23273   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23274     return SDValue();
23275
23276   // A vselect where all conditions and data are constants can be optimized into
23277   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23278   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23279       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23280     return SDValue();
23281
23282   unsigned MaskValue = 0;
23283   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23284     return SDValue();
23285
23286   MVT VT = N->getSimpleValueType(0);
23287   unsigned NumElems = VT.getVectorNumElements();
23288   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23289   for (unsigned i = 0; i < NumElems; ++i) {
23290     // Be sure we emit undef where we can.
23291     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23292       ShuffleMask[i] = -1;
23293     else
23294       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23295   }
23296
23297   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23298   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23299     return SDValue();
23300   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23301 }
23302
23303 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23304 /// nodes.
23305 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23306                                     TargetLowering::DAGCombinerInfo &DCI,
23307                                     const X86Subtarget *Subtarget) {
23308   SDLoc DL(N);
23309   SDValue Cond = N->getOperand(0);
23310   // Get the LHS/RHS of the select.
23311   SDValue LHS = N->getOperand(1);
23312   SDValue RHS = N->getOperand(2);
23313   EVT VT = LHS.getValueType();
23314   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23315
23316   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23317   // instructions match the semantics of the common C idiom x<y?x:y but not
23318   // x<=y?x:y, because of how they handle negative zero (which can be
23319   // ignored in unsafe-math mode).
23320   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23321   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23322       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23323       (Subtarget->hasSSE2() ||
23324        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23325     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23326
23327     unsigned Opcode = 0;
23328     // Check for x CC y ? x : y.
23329     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23330         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23331       switch (CC) {
23332       default: break;
23333       case ISD::SETULT:
23334         // Converting this to a min would handle NaNs incorrectly, and swapping
23335         // the operands would cause it to handle comparisons between positive
23336         // and negative zero incorrectly.
23337         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23338           if (!DAG.getTarget().Options.UnsafeFPMath &&
23339               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23340             break;
23341           std::swap(LHS, RHS);
23342         }
23343         Opcode = X86ISD::FMIN;
23344         break;
23345       case ISD::SETOLE:
23346         // Converting this to a min would handle comparisons between positive
23347         // and negative zero incorrectly.
23348         if (!DAG.getTarget().Options.UnsafeFPMath &&
23349             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23350           break;
23351         Opcode = X86ISD::FMIN;
23352         break;
23353       case ISD::SETULE:
23354         // Converting this to a min would handle both negative zeros and NaNs
23355         // incorrectly, but we can swap the operands to fix both.
23356         std::swap(LHS, RHS);
23357       case ISD::SETOLT:
23358       case ISD::SETLT:
23359       case ISD::SETLE:
23360         Opcode = X86ISD::FMIN;
23361         break;
23362
23363       case ISD::SETOGE:
23364         // Converting this to a max would handle comparisons between positive
23365         // and negative zero incorrectly.
23366         if (!DAG.getTarget().Options.UnsafeFPMath &&
23367             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23368           break;
23369         Opcode = X86ISD::FMAX;
23370         break;
23371       case ISD::SETUGT:
23372         // Converting this to a max would handle NaNs incorrectly, and swapping
23373         // the operands would cause it to handle comparisons between positive
23374         // and negative zero incorrectly.
23375         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23376           if (!DAG.getTarget().Options.UnsafeFPMath &&
23377               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23378             break;
23379           std::swap(LHS, RHS);
23380         }
23381         Opcode = X86ISD::FMAX;
23382         break;
23383       case ISD::SETUGE:
23384         // Converting this to a max would handle both negative zeros and NaNs
23385         // incorrectly, but we can swap the operands to fix both.
23386         std::swap(LHS, RHS);
23387       case ISD::SETOGT:
23388       case ISD::SETGT:
23389       case ISD::SETGE:
23390         Opcode = X86ISD::FMAX;
23391         break;
23392       }
23393     // Check for x CC y ? y : x -- a min/max with reversed arms.
23394     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23395                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23396       switch (CC) {
23397       default: break;
23398       case ISD::SETOGE:
23399         // Converting this to a min would handle comparisons between positive
23400         // and negative zero incorrectly, and swapping the operands would
23401         // cause it to handle NaNs incorrectly.
23402         if (!DAG.getTarget().Options.UnsafeFPMath &&
23403             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23404           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23405             break;
23406           std::swap(LHS, RHS);
23407         }
23408         Opcode = X86ISD::FMIN;
23409         break;
23410       case ISD::SETUGT:
23411         // Converting this to a min would handle NaNs incorrectly.
23412         if (!DAG.getTarget().Options.UnsafeFPMath &&
23413             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23414           break;
23415         Opcode = X86ISD::FMIN;
23416         break;
23417       case ISD::SETUGE:
23418         // Converting this to a min would handle both negative zeros and NaNs
23419         // incorrectly, but we can swap the operands to fix both.
23420         std::swap(LHS, RHS);
23421       case ISD::SETOGT:
23422       case ISD::SETGT:
23423       case ISD::SETGE:
23424         Opcode = X86ISD::FMIN;
23425         break;
23426
23427       case ISD::SETULT:
23428         // Converting this to a max would handle NaNs incorrectly.
23429         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23430           break;
23431         Opcode = X86ISD::FMAX;
23432         break;
23433       case ISD::SETOLE:
23434         // Converting this to a max would handle comparisons between positive
23435         // and negative zero incorrectly, and swapping the operands would
23436         // cause it to handle NaNs incorrectly.
23437         if (!DAG.getTarget().Options.UnsafeFPMath &&
23438             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23439           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23440             break;
23441           std::swap(LHS, RHS);
23442         }
23443         Opcode = X86ISD::FMAX;
23444         break;
23445       case ISD::SETULE:
23446         // Converting this to a max would handle both negative zeros and NaNs
23447         // incorrectly, but we can swap the operands to fix both.
23448         std::swap(LHS, RHS);
23449       case ISD::SETOLT:
23450       case ISD::SETLT:
23451       case ISD::SETLE:
23452         Opcode = X86ISD::FMAX;
23453         break;
23454       }
23455     }
23456
23457     if (Opcode)
23458       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23459   }
23460
23461   EVT CondVT = Cond.getValueType();
23462   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23463       CondVT.getVectorElementType() == MVT::i1) {
23464     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23465     // lowering on KNL. In this case we convert it to
23466     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23467     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23468     // Since SKX these selects have a proper lowering.
23469     EVT OpVT = LHS.getValueType();
23470     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23471         (OpVT.getVectorElementType() == MVT::i8 ||
23472          OpVT.getVectorElementType() == MVT::i16) &&
23473         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23474       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23475       DCI.AddToWorklist(Cond.getNode());
23476       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23477     }
23478   }
23479   // If this is a select between two integer constants, try to do some
23480   // optimizations.
23481   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23482     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23483       // Don't do this for crazy integer types.
23484       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23485         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23486         // so that TrueC (the true value) is larger than FalseC.
23487         bool NeedsCondInvert = false;
23488
23489         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23490             // Efficiently invertible.
23491             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23492              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23493               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23494           NeedsCondInvert = true;
23495           std::swap(TrueC, FalseC);
23496         }
23497
23498         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23499         if (FalseC->getAPIntValue() == 0 &&
23500             TrueC->getAPIntValue().isPowerOf2()) {
23501           if (NeedsCondInvert) // Invert the condition if needed.
23502             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23503                                DAG.getConstant(1, Cond.getValueType()));
23504
23505           // Zero extend the condition if needed.
23506           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23507
23508           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23509           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23510                              DAG.getConstant(ShAmt, MVT::i8));
23511         }
23512
23513         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23514         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23515           if (NeedsCondInvert) // Invert the condition if needed.
23516             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23517                                DAG.getConstant(1, Cond.getValueType()));
23518
23519           // Zero extend the condition if needed.
23520           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23521                              FalseC->getValueType(0), Cond);
23522           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23523                              SDValue(FalseC, 0));
23524         }
23525
23526         // Optimize cases that will turn into an LEA instruction.  This requires
23527         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23528         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23529           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23530           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23531
23532           bool isFastMultiplier = false;
23533           if (Diff < 10) {
23534             switch ((unsigned char)Diff) {
23535               default: break;
23536               case 1:  // result = add base, cond
23537               case 2:  // result = lea base(    , cond*2)
23538               case 3:  // result = lea base(cond, cond*2)
23539               case 4:  // result = lea base(    , cond*4)
23540               case 5:  // result = lea base(cond, cond*4)
23541               case 8:  // result = lea base(    , cond*8)
23542               case 9:  // result = lea base(cond, cond*8)
23543                 isFastMultiplier = true;
23544                 break;
23545             }
23546           }
23547
23548           if (isFastMultiplier) {
23549             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23550             if (NeedsCondInvert) // Invert the condition if needed.
23551               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23552                                  DAG.getConstant(1, Cond.getValueType()));
23553
23554             // Zero extend the condition if needed.
23555             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23556                                Cond);
23557             // Scale the condition by the difference.
23558             if (Diff != 1)
23559               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23560                                  DAG.getConstant(Diff, Cond.getValueType()));
23561
23562             // Add the base if non-zero.
23563             if (FalseC->getAPIntValue() != 0)
23564               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23565                                  SDValue(FalseC, 0));
23566             return Cond;
23567           }
23568         }
23569       }
23570   }
23571
23572   // Canonicalize max and min:
23573   // (x > y) ? x : y -> (x >= y) ? x : y
23574   // (x < y) ? x : y -> (x <= y) ? x : y
23575   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23576   // the need for an extra compare
23577   // against zero. e.g.
23578   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23579   // subl   %esi, %edi
23580   // testl  %edi, %edi
23581   // movl   $0, %eax
23582   // cmovgl %edi, %eax
23583   // =>
23584   // xorl   %eax, %eax
23585   // subl   %esi, $edi
23586   // cmovsl %eax, %edi
23587   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23588       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23589       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23590     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23591     switch (CC) {
23592     default: break;
23593     case ISD::SETLT:
23594     case ISD::SETGT: {
23595       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23596       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23597                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23598       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23599     }
23600     }
23601   }
23602
23603   // Early exit check
23604   if (!TLI.isTypeLegal(VT))
23605     return SDValue();
23606
23607   // Match VSELECTs into subs with unsigned saturation.
23608   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23609       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23610       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23611        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23612     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23613
23614     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23615     // left side invert the predicate to simplify logic below.
23616     SDValue Other;
23617     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23618       Other = RHS;
23619       CC = ISD::getSetCCInverse(CC, true);
23620     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23621       Other = LHS;
23622     }
23623
23624     if (Other.getNode() && Other->getNumOperands() == 2 &&
23625         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23626       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23627       SDValue CondRHS = Cond->getOperand(1);
23628
23629       // Look for a general sub with unsigned saturation first.
23630       // x >= y ? x-y : 0 --> subus x, y
23631       // x >  y ? x-y : 0 --> subus x, y
23632       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23633           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23634         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23635
23636       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23637         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23638           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23639             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23640               // If the RHS is a constant we have to reverse the const
23641               // canonicalization.
23642               // x > C-1 ? x+-C : 0 --> subus x, C
23643               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23644                   CondRHSConst->getAPIntValue() ==
23645                       (-OpRHSConst->getAPIntValue() - 1))
23646                 return DAG.getNode(
23647                     X86ISD::SUBUS, DL, VT, OpLHS,
23648                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23649
23650           // Another special case: If C was a sign bit, the sub has been
23651           // canonicalized into a xor.
23652           // FIXME: Would it be better to use computeKnownBits to determine
23653           //        whether it's safe to decanonicalize the xor?
23654           // x s< 0 ? x^C : 0 --> subus x, C
23655           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23656               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23657               OpRHSConst->getAPIntValue().isSignBit())
23658             // Note that we have to rebuild the RHS constant here to ensure we
23659             // don't rely on particular values of undef lanes.
23660             return DAG.getNode(
23661                 X86ISD::SUBUS, DL, VT, OpLHS,
23662                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23663         }
23664     }
23665   }
23666
23667   // Try to match a min/max vector operation.
23668   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23669     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23670     unsigned Opc = ret.first;
23671     bool NeedSplit = ret.second;
23672
23673     if (Opc && NeedSplit) {
23674       unsigned NumElems = VT.getVectorNumElements();
23675       // Extract the LHS vectors
23676       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23677       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23678
23679       // Extract the RHS vectors
23680       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23681       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23682
23683       // Create min/max for each subvector
23684       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23685       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23686
23687       // Merge the result
23688       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23689     } else if (Opc)
23690       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23691   }
23692
23693   // Simplify vector selection if condition value type matches vselect
23694   // operand type
23695   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23696     assert(Cond.getValueType().isVector() &&
23697            "vector select expects a vector selector!");
23698
23699     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23700     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23701
23702     // Try invert the condition if true value is not all 1s and false value
23703     // is not all 0s.
23704     if (!TValIsAllOnes && !FValIsAllZeros &&
23705         // Check if the selector will be produced by CMPP*/PCMP*
23706         Cond.getOpcode() == ISD::SETCC &&
23707         // Check if SETCC has already been promoted
23708         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23709       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23710       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23711
23712       if (TValIsAllZeros || FValIsAllOnes) {
23713         SDValue CC = Cond.getOperand(2);
23714         ISD::CondCode NewCC =
23715           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23716                                Cond.getOperand(0).getValueType().isInteger());
23717         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23718         std::swap(LHS, RHS);
23719         TValIsAllOnes = FValIsAllOnes;
23720         FValIsAllZeros = TValIsAllZeros;
23721       }
23722     }
23723
23724     if (TValIsAllOnes || FValIsAllZeros) {
23725       SDValue Ret;
23726
23727       if (TValIsAllOnes && FValIsAllZeros)
23728         Ret = Cond;
23729       else if (TValIsAllOnes)
23730         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23731                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23732       else if (FValIsAllZeros)
23733         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23734                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23735
23736       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23737     }
23738   }
23739
23740   // If we know that this node is legal then we know that it is going to be
23741   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23742   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23743   // to simplify previous instructions.
23744   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23745       !DCI.isBeforeLegalize() &&
23746       // We explicitly check against v8i16 and v16i16 because, although
23747       // they're marked as Custom, they might only be legal when Cond is a
23748       // build_vector of constants. This will be taken care in a later
23749       // condition.
23750       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23751        VT != MVT::v8i16) &&
23752       // Don't optimize vector of constants. Those are handled by
23753       // the generic code and all the bits must be properly set for
23754       // the generic optimizer.
23755       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23756     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23757
23758     // Don't optimize vector selects that map to mask-registers.
23759     if (BitWidth == 1)
23760       return SDValue();
23761
23762     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23763     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23764
23765     APInt KnownZero, KnownOne;
23766     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23767                                           DCI.isBeforeLegalizeOps());
23768     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23769         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23770                                  TLO)) {
23771       // If we changed the computation somewhere in the DAG, this change
23772       // will affect all users of Cond.
23773       // Make sure it is fine and update all the nodes so that we do not
23774       // use the generic VSELECT anymore. Otherwise, we may perform
23775       // wrong optimizations as we messed up with the actual expectation
23776       // for the vector boolean values.
23777       if (Cond != TLO.Old) {
23778         // Check all uses of that condition operand to check whether it will be
23779         // consumed by non-BLEND instructions, which may depend on all bits are
23780         // set properly.
23781         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23782              I != E; ++I)
23783           if (I->getOpcode() != ISD::VSELECT)
23784             // TODO: Add other opcodes eventually lowered into BLEND.
23785             return SDValue();
23786
23787         // Update all the users of the condition, before committing the change,
23788         // so that the VSELECT optimizations that expect the correct vector
23789         // boolean value will not be triggered.
23790         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23791              I != E; ++I)
23792           DAG.ReplaceAllUsesOfValueWith(
23793               SDValue(*I, 0),
23794               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23795                           Cond, I->getOperand(1), I->getOperand(2)));
23796         DCI.CommitTargetLoweringOpt(TLO);
23797         return SDValue();
23798       }
23799       // At this point, only Cond is changed. Change the condition
23800       // just for N to keep the opportunity to optimize all other
23801       // users their own way.
23802       DAG.ReplaceAllUsesOfValueWith(
23803           SDValue(N, 0),
23804           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23805                       TLO.New, N->getOperand(1), N->getOperand(2)));
23806       return SDValue();
23807     }
23808   }
23809
23810   // We should generate an X86ISD::BLENDI from a vselect if its argument
23811   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23812   // constants. This specific pattern gets generated when we split a
23813   // selector for a 512 bit vector in a machine without AVX512 (but with
23814   // 256-bit vectors), during legalization:
23815   //
23816   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23817   //
23818   // Iff we find this pattern and the build_vectors are built from
23819   // constants, we translate the vselect into a shuffle_vector that we
23820   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23821   if ((N->getOpcode() == ISD::VSELECT ||
23822        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23823       !DCI.isBeforeLegalize()) {
23824     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23825     if (Shuffle.getNode())
23826       return Shuffle;
23827   }
23828
23829   return SDValue();
23830 }
23831
23832 // Check whether a boolean test is testing a boolean value generated by
23833 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23834 // code.
23835 //
23836 // Simplify the following patterns:
23837 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23838 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23839 // to (Op EFLAGS Cond)
23840 //
23841 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23842 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23843 // to (Op EFLAGS !Cond)
23844 //
23845 // where Op could be BRCOND or CMOV.
23846 //
23847 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23848   // Quit if not CMP and SUB with its value result used.
23849   if (Cmp.getOpcode() != X86ISD::CMP &&
23850       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23851       return SDValue();
23852
23853   // Quit if not used as a boolean value.
23854   if (CC != X86::COND_E && CC != X86::COND_NE)
23855     return SDValue();
23856
23857   // Check CMP operands. One of them should be 0 or 1 and the other should be
23858   // an SetCC or extended from it.
23859   SDValue Op1 = Cmp.getOperand(0);
23860   SDValue Op2 = Cmp.getOperand(1);
23861
23862   SDValue SetCC;
23863   const ConstantSDNode* C = nullptr;
23864   bool needOppositeCond = (CC == X86::COND_E);
23865   bool checkAgainstTrue = false; // Is it a comparison against 1?
23866
23867   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23868     SetCC = Op2;
23869   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23870     SetCC = Op1;
23871   else // Quit if all operands are not constants.
23872     return SDValue();
23873
23874   if (C->getZExtValue() == 1) {
23875     needOppositeCond = !needOppositeCond;
23876     checkAgainstTrue = true;
23877   } else if (C->getZExtValue() != 0)
23878     // Quit if the constant is neither 0 or 1.
23879     return SDValue();
23880
23881   bool truncatedToBoolWithAnd = false;
23882   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23883   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23884          SetCC.getOpcode() == ISD::TRUNCATE ||
23885          SetCC.getOpcode() == ISD::AND) {
23886     if (SetCC.getOpcode() == ISD::AND) {
23887       int OpIdx = -1;
23888       ConstantSDNode *CS;
23889       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23890           CS->getZExtValue() == 1)
23891         OpIdx = 1;
23892       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23893           CS->getZExtValue() == 1)
23894         OpIdx = 0;
23895       if (OpIdx == -1)
23896         break;
23897       SetCC = SetCC.getOperand(OpIdx);
23898       truncatedToBoolWithAnd = true;
23899     } else
23900       SetCC = SetCC.getOperand(0);
23901   }
23902
23903   switch (SetCC.getOpcode()) {
23904   case X86ISD::SETCC_CARRY:
23905     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23906     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23907     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23908     // truncated to i1 using 'and'.
23909     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23910       break;
23911     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23912            "Invalid use of SETCC_CARRY!");
23913     // FALL THROUGH
23914   case X86ISD::SETCC:
23915     // Set the condition code or opposite one if necessary.
23916     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23917     if (needOppositeCond)
23918       CC = X86::GetOppositeBranchCondition(CC);
23919     return SetCC.getOperand(1);
23920   case X86ISD::CMOV: {
23921     // Check whether false/true value has canonical one, i.e. 0 or 1.
23922     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23923     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23924     // Quit if true value is not a constant.
23925     if (!TVal)
23926       return SDValue();
23927     // Quit if false value is not a constant.
23928     if (!FVal) {
23929       SDValue Op = SetCC.getOperand(0);
23930       // Skip 'zext' or 'trunc' node.
23931       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23932           Op.getOpcode() == ISD::TRUNCATE)
23933         Op = Op.getOperand(0);
23934       // A special case for rdrand/rdseed, where 0 is set if false cond is
23935       // found.
23936       if ((Op.getOpcode() != X86ISD::RDRAND &&
23937            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23938         return SDValue();
23939     }
23940     // Quit if false value is not the constant 0 or 1.
23941     bool FValIsFalse = true;
23942     if (FVal && FVal->getZExtValue() != 0) {
23943       if (FVal->getZExtValue() != 1)
23944         return SDValue();
23945       // If FVal is 1, opposite cond is needed.
23946       needOppositeCond = !needOppositeCond;
23947       FValIsFalse = false;
23948     }
23949     // Quit if TVal is not the constant opposite of FVal.
23950     if (FValIsFalse && TVal->getZExtValue() != 1)
23951       return SDValue();
23952     if (!FValIsFalse && TVal->getZExtValue() != 0)
23953       return SDValue();
23954     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23955     if (needOppositeCond)
23956       CC = X86::GetOppositeBranchCondition(CC);
23957     return SetCC.getOperand(3);
23958   }
23959   }
23960
23961   return SDValue();
23962 }
23963
23964 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23965 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23966                                   TargetLowering::DAGCombinerInfo &DCI,
23967                                   const X86Subtarget *Subtarget) {
23968   SDLoc DL(N);
23969
23970   // If the flag operand isn't dead, don't touch this CMOV.
23971   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23972     return SDValue();
23973
23974   SDValue FalseOp = N->getOperand(0);
23975   SDValue TrueOp = N->getOperand(1);
23976   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23977   SDValue Cond = N->getOperand(3);
23978
23979   if (CC == X86::COND_E || CC == X86::COND_NE) {
23980     switch (Cond.getOpcode()) {
23981     default: break;
23982     case X86ISD::BSR:
23983     case X86ISD::BSF:
23984       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23985       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23986         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23987     }
23988   }
23989
23990   SDValue Flags;
23991
23992   Flags = checkBoolTestSetCCCombine(Cond, CC);
23993   if (Flags.getNode() &&
23994       // Extra check as FCMOV only supports a subset of X86 cond.
23995       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23996     SDValue Ops[] = { FalseOp, TrueOp,
23997                       DAG.getConstant(CC, MVT::i8), Flags };
23998     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23999   }
24000
24001   // If this is a select between two integer constants, try to do some
24002   // optimizations.  Note that the operands are ordered the opposite of SELECT
24003   // operands.
24004   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
24005     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
24006       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
24007       // larger than FalseC (the false value).
24008       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
24009         CC = X86::GetOppositeBranchCondition(CC);
24010         std::swap(TrueC, FalseC);
24011         std::swap(TrueOp, FalseOp);
24012       }
24013
24014       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
24015       // This is efficient for any integer data type (including i8/i16) and
24016       // shift amount.
24017       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24018         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24019                            DAG.getConstant(CC, MVT::i8), Cond);
24020
24021         // Zero extend the condition if needed.
24022         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24023
24024         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24025         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24026                            DAG.getConstant(ShAmt, MVT::i8));
24027         if (N->getNumValues() == 2)  // Dead flag value?
24028           return DCI.CombineTo(N, Cond, SDValue());
24029         return Cond;
24030       }
24031
24032       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
24033       // for any integer data type, including i8/i16.
24034       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24035         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24036                            DAG.getConstant(CC, MVT::i8), Cond);
24037
24038         // Zero extend the condition if needed.
24039         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24040                            FalseC->getValueType(0), Cond);
24041         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24042                            SDValue(FalseC, 0));
24043
24044         if (N->getNumValues() == 2)  // Dead flag value?
24045           return DCI.CombineTo(N, Cond, SDValue());
24046         return Cond;
24047       }
24048
24049       // Optimize cases that will turn into an LEA instruction.  This requires
24050       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24051       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24052         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24053         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24054
24055         bool isFastMultiplier = false;
24056         if (Diff < 10) {
24057           switch ((unsigned char)Diff) {
24058           default: break;
24059           case 1:  // result = add base, cond
24060           case 2:  // result = lea base(    , cond*2)
24061           case 3:  // result = lea base(cond, cond*2)
24062           case 4:  // result = lea base(    , cond*4)
24063           case 5:  // result = lea base(cond, cond*4)
24064           case 8:  // result = lea base(    , cond*8)
24065           case 9:  // result = lea base(cond, cond*8)
24066             isFastMultiplier = true;
24067             break;
24068           }
24069         }
24070
24071         if (isFastMultiplier) {
24072           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24073           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24074                              DAG.getConstant(CC, MVT::i8), Cond);
24075           // Zero extend the condition if needed.
24076           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24077                              Cond);
24078           // Scale the condition by the difference.
24079           if (Diff != 1)
24080             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24081                                DAG.getConstant(Diff, Cond.getValueType()));
24082
24083           // Add the base if non-zero.
24084           if (FalseC->getAPIntValue() != 0)
24085             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24086                                SDValue(FalseC, 0));
24087           if (N->getNumValues() == 2)  // Dead flag value?
24088             return DCI.CombineTo(N, Cond, SDValue());
24089           return Cond;
24090         }
24091       }
24092     }
24093   }
24094
24095   // Handle these cases:
24096   //   (select (x != c), e, c) -> select (x != c), e, x),
24097   //   (select (x == c), c, e) -> select (x == c), x, e)
24098   // where the c is an integer constant, and the "select" is the combination
24099   // of CMOV and CMP.
24100   //
24101   // The rationale for this change is that the conditional-move from a constant
24102   // needs two instructions, however, conditional-move from a register needs
24103   // only one instruction.
24104   //
24105   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24106   //  some instruction-combining opportunities. This opt needs to be
24107   //  postponed as late as possible.
24108   //
24109   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24110     // the DCI.xxxx conditions are provided to postpone the optimization as
24111     // late as possible.
24112
24113     ConstantSDNode *CmpAgainst = nullptr;
24114     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24115         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24116         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24117
24118       if (CC == X86::COND_NE &&
24119           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24120         CC = X86::GetOppositeBranchCondition(CC);
24121         std::swap(TrueOp, FalseOp);
24122       }
24123
24124       if (CC == X86::COND_E &&
24125           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24126         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24127                           DAG.getConstant(CC, MVT::i8), Cond };
24128         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24129       }
24130     }
24131   }
24132
24133   return SDValue();
24134 }
24135
24136 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24137                                                 const X86Subtarget *Subtarget) {
24138   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24139   switch (IntNo) {
24140   default: return SDValue();
24141   // SSE/AVX/AVX2 blend intrinsics.
24142   case Intrinsic::x86_avx2_pblendvb:
24143   case Intrinsic::x86_avx2_pblendw:
24144   case Intrinsic::x86_avx2_pblendd_128:
24145   case Intrinsic::x86_avx2_pblendd_256:
24146     // Don't try to simplify this intrinsic if we don't have AVX2.
24147     if (!Subtarget->hasAVX2())
24148       return SDValue();
24149     // FALL-THROUGH
24150   case Intrinsic::x86_avx_blend_pd_256:
24151   case Intrinsic::x86_avx_blend_ps_256:
24152   case Intrinsic::x86_avx_blendv_pd_256:
24153   case Intrinsic::x86_avx_blendv_ps_256:
24154     // Don't try to simplify this intrinsic if we don't have AVX.
24155     if (!Subtarget->hasAVX())
24156       return SDValue();
24157     // FALL-THROUGH
24158   case Intrinsic::x86_sse41_pblendw:
24159   case Intrinsic::x86_sse41_blendpd:
24160   case Intrinsic::x86_sse41_blendps:
24161   case Intrinsic::x86_sse41_blendvps:
24162   case Intrinsic::x86_sse41_blendvpd:
24163   case Intrinsic::x86_sse41_pblendvb: {
24164     SDValue Op0 = N->getOperand(1);
24165     SDValue Op1 = N->getOperand(2);
24166     SDValue Mask = N->getOperand(3);
24167
24168     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24169     if (!Subtarget->hasSSE41())
24170       return SDValue();
24171
24172     // fold (blend A, A, Mask) -> A
24173     if (Op0 == Op1)
24174       return Op0;
24175     // fold (blend A, B, allZeros) -> A
24176     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24177       return Op0;
24178     // fold (blend A, B, allOnes) -> B
24179     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24180       return Op1;
24181
24182     // Simplify the case where the mask is a constant i32 value.
24183     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24184       if (C->isNullValue())
24185         return Op0;
24186       if (C->isAllOnesValue())
24187         return Op1;
24188     }
24189
24190     return SDValue();
24191   }
24192
24193   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24194   case Intrinsic::x86_sse2_psrai_w:
24195   case Intrinsic::x86_sse2_psrai_d:
24196   case Intrinsic::x86_avx2_psrai_w:
24197   case Intrinsic::x86_avx2_psrai_d:
24198   case Intrinsic::x86_sse2_psra_w:
24199   case Intrinsic::x86_sse2_psra_d:
24200   case Intrinsic::x86_avx2_psra_w:
24201   case Intrinsic::x86_avx2_psra_d: {
24202     SDValue Op0 = N->getOperand(1);
24203     SDValue Op1 = N->getOperand(2);
24204     EVT VT = Op0.getValueType();
24205     assert(VT.isVector() && "Expected a vector type!");
24206
24207     if (isa<BuildVectorSDNode>(Op1))
24208       Op1 = Op1.getOperand(0);
24209
24210     if (!isa<ConstantSDNode>(Op1))
24211       return SDValue();
24212
24213     EVT SVT = VT.getVectorElementType();
24214     unsigned SVTBits = SVT.getSizeInBits();
24215
24216     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24217     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24218     uint64_t ShAmt = C.getZExtValue();
24219
24220     // Don't try to convert this shift into a ISD::SRA if the shift
24221     // count is bigger than or equal to the element size.
24222     if (ShAmt >= SVTBits)
24223       return SDValue();
24224
24225     // Trivial case: if the shift count is zero, then fold this
24226     // into the first operand.
24227     if (ShAmt == 0)
24228       return Op0;
24229
24230     // Replace this packed shift intrinsic with a target independent
24231     // shift dag node.
24232     SDValue Splat = DAG.getConstant(C, VT);
24233     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24234   }
24235   }
24236 }
24237
24238 /// PerformMulCombine - Optimize a single multiply with constant into two
24239 /// in order to implement it with two cheaper instructions, e.g.
24240 /// LEA + SHL, LEA + LEA.
24241 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24242                                  TargetLowering::DAGCombinerInfo &DCI) {
24243   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24244     return SDValue();
24245
24246   EVT VT = N->getValueType(0);
24247   if (VT != MVT::i64 && VT != MVT::i32)
24248     return SDValue();
24249
24250   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24251   if (!C)
24252     return SDValue();
24253   uint64_t MulAmt = C->getZExtValue();
24254   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24255     return SDValue();
24256
24257   uint64_t MulAmt1 = 0;
24258   uint64_t MulAmt2 = 0;
24259   if ((MulAmt % 9) == 0) {
24260     MulAmt1 = 9;
24261     MulAmt2 = MulAmt / 9;
24262   } else if ((MulAmt % 5) == 0) {
24263     MulAmt1 = 5;
24264     MulAmt2 = MulAmt / 5;
24265   } else if ((MulAmt % 3) == 0) {
24266     MulAmt1 = 3;
24267     MulAmt2 = MulAmt / 3;
24268   }
24269   if (MulAmt2 &&
24270       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24271     SDLoc DL(N);
24272
24273     if (isPowerOf2_64(MulAmt2) &&
24274         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24275       // If second multiplifer is pow2, issue it first. We want the multiply by
24276       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24277       // is an add.
24278       std::swap(MulAmt1, MulAmt2);
24279
24280     SDValue NewMul;
24281     if (isPowerOf2_64(MulAmt1))
24282       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24283                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24284     else
24285       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24286                            DAG.getConstant(MulAmt1, VT));
24287
24288     if (isPowerOf2_64(MulAmt2))
24289       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24290                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24291     else
24292       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24293                            DAG.getConstant(MulAmt2, VT));
24294
24295     // Do not add new nodes to DAG combiner worklist.
24296     DCI.CombineTo(N, NewMul, false);
24297   }
24298   return SDValue();
24299 }
24300
24301 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24302   SDValue N0 = N->getOperand(0);
24303   SDValue N1 = N->getOperand(1);
24304   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24305   EVT VT = N0.getValueType();
24306
24307   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24308   // since the result of setcc_c is all zero's or all ones.
24309   if (VT.isInteger() && !VT.isVector() &&
24310       N1C && N0.getOpcode() == ISD::AND &&
24311       N0.getOperand(1).getOpcode() == ISD::Constant) {
24312     SDValue N00 = N0.getOperand(0);
24313     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24314         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24315           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24316          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24317       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24318       APInt ShAmt = N1C->getAPIntValue();
24319       Mask = Mask.shl(ShAmt);
24320       if (Mask != 0)
24321         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24322                            N00, DAG.getConstant(Mask, VT));
24323     }
24324   }
24325
24326   // Hardware support for vector shifts is sparse which makes us scalarize the
24327   // vector operations in many cases. Also, on sandybridge ADD is faster than
24328   // shl.
24329   // (shl V, 1) -> add V,V
24330   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24331     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24332       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24333       // We shift all of the values by one. In many cases we do not have
24334       // hardware support for this operation. This is better expressed as an ADD
24335       // of two values.
24336       if (N1SplatC->getZExtValue() == 1)
24337         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24338     }
24339
24340   return SDValue();
24341 }
24342
24343 /// \brief Returns a vector of 0s if the node in input is a vector logical
24344 /// shift by a constant amount which is known to be bigger than or equal
24345 /// to the vector element size in bits.
24346 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24347                                       const X86Subtarget *Subtarget) {
24348   EVT VT = N->getValueType(0);
24349
24350   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24351       (!Subtarget->hasInt256() ||
24352        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24353     return SDValue();
24354
24355   SDValue Amt = N->getOperand(1);
24356   SDLoc DL(N);
24357   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24358     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24359       APInt ShiftAmt = AmtSplat->getAPIntValue();
24360       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24361
24362       // SSE2/AVX2 logical shifts always return a vector of 0s
24363       // if the shift amount is bigger than or equal to
24364       // the element size. The constant shift amount will be
24365       // encoded as a 8-bit immediate.
24366       if (ShiftAmt.trunc(8).uge(MaxAmount))
24367         return getZeroVector(VT, Subtarget, DAG, DL);
24368     }
24369
24370   return SDValue();
24371 }
24372
24373 /// PerformShiftCombine - Combine shifts.
24374 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24375                                    TargetLowering::DAGCombinerInfo &DCI,
24376                                    const X86Subtarget *Subtarget) {
24377   if (N->getOpcode() == ISD::SHL) {
24378     SDValue V = PerformSHLCombine(N, DAG);
24379     if (V.getNode()) return V;
24380   }
24381
24382   if (N->getOpcode() != ISD::SRA) {
24383     // Try to fold this logical shift into a zero vector.
24384     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24385     if (V.getNode()) return V;
24386   }
24387
24388   return SDValue();
24389 }
24390
24391 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24392 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24393 // and friends.  Likewise for OR -> CMPNEQSS.
24394 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24395                             TargetLowering::DAGCombinerInfo &DCI,
24396                             const X86Subtarget *Subtarget) {
24397   unsigned opcode;
24398
24399   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24400   // we're requiring SSE2 for both.
24401   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24402     SDValue N0 = N->getOperand(0);
24403     SDValue N1 = N->getOperand(1);
24404     SDValue CMP0 = N0->getOperand(1);
24405     SDValue CMP1 = N1->getOperand(1);
24406     SDLoc DL(N);
24407
24408     // The SETCCs should both refer to the same CMP.
24409     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24410       return SDValue();
24411
24412     SDValue CMP00 = CMP0->getOperand(0);
24413     SDValue CMP01 = CMP0->getOperand(1);
24414     EVT     VT    = CMP00.getValueType();
24415
24416     if (VT == MVT::f32 || VT == MVT::f64) {
24417       bool ExpectingFlags = false;
24418       // Check for any users that want flags:
24419       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24420            !ExpectingFlags && UI != UE; ++UI)
24421         switch (UI->getOpcode()) {
24422         default:
24423         case ISD::BR_CC:
24424         case ISD::BRCOND:
24425         case ISD::SELECT:
24426           ExpectingFlags = true;
24427           break;
24428         case ISD::CopyToReg:
24429         case ISD::SIGN_EXTEND:
24430         case ISD::ZERO_EXTEND:
24431         case ISD::ANY_EXTEND:
24432           break;
24433         }
24434
24435       if (!ExpectingFlags) {
24436         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24437         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24438
24439         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24440           X86::CondCode tmp = cc0;
24441           cc0 = cc1;
24442           cc1 = tmp;
24443         }
24444
24445         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24446             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24447           // FIXME: need symbolic constants for these magic numbers.
24448           // See X86ATTInstPrinter.cpp:printSSECC().
24449           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24450           if (Subtarget->hasAVX512()) {
24451             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24452                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24453             if (N->getValueType(0) != MVT::i1)
24454               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24455                                  FSetCC);
24456             return FSetCC;
24457           }
24458           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24459                                               CMP00.getValueType(), CMP00, CMP01,
24460                                               DAG.getConstant(x86cc, MVT::i8));
24461
24462           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24463           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24464
24465           if (is64BitFP && !Subtarget->is64Bit()) {
24466             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24467             // 64-bit integer, since that's not a legal type. Since
24468             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24469             // bits, but can do this little dance to extract the lowest 32 bits
24470             // and work with those going forward.
24471             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24472                                            OnesOrZeroesF);
24473             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24474                                            Vector64);
24475             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24476                                         Vector32, DAG.getIntPtrConstant(0));
24477             IntVT = MVT::i32;
24478           }
24479
24480           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24481           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24482                                       DAG.getConstant(1, IntVT));
24483           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24484           return OneBitOfTruth;
24485         }
24486       }
24487     }
24488   }
24489   return SDValue();
24490 }
24491
24492 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24493 /// so it can be folded inside ANDNP.
24494 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24495   EVT VT = N->getValueType(0);
24496
24497   // Match direct AllOnes for 128 and 256-bit vectors
24498   if (ISD::isBuildVectorAllOnes(N))
24499     return true;
24500
24501   // Look through a bit convert.
24502   if (N->getOpcode() == ISD::BITCAST)
24503     N = N->getOperand(0).getNode();
24504
24505   // Sometimes the operand may come from a insert_subvector building a 256-bit
24506   // allones vector
24507   if (VT.is256BitVector() &&
24508       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24509     SDValue V1 = N->getOperand(0);
24510     SDValue V2 = N->getOperand(1);
24511
24512     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24513         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24514         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24515         ISD::isBuildVectorAllOnes(V2.getNode()))
24516       return true;
24517   }
24518
24519   return false;
24520 }
24521
24522 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24523 // register. In most cases we actually compare or select YMM-sized registers
24524 // and mixing the two types creates horrible code. This method optimizes
24525 // some of the transition sequences.
24526 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24527                                  TargetLowering::DAGCombinerInfo &DCI,
24528                                  const X86Subtarget *Subtarget) {
24529   EVT VT = N->getValueType(0);
24530   if (!VT.is256BitVector())
24531     return SDValue();
24532
24533   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24534           N->getOpcode() == ISD::ZERO_EXTEND ||
24535           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24536
24537   SDValue Narrow = N->getOperand(0);
24538   EVT NarrowVT = Narrow->getValueType(0);
24539   if (!NarrowVT.is128BitVector())
24540     return SDValue();
24541
24542   if (Narrow->getOpcode() != ISD::XOR &&
24543       Narrow->getOpcode() != ISD::AND &&
24544       Narrow->getOpcode() != ISD::OR)
24545     return SDValue();
24546
24547   SDValue N0  = Narrow->getOperand(0);
24548   SDValue N1  = Narrow->getOperand(1);
24549   SDLoc DL(Narrow);
24550
24551   // The Left side has to be a trunc.
24552   if (N0.getOpcode() != ISD::TRUNCATE)
24553     return SDValue();
24554
24555   // The type of the truncated inputs.
24556   EVT WideVT = N0->getOperand(0)->getValueType(0);
24557   if (WideVT != VT)
24558     return SDValue();
24559
24560   // The right side has to be a 'trunc' or a constant vector.
24561   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24562   ConstantSDNode *RHSConstSplat = nullptr;
24563   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24564     RHSConstSplat = RHSBV->getConstantSplatNode();
24565   if (!RHSTrunc && !RHSConstSplat)
24566     return SDValue();
24567
24568   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24569
24570   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24571     return SDValue();
24572
24573   // Set N0 and N1 to hold the inputs to the new wide operation.
24574   N0 = N0->getOperand(0);
24575   if (RHSConstSplat) {
24576     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24577                      SDValue(RHSConstSplat, 0));
24578     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24579     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24580   } else if (RHSTrunc) {
24581     N1 = N1->getOperand(0);
24582   }
24583
24584   // Generate the wide operation.
24585   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24586   unsigned Opcode = N->getOpcode();
24587   switch (Opcode) {
24588   case ISD::ANY_EXTEND:
24589     return Op;
24590   case ISD::ZERO_EXTEND: {
24591     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24592     APInt Mask = APInt::getAllOnesValue(InBits);
24593     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24594     return DAG.getNode(ISD::AND, DL, VT,
24595                        Op, DAG.getConstant(Mask, VT));
24596   }
24597   case ISD::SIGN_EXTEND:
24598     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24599                        Op, DAG.getValueType(NarrowVT));
24600   default:
24601     llvm_unreachable("Unexpected opcode");
24602   }
24603 }
24604
24605 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24606                                  TargetLowering::DAGCombinerInfo &DCI,
24607                                  const X86Subtarget *Subtarget) {
24608   EVT VT = N->getValueType(0);
24609   if (DCI.isBeforeLegalizeOps())
24610     return SDValue();
24611
24612   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24613   if (R.getNode())
24614     return R;
24615
24616   // Create BEXTR instructions
24617   // BEXTR is ((X >> imm) & (2**size-1))
24618   if (VT == MVT::i32 || VT == MVT::i64) {
24619     SDValue N0 = N->getOperand(0);
24620     SDValue N1 = N->getOperand(1);
24621     SDLoc DL(N);
24622
24623     // Check for BEXTR.
24624     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24625         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24626       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24627       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24628       if (MaskNode && ShiftNode) {
24629         uint64_t Mask = MaskNode->getZExtValue();
24630         uint64_t Shift = ShiftNode->getZExtValue();
24631         if (isMask_64(Mask)) {
24632           uint64_t MaskSize = CountPopulation_64(Mask);
24633           if (Shift + MaskSize <= VT.getSizeInBits())
24634             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24635                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24636         }
24637       }
24638     } // BEXTR
24639
24640     return SDValue();
24641   }
24642
24643   // Want to form ANDNP nodes:
24644   // 1) In the hopes of then easily combining them with OR and AND nodes
24645   //    to form PBLEND/PSIGN.
24646   // 2) To match ANDN packed intrinsics
24647   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24648     return SDValue();
24649
24650   SDValue N0 = N->getOperand(0);
24651   SDValue N1 = N->getOperand(1);
24652   SDLoc DL(N);
24653
24654   // Check LHS for vnot
24655   if (N0.getOpcode() == ISD::XOR &&
24656       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24657       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24658     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24659
24660   // Check RHS for vnot
24661   if (N1.getOpcode() == ISD::XOR &&
24662       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24663       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24664     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24665
24666   return SDValue();
24667 }
24668
24669 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24670                                 TargetLowering::DAGCombinerInfo &DCI,
24671                                 const X86Subtarget *Subtarget) {
24672   if (DCI.isBeforeLegalizeOps())
24673     return SDValue();
24674
24675   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24676   if (R.getNode())
24677     return R;
24678
24679   SDValue N0 = N->getOperand(0);
24680   SDValue N1 = N->getOperand(1);
24681   EVT VT = N->getValueType(0);
24682
24683   // look for psign/blend
24684   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24685     if (!Subtarget->hasSSSE3() ||
24686         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24687       return SDValue();
24688
24689     // Canonicalize pandn to RHS
24690     if (N0.getOpcode() == X86ISD::ANDNP)
24691       std::swap(N0, N1);
24692     // or (and (m, y), (pandn m, x))
24693     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24694       SDValue Mask = N1.getOperand(0);
24695       SDValue X    = N1.getOperand(1);
24696       SDValue Y;
24697       if (N0.getOperand(0) == Mask)
24698         Y = N0.getOperand(1);
24699       if (N0.getOperand(1) == Mask)
24700         Y = N0.getOperand(0);
24701
24702       // Check to see if the mask appeared in both the AND and ANDNP and
24703       if (!Y.getNode())
24704         return SDValue();
24705
24706       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24707       // Look through mask bitcast.
24708       if (Mask.getOpcode() == ISD::BITCAST)
24709         Mask = Mask.getOperand(0);
24710       if (X.getOpcode() == ISD::BITCAST)
24711         X = X.getOperand(0);
24712       if (Y.getOpcode() == ISD::BITCAST)
24713         Y = Y.getOperand(0);
24714
24715       EVT MaskVT = Mask.getValueType();
24716
24717       // Validate that the Mask operand is a vector sra node.
24718       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24719       // there is no psrai.b
24720       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24721       unsigned SraAmt = ~0;
24722       if (Mask.getOpcode() == ISD::SRA) {
24723         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24724           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24725             SraAmt = AmtConst->getZExtValue();
24726       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24727         SDValue SraC = Mask.getOperand(1);
24728         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24729       }
24730       if ((SraAmt + 1) != EltBits)
24731         return SDValue();
24732
24733       SDLoc DL(N);
24734
24735       // Now we know we at least have a plendvb with the mask val.  See if
24736       // we can form a psignb/w/d.
24737       // psign = x.type == y.type == mask.type && y = sub(0, x);
24738       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24739           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24740           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24741         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24742                "Unsupported VT for PSIGN");
24743         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24744         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24745       }
24746       // PBLENDVB only available on SSE 4.1
24747       if (!Subtarget->hasSSE41())
24748         return SDValue();
24749
24750       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24751
24752       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24753       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24754       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24755       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24756       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24757     }
24758   }
24759
24760   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24761     return SDValue();
24762
24763   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24764   MachineFunction &MF = DAG.getMachineFunction();
24765   bool OptForSize = MF.getFunction()->getAttributes().
24766     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24767
24768   // SHLD/SHRD instructions have lower register pressure, but on some
24769   // platforms they have higher latency than the equivalent
24770   // series of shifts/or that would otherwise be generated.
24771   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24772   // have higher latencies and we are not optimizing for size.
24773   if (!OptForSize && Subtarget->isSHLDSlow())
24774     return SDValue();
24775
24776   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24777     std::swap(N0, N1);
24778   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24779     return SDValue();
24780   if (!N0.hasOneUse() || !N1.hasOneUse())
24781     return SDValue();
24782
24783   SDValue ShAmt0 = N0.getOperand(1);
24784   if (ShAmt0.getValueType() != MVT::i8)
24785     return SDValue();
24786   SDValue ShAmt1 = N1.getOperand(1);
24787   if (ShAmt1.getValueType() != MVT::i8)
24788     return SDValue();
24789   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24790     ShAmt0 = ShAmt0.getOperand(0);
24791   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24792     ShAmt1 = ShAmt1.getOperand(0);
24793
24794   SDLoc DL(N);
24795   unsigned Opc = X86ISD::SHLD;
24796   SDValue Op0 = N0.getOperand(0);
24797   SDValue Op1 = N1.getOperand(0);
24798   if (ShAmt0.getOpcode() == ISD::SUB) {
24799     Opc = X86ISD::SHRD;
24800     std::swap(Op0, Op1);
24801     std::swap(ShAmt0, ShAmt1);
24802   }
24803
24804   unsigned Bits = VT.getSizeInBits();
24805   if (ShAmt1.getOpcode() == ISD::SUB) {
24806     SDValue Sum = ShAmt1.getOperand(0);
24807     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24808       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24809       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24810         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24811       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24812         return DAG.getNode(Opc, DL, VT,
24813                            Op0, Op1,
24814                            DAG.getNode(ISD::TRUNCATE, DL,
24815                                        MVT::i8, ShAmt0));
24816     }
24817   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24818     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24819     if (ShAmt0C &&
24820         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24821       return DAG.getNode(Opc, DL, VT,
24822                          N0.getOperand(0), N1.getOperand(0),
24823                          DAG.getNode(ISD::TRUNCATE, DL,
24824                                        MVT::i8, ShAmt0));
24825   }
24826
24827   return SDValue();
24828 }
24829
24830 // Generate NEG and CMOV for integer abs.
24831 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24832   EVT VT = N->getValueType(0);
24833
24834   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24835   // 8-bit integer abs to NEG and CMOV.
24836   if (VT.isInteger() && VT.getSizeInBits() == 8)
24837     return SDValue();
24838
24839   SDValue N0 = N->getOperand(0);
24840   SDValue N1 = N->getOperand(1);
24841   SDLoc DL(N);
24842
24843   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24844   // and change it to SUB and CMOV.
24845   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24846       N0.getOpcode() == ISD::ADD &&
24847       N0.getOperand(1) == N1 &&
24848       N1.getOpcode() == ISD::SRA &&
24849       N1.getOperand(0) == N0.getOperand(0))
24850     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24851       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24852         // Generate SUB & CMOV.
24853         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24854                                   DAG.getConstant(0, VT), N0.getOperand(0));
24855
24856         SDValue Ops[] = { N0.getOperand(0), Neg,
24857                           DAG.getConstant(X86::COND_GE, MVT::i8),
24858                           SDValue(Neg.getNode(), 1) };
24859         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24860       }
24861   return SDValue();
24862 }
24863
24864 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24865 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24866                                  TargetLowering::DAGCombinerInfo &DCI,
24867                                  const X86Subtarget *Subtarget) {
24868   if (DCI.isBeforeLegalizeOps())
24869     return SDValue();
24870
24871   if (Subtarget->hasCMov()) {
24872     SDValue RV = performIntegerAbsCombine(N, DAG);
24873     if (RV.getNode())
24874       return RV;
24875   }
24876
24877   return SDValue();
24878 }
24879
24880 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24881 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24882                                   TargetLowering::DAGCombinerInfo &DCI,
24883                                   const X86Subtarget *Subtarget) {
24884   LoadSDNode *Ld = cast<LoadSDNode>(N);
24885   EVT RegVT = Ld->getValueType(0);
24886   EVT MemVT = Ld->getMemoryVT();
24887   SDLoc dl(Ld);
24888   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24889
24890   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24891   // into two 16-byte operations.
24892   ISD::LoadExtType Ext = Ld->getExtensionType();
24893   unsigned Alignment = Ld->getAlignment();
24894   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24895   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24896       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24897     unsigned NumElems = RegVT.getVectorNumElements();
24898     if (NumElems < 2)
24899       return SDValue();
24900
24901     SDValue Ptr = Ld->getBasePtr();
24902     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24903
24904     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24905                                   NumElems/2);
24906     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24907                                 Ld->getPointerInfo(), Ld->isVolatile(),
24908                                 Ld->isNonTemporal(), Ld->isInvariant(),
24909                                 Alignment);
24910     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24911     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24912                                 Ld->getPointerInfo(), Ld->isVolatile(),
24913                                 Ld->isNonTemporal(), Ld->isInvariant(),
24914                                 std::min(16U, Alignment));
24915     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24916                              Load1.getValue(1),
24917                              Load2.getValue(1));
24918
24919     SDValue NewVec = DAG.getUNDEF(RegVT);
24920     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24921     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24922     return DCI.CombineTo(N, NewVec, TF, true);
24923   }
24924
24925   return SDValue();
24926 }
24927
24928 /// PerformMLOADCombine - Resolve extending loads
24929 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24930                                    TargetLowering::DAGCombinerInfo &DCI,
24931                                    const X86Subtarget *Subtarget) {
24932   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24933   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24934     return SDValue();
24935
24936   EVT VT = Mld->getValueType(0);
24937   unsigned NumElems = VT.getVectorNumElements();
24938   EVT LdVT = Mld->getMemoryVT();
24939   SDLoc dl(Mld);
24940
24941   assert(LdVT != VT && "Cannot extend to the same type");
24942   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24943   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24944   // From, To sizes and ElemCount must be pow of two
24945   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24946     "Unexpected size for extending masked load");
24947
24948   unsigned SizeRatio  = ToSz / FromSz;
24949   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24950
24951   // Create a type on which we perform the shuffle
24952   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24953           LdVT.getScalarType(), NumElems*SizeRatio);
24954   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24955
24956   // Convert Src0 value
24957   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24958   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24959     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24960     for (unsigned i = 0; i != NumElems; ++i)
24961       ShuffleVec[i] = i * SizeRatio;
24962
24963     // Can't shuffle using an illegal type.
24964     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24965             && "WideVecVT should be legal");
24966     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24967                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24968   }
24969   // Prepare the new mask
24970   SDValue NewMask;
24971   SDValue Mask = Mld->getMask();
24972   if (Mask.getValueType() == VT) {
24973     // Mask and original value have the same type
24974     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24975     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24976     for (unsigned i = 0; i != NumElems; ++i)
24977       ShuffleVec[i] = i * SizeRatio;
24978     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24979       ShuffleVec[i] = NumElems*SizeRatio;
24980     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24981                                    DAG.getConstant(0, WideVecVT),
24982                                    &ShuffleVec[0]);
24983   }
24984   else {
24985     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24986     unsigned WidenNumElts = NumElems*SizeRatio;
24987     unsigned MaskNumElts = VT.getVectorNumElements();
24988     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24989                                      WidenNumElts);
24990
24991     unsigned NumConcat = WidenNumElts / MaskNumElts;
24992     SmallVector<SDValue, 16> Ops(NumConcat);
24993     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24994     Ops[0] = Mask;
24995     for (unsigned i = 1; i != NumConcat; ++i)
24996       Ops[i] = ZeroVal;
24997
24998     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24999   }
25000
25001   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
25002                                      Mld->getBasePtr(), NewMask, WideSrc0,
25003                                      Mld->getMemoryVT(), Mld->getMemOperand(),
25004                                      ISD::NON_EXTLOAD);
25005   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
25006   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
25007
25008 }
25009 /// PerformMSTORECombine - Resolve truncating stores
25010 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
25011                                     const X86Subtarget *Subtarget) {
25012   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
25013   if (!Mst->isTruncatingStore())
25014     return SDValue();
25015
25016   EVT VT = Mst->getValue().getValueType();
25017   unsigned NumElems = VT.getVectorNumElements();
25018   EVT StVT = Mst->getMemoryVT();
25019   SDLoc dl(Mst);
25020
25021   assert(StVT != VT && "Cannot truncate to the same type");
25022   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25023   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25024
25025   // From, To sizes and ElemCount must be pow of two
25026   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25027     "Unexpected size for truncating masked store");
25028   // We are going to use the original vector elt for storing.
25029   // Accumulated smaller vector elements must be a multiple of the store size.
25030   assert (((NumElems * FromSz) % ToSz) == 0 &&
25031           "Unexpected ratio for truncating masked store");
25032
25033   unsigned SizeRatio  = FromSz / ToSz;
25034   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25035
25036   // Create a type on which we perform the shuffle
25037   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25038           StVT.getScalarType(), NumElems*SizeRatio);
25039
25040   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25041
25042   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25043   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25044   for (unsigned i = 0; i != NumElems; ++i)
25045     ShuffleVec[i] = i * SizeRatio;
25046
25047   // Can't shuffle using an illegal type.
25048   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25049           && "WideVecVT should be legal");
25050
25051   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25052                                         DAG.getUNDEF(WideVecVT),
25053                                         &ShuffleVec[0]);
25054
25055   SDValue NewMask;
25056   SDValue Mask = Mst->getMask();
25057   if (Mask.getValueType() == VT) {
25058     // Mask and original value have the same type
25059     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25060     for (unsigned i = 0; i != NumElems; ++i)
25061       ShuffleVec[i] = i * SizeRatio;
25062     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25063       ShuffleVec[i] = NumElems*SizeRatio;
25064     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25065                                    DAG.getConstant(0, WideVecVT),
25066                                    &ShuffleVec[0]);
25067   }
25068   else {
25069     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25070     unsigned WidenNumElts = NumElems*SizeRatio;
25071     unsigned MaskNumElts = VT.getVectorNumElements();
25072     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25073                                      WidenNumElts);
25074
25075     unsigned NumConcat = WidenNumElts / MaskNumElts;
25076     SmallVector<SDValue, 16> Ops(NumConcat);
25077     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25078     Ops[0] = Mask;
25079     for (unsigned i = 1; i != NumConcat; ++i)
25080       Ops[i] = ZeroVal;
25081
25082     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25083   }
25084
25085   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25086                             NewMask, StVT, Mst->getMemOperand(), false);
25087 }
25088 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25089 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25090                                    const X86Subtarget *Subtarget) {
25091   StoreSDNode *St = cast<StoreSDNode>(N);
25092   EVT VT = St->getValue().getValueType();
25093   EVT StVT = St->getMemoryVT();
25094   SDLoc dl(St);
25095   SDValue StoredVal = St->getOperand(1);
25096   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25097
25098   // If we are saving a concatenation of two XMM registers and 32-byte stores
25099   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25100   unsigned Alignment = St->getAlignment();
25101   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25102   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25103       StVT == VT && !IsAligned) {
25104     unsigned NumElems = VT.getVectorNumElements();
25105     if (NumElems < 2)
25106       return SDValue();
25107
25108     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25109     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25110
25111     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25112     SDValue Ptr0 = St->getBasePtr();
25113     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25114
25115     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25116                                 St->getPointerInfo(), St->isVolatile(),
25117                                 St->isNonTemporal(), Alignment);
25118     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25119                                 St->getPointerInfo(), St->isVolatile(),
25120                                 St->isNonTemporal(),
25121                                 std::min(16U, Alignment));
25122     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25123   }
25124
25125   // Optimize trunc store (of multiple scalars) to shuffle and store.
25126   // First, pack all of the elements in one place. Next, store to memory
25127   // in fewer chunks.
25128   if (St->isTruncatingStore() && VT.isVector()) {
25129     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25130     unsigned NumElems = VT.getVectorNumElements();
25131     assert(StVT != VT && "Cannot truncate to the same type");
25132     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25133     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25134
25135     // From, To sizes and ElemCount must be pow of two
25136     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25137     // We are going to use the original vector elt for storing.
25138     // Accumulated smaller vector elements must be a multiple of the store size.
25139     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25140
25141     unsigned SizeRatio  = FromSz / ToSz;
25142
25143     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25144
25145     // Create a type on which we perform the shuffle
25146     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25147             StVT.getScalarType(), NumElems*SizeRatio);
25148
25149     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25150
25151     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25152     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25153     for (unsigned i = 0; i != NumElems; ++i)
25154       ShuffleVec[i] = i * SizeRatio;
25155
25156     // Can't shuffle using an illegal type.
25157     if (!TLI.isTypeLegal(WideVecVT))
25158       return SDValue();
25159
25160     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25161                                          DAG.getUNDEF(WideVecVT),
25162                                          &ShuffleVec[0]);
25163     // At this point all of the data is stored at the bottom of the
25164     // register. We now need to save it to mem.
25165
25166     // Find the largest store unit
25167     MVT StoreType = MVT::i8;
25168     for (MVT Tp : MVT::integer_valuetypes()) {
25169       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25170         StoreType = Tp;
25171     }
25172
25173     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25174     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25175         (64 <= NumElems * ToSz))
25176       StoreType = MVT::f64;
25177
25178     // Bitcast the original vector into a vector of store-size units
25179     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25180             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25181     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25182     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25183     SmallVector<SDValue, 8> Chains;
25184     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25185                                         TLI.getPointerTy());
25186     SDValue Ptr = St->getBasePtr();
25187
25188     // Perform one or more big stores into memory.
25189     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25190       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25191                                    StoreType, ShuffWide,
25192                                    DAG.getIntPtrConstant(i));
25193       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25194                                 St->getPointerInfo(), St->isVolatile(),
25195                                 St->isNonTemporal(), St->getAlignment());
25196       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25197       Chains.push_back(Ch);
25198     }
25199
25200     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25201   }
25202
25203   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25204   // the FP state in cases where an emms may be missing.
25205   // A preferable solution to the general problem is to figure out the right
25206   // places to insert EMMS.  This qualifies as a quick hack.
25207
25208   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25209   if (VT.getSizeInBits() != 64)
25210     return SDValue();
25211
25212   const Function *F = DAG.getMachineFunction().getFunction();
25213   bool NoImplicitFloatOps = F->getAttributes().
25214     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25215   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25216                      && Subtarget->hasSSE2();
25217   if ((VT.isVector() ||
25218        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25219       isa<LoadSDNode>(St->getValue()) &&
25220       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25221       St->getChain().hasOneUse() && !St->isVolatile()) {
25222     SDNode* LdVal = St->getValue().getNode();
25223     LoadSDNode *Ld = nullptr;
25224     int TokenFactorIndex = -1;
25225     SmallVector<SDValue, 8> Ops;
25226     SDNode* ChainVal = St->getChain().getNode();
25227     // Must be a store of a load.  We currently handle two cases:  the load
25228     // is a direct child, and it's under an intervening TokenFactor.  It is
25229     // possible to dig deeper under nested TokenFactors.
25230     if (ChainVal == LdVal)
25231       Ld = cast<LoadSDNode>(St->getChain());
25232     else if (St->getValue().hasOneUse() &&
25233              ChainVal->getOpcode() == ISD::TokenFactor) {
25234       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25235         if (ChainVal->getOperand(i).getNode() == LdVal) {
25236           TokenFactorIndex = i;
25237           Ld = cast<LoadSDNode>(St->getValue());
25238         } else
25239           Ops.push_back(ChainVal->getOperand(i));
25240       }
25241     }
25242
25243     if (!Ld || !ISD::isNormalLoad(Ld))
25244       return SDValue();
25245
25246     // If this is not the MMX case, i.e. we are just turning i64 load/store
25247     // into f64 load/store, avoid the transformation if there are multiple
25248     // uses of the loaded value.
25249     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25250       return SDValue();
25251
25252     SDLoc LdDL(Ld);
25253     SDLoc StDL(N);
25254     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25255     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25256     // pair instead.
25257     if (Subtarget->is64Bit() || F64IsLegal) {
25258       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25259       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25260                                   Ld->getPointerInfo(), Ld->isVolatile(),
25261                                   Ld->isNonTemporal(), Ld->isInvariant(),
25262                                   Ld->getAlignment());
25263       SDValue NewChain = NewLd.getValue(1);
25264       if (TokenFactorIndex != -1) {
25265         Ops.push_back(NewChain);
25266         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25267       }
25268       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25269                           St->getPointerInfo(),
25270                           St->isVolatile(), St->isNonTemporal(),
25271                           St->getAlignment());
25272     }
25273
25274     // Otherwise, lower to two pairs of 32-bit loads / stores.
25275     SDValue LoAddr = Ld->getBasePtr();
25276     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25277                                  DAG.getConstant(4, MVT::i32));
25278
25279     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25280                                Ld->getPointerInfo(),
25281                                Ld->isVolatile(), Ld->isNonTemporal(),
25282                                Ld->isInvariant(), Ld->getAlignment());
25283     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25284                                Ld->getPointerInfo().getWithOffset(4),
25285                                Ld->isVolatile(), Ld->isNonTemporal(),
25286                                Ld->isInvariant(),
25287                                MinAlign(Ld->getAlignment(), 4));
25288
25289     SDValue NewChain = LoLd.getValue(1);
25290     if (TokenFactorIndex != -1) {
25291       Ops.push_back(LoLd);
25292       Ops.push_back(HiLd);
25293       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25294     }
25295
25296     LoAddr = St->getBasePtr();
25297     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25298                          DAG.getConstant(4, MVT::i32));
25299
25300     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25301                                 St->getPointerInfo(),
25302                                 St->isVolatile(), St->isNonTemporal(),
25303                                 St->getAlignment());
25304     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25305                                 St->getPointerInfo().getWithOffset(4),
25306                                 St->isVolatile(),
25307                                 St->isNonTemporal(),
25308                                 MinAlign(St->getAlignment(), 4));
25309     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25310   }
25311   return SDValue();
25312 }
25313
25314 /// Return 'true' if this vector operation is "horizontal"
25315 /// and return the operands for the horizontal operation in LHS and RHS.  A
25316 /// horizontal operation performs the binary operation on successive elements
25317 /// of its first operand, then on successive elements of its second operand,
25318 /// returning the resulting values in a vector.  For example, if
25319 ///   A = < float a0, float a1, float a2, float a3 >
25320 /// and
25321 ///   B = < float b0, float b1, float b2, float b3 >
25322 /// then the result of doing a horizontal operation on A and B is
25323 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25324 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25325 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25326 /// set to A, RHS to B, and the routine returns 'true'.
25327 /// Note that the binary operation should have the property that if one of the
25328 /// operands is UNDEF then the result is UNDEF.
25329 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25330   // Look for the following pattern: if
25331   //   A = < float a0, float a1, float a2, float a3 >
25332   //   B = < float b0, float b1, float b2, float b3 >
25333   // and
25334   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25335   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25336   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25337   // which is A horizontal-op B.
25338
25339   // At least one of the operands should be a vector shuffle.
25340   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25341       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25342     return false;
25343
25344   MVT VT = LHS.getSimpleValueType();
25345
25346   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25347          "Unsupported vector type for horizontal add/sub");
25348
25349   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25350   // operate independently on 128-bit lanes.
25351   unsigned NumElts = VT.getVectorNumElements();
25352   unsigned NumLanes = VT.getSizeInBits()/128;
25353   unsigned NumLaneElts = NumElts / NumLanes;
25354   assert((NumLaneElts % 2 == 0) &&
25355          "Vector type should have an even number of elements in each lane");
25356   unsigned HalfLaneElts = NumLaneElts/2;
25357
25358   // View LHS in the form
25359   //   LHS = VECTOR_SHUFFLE A, B, LMask
25360   // If LHS is not a shuffle then pretend it is the shuffle
25361   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25362   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25363   // type VT.
25364   SDValue A, B;
25365   SmallVector<int, 16> LMask(NumElts);
25366   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25367     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25368       A = LHS.getOperand(0);
25369     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25370       B = LHS.getOperand(1);
25371     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25372     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25373   } else {
25374     if (LHS.getOpcode() != ISD::UNDEF)
25375       A = LHS;
25376     for (unsigned i = 0; i != NumElts; ++i)
25377       LMask[i] = i;
25378   }
25379
25380   // Likewise, view RHS in the form
25381   //   RHS = VECTOR_SHUFFLE C, D, RMask
25382   SDValue C, D;
25383   SmallVector<int, 16> RMask(NumElts);
25384   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25385     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25386       C = RHS.getOperand(0);
25387     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25388       D = RHS.getOperand(1);
25389     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25390     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25391   } else {
25392     if (RHS.getOpcode() != ISD::UNDEF)
25393       C = RHS;
25394     for (unsigned i = 0; i != NumElts; ++i)
25395       RMask[i] = i;
25396   }
25397
25398   // Check that the shuffles are both shuffling the same vectors.
25399   if (!(A == C && B == D) && !(A == D && B == C))
25400     return false;
25401
25402   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25403   if (!A.getNode() && !B.getNode())
25404     return false;
25405
25406   // If A and B occur in reverse order in RHS, then "swap" them (which means
25407   // rewriting the mask).
25408   if (A != C)
25409     CommuteVectorShuffleMask(RMask, NumElts);
25410
25411   // At this point LHS and RHS are equivalent to
25412   //   LHS = VECTOR_SHUFFLE A, B, LMask
25413   //   RHS = VECTOR_SHUFFLE A, B, RMask
25414   // Check that the masks correspond to performing a horizontal operation.
25415   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25416     for (unsigned i = 0; i != NumLaneElts; ++i) {
25417       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25418
25419       // Ignore any UNDEF components.
25420       if (LIdx < 0 || RIdx < 0 ||
25421           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25422           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25423         continue;
25424
25425       // Check that successive elements are being operated on.  If not, this is
25426       // not a horizontal operation.
25427       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25428       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25429       if (!(LIdx == Index && RIdx == Index + 1) &&
25430           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25431         return false;
25432     }
25433   }
25434
25435   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25436   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25437   return true;
25438 }
25439
25440 /// Do target-specific dag combines on floating point adds.
25441 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25442                                   const X86Subtarget *Subtarget) {
25443   EVT VT = N->getValueType(0);
25444   SDValue LHS = N->getOperand(0);
25445   SDValue RHS = N->getOperand(1);
25446
25447   // Try to synthesize horizontal adds from adds of shuffles.
25448   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25449        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25450       isHorizontalBinOp(LHS, RHS, true))
25451     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25452   return SDValue();
25453 }
25454
25455 /// Do target-specific dag combines on floating point subs.
25456 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25457                                   const X86Subtarget *Subtarget) {
25458   EVT VT = N->getValueType(0);
25459   SDValue LHS = N->getOperand(0);
25460   SDValue RHS = N->getOperand(1);
25461
25462   // Try to synthesize horizontal subs from subs of shuffles.
25463   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25464        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25465       isHorizontalBinOp(LHS, RHS, false))
25466     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25467   return SDValue();
25468 }
25469
25470 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25471 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25472   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25473
25474   // F[X]OR(0.0, x) -> x
25475   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25476     if (C->getValueAPF().isPosZero())
25477       return N->getOperand(1);
25478
25479   // F[X]OR(x, 0.0) -> x
25480   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25481     if (C->getValueAPF().isPosZero())
25482       return N->getOperand(0);
25483   return SDValue();
25484 }
25485
25486 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25487 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25488   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25489
25490   // Only perform optimizations if UnsafeMath is used.
25491   if (!DAG.getTarget().Options.UnsafeFPMath)
25492     return SDValue();
25493
25494   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25495   // into FMINC and FMAXC, which are Commutative operations.
25496   unsigned NewOp = 0;
25497   switch (N->getOpcode()) {
25498     default: llvm_unreachable("unknown opcode");
25499     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25500     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25501   }
25502
25503   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25504                      N->getOperand(0), N->getOperand(1));
25505 }
25506
25507 /// Do target-specific dag combines on X86ISD::FAND nodes.
25508 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25509   // FAND(0.0, x) -> 0.0
25510   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25511     if (C->getValueAPF().isPosZero())
25512       return N->getOperand(0);
25513
25514   // FAND(x, 0.0) -> 0.0
25515   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25516     if (C->getValueAPF().isPosZero())
25517       return N->getOperand(1);
25518
25519   return SDValue();
25520 }
25521
25522 /// Do target-specific dag combines on X86ISD::FANDN nodes
25523 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25524   // FANDN(0.0, x) -> x
25525   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25526     if (C->getValueAPF().isPosZero())
25527       return N->getOperand(1);
25528
25529   // FANDN(x, 0.0) -> 0.0
25530   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25531     if (C->getValueAPF().isPosZero())
25532       return N->getOperand(1);
25533
25534   return SDValue();
25535 }
25536
25537 static SDValue PerformBTCombine(SDNode *N,
25538                                 SelectionDAG &DAG,
25539                                 TargetLowering::DAGCombinerInfo &DCI) {
25540   // BT ignores high bits in the bit index operand.
25541   SDValue Op1 = N->getOperand(1);
25542   if (Op1.hasOneUse()) {
25543     unsigned BitWidth = Op1.getValueSizeInBits();
25544     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25545     APInt KnownZero, KnownOne;
25546     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25547                                           !DCI.isBeforeLegalizeOps());
25548     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25549     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25550         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25551       DCI.CommitTargetLoweringOpt(TLO);
25552   }
25553   return SDValue();
25554 }
25555
25556 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25557   SDValue Op = N->getOperand(0);
25558   if (Op.getOpcode() == ISD::BITCAST)
25559     Op = Op.getOperand(0);
25560   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25561   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25562       VT.getVectorElementType().getSizeInBits() ==
25563       OpVT.getVectorElementType().getSizeInBits()) {
25564     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25565   }
25566   return SDValue();
25567 }
25568
25569 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25570                                                const X86Subtarget *Subtarget) {
25571   EVT VT = N->getValueType(0);
25572   if (!VT.isVector())
25573     return SDValue();
25574
25575   SDValue N0 = N->getOperand(0);
25576   SDValue N1 = N->getOperand(1);
25577   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25578   SDLoc dl(N);
25579
25580   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25581   // both SSE and AVX2 since there is no sign-extended shift right
25582   // operation on a vector with 64-bit elements.
25583   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25584   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25585   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25586       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25587     SDValue N00 = N0.getOperand(0);
25588
25589     // EXTLOAD has a better solution on AVX2,
25590     // it may be replaced with X86ISD::VSEXT node.
25591     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25592       if (!ISD::isNormalLoad(N00.getNode()))
25593         return SDValue();
25594
25595     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25596         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25597                                   N00, N1);
25598       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25599     }
25600   }
25601   return SDValue();
25602 }
25603
25604 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25605                                   TargetLowering::DAGCombinerInfo &DCI,
25606                                   const X86Subtarget *Subtarget) {
25607   SDValue N0 = N->getOperand(0);
25608   EVT VT = N->getValueType(0);
25609
25610   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25611   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25612   // This exposes the sext to the sdivrem lowering, so that it directly extends
25613   // from AH (which we otherwise need to do contortions to access).
25614   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25615       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25616     SDLoc dl(N);
25617     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25618     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25619                             N0.getOperand(0), N0.getOperand(1));
25620     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25621     return R.getValue(1);
25622   }
25623
25624   if (!DCI.isBeforeLegalizeOps())
25625     return SDValue();
25626
25627   if (!Subtarget->hasFp256())
25628     return SDValue();
25629
25630   if (VT.isVector() && VT.getSizeInBits() == 256) {
25631     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25632     if (R.getNode())
25633       return R;
25634   }
25635
25636   return SDValue();
25637 }
25638
25639 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25640                                  const X86Subtarget* Subtarget) {
25641   SDLoc dl(N);
25642   EVT VT = N->getValueType(0);
25643
25644   // Let legalize expand this if it isn't a legal type yet.
25645   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25646     return SDValue();
25647
25648   EVT ScalarVT = VT.getScalarType();
25649   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25650       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25651     return SDValue();
25652
25653   SDValue A = N->getOperand(0);
25654   SDValue B = N->getOperand(1);
25655   SDValue C = N->getOperand(2);
25656
25657   bool NegA = (A.getOpcode() == ISD::FNEG);
25658   bool NegB = (B.getOpcode() == ISD::FNEG);
25659   bool NegC = (C.getOpcode() == ISD::FNEG);
25660
25661   // Negative multiplication when NegA xor NegB
25662   bool NegMul = (NegA != NegB);
25663   if (NegA)
25664     A = A.getOperand(0);
25665   if (NegB)
25666     B = B.getOperand(0);
25667   if (NegC)
25668     C = C.getOperand(0);
25669
25670   unsigned Opcode;
25671   if (!NegMul)
25672     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25673   else
25674     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25675
25676   return DAG.getNode(Opcode, dl, VT, A, B, C);
25677 }
25678
25679 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25680                                   TargetLowering::DAGCombinerInfo &DCI,
25681                                   const X86Subtarget *Subtarget) {
25682   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25683   //           (and (i32 x86isd::setcc_carry), 1)
25684   // This eliminates the zext. This transformation is necessary because
25685   // ISD::SETCC is always legalized to i8.
25686   SDLoc dl(N);
25687   SDValue N0 = N->getOperand(0);
25688   EVT VT = N->getValueType(0);
25689
25690   if (N0.getOpcode() == ISD::AND &&
25691       N0.hasOneUse() &&
25692       N0.getOperand(0).hasOneUse()) {
25693     SDValue N00 = N0.getOperand(0);
25694     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25695       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25696       if (!C || C->getZExtValue() != 1)
25697         return SDValue();
25698       return DAG.getNode(ISD::AND, dl, VT,
25699                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25700                                      N00.getOperand(0), N00.getOperand(1)),
25701                          DAG.getConstant(1, VT));
25702     }
25703   }
25704
25705   if (N0.getOpcode() == ISD::TRUNCATE &&
25706       N0.hasOneUse() &&
25707       N0.getOperand(0).hasOneUse()) {
25708     SDValue N00 = N0.getOperand(0);
25709     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25710       return DAG.getNode(ISD::AND, dl, VT,
25711                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25712                                      N00.getOperand(0), N00.getOperand(1)),
25713                          DAG.getConstant(1, VT));
25714     }
25715   }
25716   if (VT.is256BitVector()) {
25717     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25718     if (R.getNode())
25719       return R;
25720   }
25721
25722   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25723   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25724   // This exposes the zext to the udivrem lowering, so that it directly extends
25725   // from AH (which we otherwise need to do contortions to access).
25726   if (N0.getOpcode() == ISD::UDIVREM &&
25727       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25728       (VT == MVT::i32 || VT == MVT::i64)) {
25729     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25730     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25731                             N0.getOperand(0), N0.getOperand(1));
25732     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25733     return R.getValue(1);
25734   }
25735
25736   return SDValue();
25737 }
25738
25739 // Optimize x == -y --> x+y == 0
25740 //          x != -y --> x+y != 0
25741 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25742                                       const X86Subtarget* Subtarget) {
25743   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25744   SDValue LHS = N->getOperand(0);
25745   SDValue RHS = N->getOperand(1);
25746   EVT VT = N->getValueType(0);
25747   SDLoc DL(N);
25748
25749   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25750     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25751       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25752         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25753                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25754         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25755                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25756       }
25757   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25758     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25759       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25760         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25761                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25762         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25763                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25764       }
25765
25766   if (VT.getScalarType() == MVT::i1) {
25767     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25768       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25769     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25770     if (!IsSEXT0 && !IsVZero0)
25771       return SDValue();
25772     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25773       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25774     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25775
25776     if (!IsSEXT1 && !IsVZero1)
25777       return SDValue();
25778
25779     if (IsSEXT0 && IsVZero1) {
25780       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25781       if (CC == ISD::SETEQ)
25782         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25783       return LHS.getOperand(0);
25784     }
25785     if (IsSEXT1 && IsVZero0) {
25786       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25787       if (CC == ISD::SETEQ)
25788         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25789       return RHS.getOperand(0);
25790     }
25791   }
25792
25793   return SDValue();
25794 }
25795
25796 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25797                                       const X86Subtarget *Subtarget) {
25798   SDLoc dl(N);
25799   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25800   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25801          "X86insertps is only defined for v4x32");
25802
25803   SDValue Ld = N->getOperand(1);
25804   if (MayFoldLoad(Ld)) {
25805     // Extract the countS bits from the immediate so we can get the proper
25806     // address when narrowing the vector load to a specific element.
25807     // When the second source op is a memory address, interps doesn't use
25808     // countS and just gets an f32 from that address.
25809     unsigned DestIndex =
25810         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25811     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25812   } else
25813     return SDValue();
25814
25815   // Create this as a scalar to vector to match the instruction pattern.
25816   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25817   // countS bits are ignored when loading from memory on insertps, which
25818   // means we don't need to explicitly set them to 0.
25819   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25820                      LoadScalarToVector, N->getOperand(2));
25821 }
25822
25823 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25824 // as "sbb reg,reg", since it can be extended without zext and produces
25825 // an all-ones bit which is more useful than 0/1 in some cases.
25826 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25827                                MVT VT) {
25828   if (VT == MVT::i8)
25829     return DAG.getNode(ISD::AND, DL, VT,
25830                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25831                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25832                        DAG.getConstant(1, VT));
25833   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25834   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25835                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25836                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25837 }
25838
25839 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25840 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25841                                    TargetLowering::DAGCombinerInfo &DCI,
25842                                    const X86Subtarget *Subtarget) {
25843   SDLoc DL(N);
25844   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25845   SDValue EFLAGS = N->getOperand(1);
25846
25847   if (CC == X86::COND_A) {
25848     // Try to convert COND_A into COND_B in an attempt to facilitate
25849     // materializing "setb reg".
25850     //
25851     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25852     // cannot take an immediate as its first operand.
25853     //
25854     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25855         EFLAGS.getValueType().isInteger() &&
25856         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25857       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25858                                    EFLAGS.getNode()->getVTList(),
25859                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25860       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25861       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25862     }
25863   }
25864
25865   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25866   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25867   // cases.
25868   if (CC == X86::COND_B)
25869     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25870
25871   SDValue Flags;
25872
25873   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25874   if (Flags.getNode()) {
25875     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25876     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25877   }
25878
25879   return SDValue();
25880 }
25881
25882 // Optimize branch condition evaluation.
25883 //
25884 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25885                                     TargetLowering::DAGCombinerInfo &DCI,
25886                                     const X86Subtarget *Subtarget) {
25887   SDLoc DL(N);
25888   SDValue Chain = N->getOperand(0);
25889   SDValue Dest = N->getOperand(1);
25890   SDValue EFLAGS = N->getOperand(3);
25891   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25892
25893   SDValue Flags;
25894
25895   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25896   if (Flags.getNode()) {
25897     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25898     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25899                        Flags);
25900   }
25901
25902   return SDValue();
25903 }
25904
25905 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25906                                                          SelectionDAG &DAG) {
25907   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25908   // optimize away operation when it's from a constant.
25909   //
25910   // The general transformation is:
25911   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25912   //       AND(VECTOR_CMP(x,y), constant2)
25913   //    constant2 = UNARYOP(constant)
25914
25915   // Early exit if this isn't a vector operation, the operand of the
25916   // unary operation isn't a bitwise AND, or if the sizes of the operations
25917   // aren't the same.
25918   EVT VT = N->getValueType(0);
25919   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25920       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25921       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25922     return SDValue();
25923
25924   // Now check that the other operand of the AND is a constant. We could
25925   // make the transformation for non-constant splats as well, but it's unclear
25926   // that would be a benefit as it would not eliminate any operations, just
25927   // perform one more step in scalar code before moving to the vector unit.
25928   if (BuildVectorSDNode *BV =
25929           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25930     // Bail out if the vector isn't a constant.
25931     if (!BV->isConstant())
25932       return SDValue();
25933
25934     // Everything checks out. Build up the new and improved node.
25935     SDLoc DL(N);
25936     EVT IntVT = BV->getValueType(0);
25937     // Create a new constant of the appropriate type for the transformed
25938     // DAG.
25939     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25940     // The AND node needs bitcasts to/from an integer vector type around it.
25941     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25942     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25943                                  N->getOperand(0)->getOperand(0), MaskConst);
25944     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25945     return Res;
25946   }
25947
25948   return SDValue();
25949 }
25950
25951 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25952                                         const X86Subtarget *Subtarget) {
25953   // First try to optimize away the conversion entirely when it's
25954   // conditionally from a constant. Vectors only.
25955   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25956   if (Res != SDValue())
25957     return Res;
25958
25959   // Now move on to more general possibilities.
25960   SDValue Op0 = N->getOperand(0);
25961   EVT InVT = Op0->getValueType(0);
25962
25963   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25964   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25965     SDLoc dl(N);
25966     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25967     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25968     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25969   }
25970
25971   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25972   // a 32-bit target where SSE doesn't support i64->FP operations.
25973   if (Op0.getOpcode() == ISD::LOAD) {
25974     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25975     EVT VT = Ld->getValueType(0);
25976     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25977         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25978         !Subtarget->is64Bit() && VT == MVT::i64) {
25979       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25980           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25981       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25982       return FILDChain;
25983     }
25984   }
25985   return SDValue();
25986 }
25987
25988 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25989 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25990                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25991   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25992   // the result is either zero or one (depending on the input carry bit).
25993   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25994   if (X86::isZeroNode(N->getOperand(0)) &&
25995       X86::isZeroNode(N->getOperand(1)) &&
25996       // We don't have a good way to replace an EFLAGS use, so only do this when
25997       // dead right now.
25998       SDValue(N, 1).use_empty()) {
25999     SDLoc DL(N);
26000     EVT VT = N->getValueType(0);
26001     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
26002     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
26003                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26004                                            DAG.getConstant(X86::COND_B,MVT::i8),
26005                                            N->getOperand(2)),
26006                                DAG.getConstant(1, VT));
26007     return DCI.CombineTo(N, Res1, CarryOut);
26008   }
26009
26010   return SDValue();
26011 }
26012
26013 // fold (add Y, (sete  X, 0)) -> adc  0, Y
26014 //      (add Y, (setne X, 0)) -> sbb -1, Y
26015 //      (sub (sete  X, 0), Y) -> sbb  0, Y
26016 //      (sub (setne X, 0), Y) -> adc -1, Y
26017 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
26018   SDLoc DL(N);
26019
26020   // Look through ZExts.
26021   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
26022   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
26023     return SDValue();
26024
26025   SDValue SetCC = Ext.getOperand(0);
26026   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26027     return SDValue();
26028
26029   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26030   if (CC != X86::COND_E && CC != X86::COND_NE)
26031     return SDValue();
26032
26033   SDValue Cmp = SetCC.getOperand(1);
26034   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26035       !X86::isZeroNode(Cmp.getOperand(1)) ||
26036       !Cmp.getOperand(0).getValueType().isInteger())
26037     return SDValue();
26038
26039   SDValue CmpOp0 = Cmp.getOperand(0);
26040   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26041                                DAG.getConstant(1, CmpOp0.getValueType()));
26042
26043   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26044   if (CC == X86::COND_NE)
26045     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26046                        DL, OtherVal.getValueType(), OtherVal,
26047                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26048   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26049                      DL, OtherVal.getValueType(), OtherVal,
26050                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26051 }
26052
26053 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26054 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26055                                  const X86Subtarget *Subtarget) {
26056   EVT VT = N->getValueType(0);
26057   SDValue Op0 = N->getOperand(0);
26058   SDValue Op1 = N->getOperand(1);
26059
26060   // Try to synthesize horizontal adds from adds of shuffles.
26061   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26062        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26063       isHorizontalBinOp(Op0, Op1, true))
26064     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26065
26066   return OptimizeConditionalInDecrement(N, DAG);
26067 }
26068
26069 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26070                                  const X86Subtarget *Subtarget) {
26071   SDValue Op0 = N->getOperand(0);
26072   SDValue Op1 = N->getOperand(1);
26073
26074   // X86 can't encode an immediate LHS of a sub. See if we can push the
26075   // negation into a preceding instruction.
26076   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26077     // If the RHS of the sub is a XOR with one use and a constant, invert the
26078     // immediate. Then add one to the LHS of the sub so we can turn
26079     // X-Y -> X+~Y+1, saving one register.
26080     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26081         isa<ConstantSDNode>(Op1.getOperand(1))) {
26082       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26083       EVT VT = Op0.getValueType();
26084       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26085                                    Op1.getOperand(0),
26086                                    DAG.getConstant(~XorC, VT));
26087       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26088                          DAG.getConstant(C->getAPIntValue()+1, VT));
26089     }
26090   }
26091
26092   // Try to synthesize horizontal adds from adds of shuffles.
26093   EVT VT = N->getValueType(0);
26094   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26095        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26096       isHorizontalBinOp(Op0, Op1, true))
26097     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26098
26099   return OptimizeConditionalInDecrement(N, DAG);
26100 }
26101
26102 /// performVZEXTCombine - Performs build vector combines
26103 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26104                                    TargetLowering::DAGCombinerInfo &DCI,
26105                                    const X86Subtarget *Subtarget) {
26106   SDLoc DL(N);
26107   MVT VT = N->getSimpleValueType(0);
26108   SDValue Op = N->getOperand(0);
26109   MVT OpVT = Op.getSimpleValueType();
26110   MVT OpEltVT = OpVT.getVectorElementType();
26111   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26112
26113   // (vzext (bitcast (vzext (x)) -> (vzext x)
26114   SDValue V = Op;
26115   while (V.getOpcode() == ISD::BITCAST)
26116     V = V.getOperand(0);
26117
26118   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26119     MVT InnerVT = V.getSimpleValueType();
26120     MVT InnerEltVT = InnerVT.getVectorElementType();
26121
26122     // If the element sizes match exactly, we can just do one larger vzext. This
26123     // is always an exact type match as vzext operates on integer types.
26124     if (OpEltVT == InnerEltVT) {
26125       assert(OpVT == InnerVT && "Types must match for vzext!");
26126       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26127     }
26128
26129     // The only other way we can combine them is if only a single element of the
26130     // inner vzext is used in the input to the outer vzext.
26131     if (InnerEltVT.getSizeInBits() < InputBits)
26132       return SDValue();
26133
26134     // In this case, the inner vzext is completely dead because we're going to
26135     // only look at bits inside of the low element. Just do the outer vzext on
26136     // a bitcast of the input to the inner.
26137     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26138                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26139   }
26140
26141   // Check if we can bypass extracting and re-inserting an element of an input
26142   // vector. Essentialy:
26143   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26144   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26145       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26146       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26147     SDValue ExtractedV = V.getOperand(0);
26148     SDValue OrigV = ExtractedV.getOperand(0);
26149     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26150       if (ExtractIdx->getZExtValue() == 0) {
26151         MVT OrigVT = OrigV.getSimpleValueType();
26152         // Extract a subvector if necessary...
26153         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26154           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26155           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26156                                     OrigVT.getVectorNumElements() / Ratio);
26157           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26158                               DAG.getIntPtrConstant(0));
26159         }
26160         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26161         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26162       }
26163   }
26164
26165   return SDValue();
26166 }
26167
26168 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26169                                              DAGCombinerInfo &DCI) const {
26170   SelectionDAG &DAG = DCI.DAG;
26171   switch (N->getOpcode()) {
26172   default: break;
26173   case ISD::EXTRACT_VECTOR_ELT:
26174     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26175   case ISD::VSELECT:
26176   case ISD::SELECT:
26177   case X86ISD::SHRUNKBLEND:
26178     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26179   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
26180   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26181   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26182   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26183   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26184   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26185   case ISD::SHL:
26186   case ISD::SRA:
26187   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26188   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26189   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26190   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26191   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26192   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26193   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26194   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26195   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26196   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26197   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26198   case X86ISD::FXOR:
26199   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26200   case X86ISD::FMIN:
26201   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26202   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26203   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26204   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26205   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26206   case ISD::ANY_EXTEND:
26207   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26208   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26209   case ISD::SIGN_EXTEND_INREG:
26210     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26211   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26212   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26213   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26214   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26215   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26216   case X86ISD::SHUFP:       // Handle all target specific shuffles
26217   case X86ISD::PALIGNR:
26218   case X86ISD::UNPCKH:
26219   case X86ISD::UNPCKL:
26220   case X86ISD::MOVHLPS:
26221   case X86ISD::MOVLHPS:
26222   case X86ISD::PSHUFB:
26223   case X86ISD::PSHUFD:
26224   case X86ISD::PSHUFHW:
26225   case X86ISD::PSHUFLW:
26226   case X86ISD::MOVSS:
26227   case X86ISD::MOVSD:
26228   case X86ISD::VPERMILPI:
26229   case X86ISD::VPERM2X128:
26230   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26231   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26232   case ISD::INTRINSIC_WO_CHAIN:
26233     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26234   case X86ISD::INSERTPS: {
26235     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26236       return PerformINSERTPSCombine(N, DAG, Subtarget);
26237     break;
26238   }
26239   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26240   }
26241
26242   return SDValue();
26243 }
26244
26245 /// isTypeDesirableForOp - Return true if the target has native support for
26246 /// the specified value type and it is 'desirable' to use the type for the
26247 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26248 /// instruction encodings are longer and some i16 instructions are slow.
26249 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26250   if (!isTypeLegal(VT))
26251     return false;
26252   if (VT != MVT::i16)
26253     return true;
26254
26255   switch (Opc) {
26256   default:
26257     return true;
26258   case ISD::LOAD:
26259   case ISD::SIGN_EXTEND:
26260   case ISD::ZERO_EXTEND:
26261   case ISD::ANY_EXTEND:
26262   case ISD::SHL:
26263   case ISD::SRL:
26264   case ISD::SUB:
26265   case ISD::ADD:
26266   case ISD::MUL:
26267   case ISD::AND:
26268   case ISD::OR:
26269   case ISD::XOR:
26270     return false;
26271   }
26272 }
26273
26274 /// IsDesirableToPromoteOp - This method query the target whether it is
26275 /// beneficial for dag combiner to promote the specified node. If true, it
26276 /// should return the desired promotion type by reference.
26277 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26278   EVT VT = Op.getValueType();
26279   if (VT != MVT::i16)
26280     return false;
26281
26282   bool Promote = false;
26283   bool Commute = false;
26284   switch (Op.getOpcode()) {
26285   default: break;
26286   case ISD::LOAD: {
26287     LoadSDNode *LD = cast<LoadSDNode>(Op);
26288     // If the non-extending load has a single use and it's not live out, then it
26289     // might be folded.
26290     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26291                                                      Op.hasOneUse()*/) {
26292       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26293              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26294         // The only case where we'd want to promote LOAD (rather then it being
26295         // promoted as an operand is when it's only use is liveout.
26296         if (UI->getOpcode() != ISD::CopyToReg)
26297           return false;
26298       }
26299     }
26300     Promote = true;
26301     break;
26302   }
26303   case ISD::SIGN_EXTEND:
26304   case ISD::ZERO_EXTEND:
26305   case ISD::ANY_EXTEND:
26306     Promote = true;
26307     break;
26308   case ISD::SHL:
26309   case ISD::SRL: {
26310     SDValue N0 = Op.getOperand(0);
26311     // Look out for (store (shl (load), x)).
26312     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26313       return false;
26314     Promote = true;
26315     break;
26316   }
26317   case ISD::ADD:
26318   case ISD::MUL:
26319   case ISD::AND:
26320   case ISD::OR:
26321   case ISD::XOR:
26322     Commute = true;
26323     // fallthrough
26324   case ISD::SUB: {
26325     SDValue N0 = Op.getOperand(0);
26326     SDValue N1 = Op.getOperand(1);
26327     if (!Commute && MayFoldLoad(N1))
26328       return false;
26329     // Avoid disabling potential load folding opportunities.
26330     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26331       return false;
26332     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26333       return false;
26334     Promote = true;
26335   }
26336   }
26337
26338   PVT = MVT::i32;
26339   return Promote;
26340 }
26341
26342 //===----------------------------------------------------------------------===//
26343 //                           X86 Inline Assembly Support
26344 //===----------------------------------------------------------------------===//
26345
26346 namespace {
26347   // Helper to match a string separated by whitespace.
26348   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26349     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26350
26351     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26352       StringRef piece(*args[i]);
26353       if (!s.startswith(piece)) // Check if the piece matches.
26354         return false;
26355
26356       s = s.substr(piece.size());
26357       StringRef::size_type pos = s.find_first_not_of(" \t");
26358       if (pos == 0) // We matched a prefix.
26359         return false;
26360
26361       s = s.substr(pos);
26362     }
26363
26364     return s.empty();
26365   }
26366   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26367 }
26368
26369 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26370
26371   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26372     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26373         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26374         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26375
26376       if (AsmPieces.size() == 3)
26377         return true;
26378       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26379         return true;
26380     }
26381   }
26382   return false;
26383 }
26384
26385 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26386   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26387
26388   std::string AsmStr = IA->getAsmString();
26389
26390   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26391   if (!Ty || Ty->getBitWidth() % 16 != 0)
26392     return false;
26393
26394   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26395   SmallVector<StringRef, 4> AsmPieces;
26396   SplitString(AsmStr, AsmPieces, ";\n");
26397
26398   switch (AsmPieces.size()) {
26399   default: return false;
26400   case 1:
26401     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26402     // we will turn this bswap into something that will be lowered to logical
26403     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26404     // lower so don't worry about this.
26405     // bswap $0
26406     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26407         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26408         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26409         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26410         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26411         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26412       // No need to check constraints, nothing other than the equivalent of
26413       // "=r,0" would be valid here.
26414       return IntrinsicLowering::LowerToByteSwap(CI);
26415     }
26416
26417     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26418     if (CI->getType()->isIntegerTy(16) &&
26419         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26420         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26421          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26422       AsmPieces.clear();
26423       const std::string &ConstraintsStr = IA->getConstraintString();
26424       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26425       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26426       if (clobbersFlagRegisters(AsmPieces))
26427         return IntrinsicLowering::LowerToByteSwap(CI);
26428     }
26429     break;
26430   case 3:
26431     if (CI->getType()->isIntegerTy(32) &&
26432         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26433         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26434         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26435         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26436       AsmPieces.clear();
26437       const std::string &ConstraintsStr = IA->getConstraintString();
26438       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26439       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26440       if (clobbersFlagRegisters(AsmPieces))
26441         return IntrinsicLowering::LowerToByteSwap(CI);
26442     }
26443
26444     if (CI->getType()->isIntegerTy(64)) {
26445       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26446       if (Constraints.size() >= 2 &&
26447           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26448           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26449         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26450         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26451             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26452             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26453           return IntrinsicLowering::LowerToByteSwap(CI);
26454       }
26455     }
26456     break;
26457   }
26458   return false;
26459 }
26460
26461 /// getConstraintType - Given a constraint letter, return the type of
26462 /// constraint it is for this target.
26463 X86TargetLowering::ConstraintType
26464 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26465   if (Constraint.size() == 1) {
26466     switch (Constraint[0]) {
26467     case 'R':
26468     case 'q':
26469     case 'Q':
26470     case 'f':
26471     case 't':
26472     case 'u':
26473     case 'y':
26474     case 'x':
26475     case 'Y':
26476     case 'l':
26477       return C_RegisterClass;
26478     case 'a':
26479     case 'b':
26480     case 'c':
26481     case 'd':
26482     case 'S':
26483     case 'D':
26484     case 'A':
26485       return C_Register;
26486     case 'I':
26487     case 'J':
26488     case 'K':
26489     case 'L':
26490     case 'M':
26491     case 'N':
26492     case 'G':
26493     case 'C':
26494     case 'e':
26495     case 'Z':
26496       return C_Other;
26497     default:
26498       break;
26499     }
26500   }
26501   return TargetLowering::getConstraintType(Constraint);
26502 }
26503
26504 /// Examine constraint type and operand type and determine a weight value.
26505 /// This object must already have been set up with the operand type
26506 /// and the current alternative constraint selected.
26507 TargetLowering::ConstraintWeight
26508   X86TargetLowering::getSingleConstraintMatchWeight(
26509     AsmOperandInfo &info, const char *constraint) const {
26510   ConstraintWeight weight = CW_Invalid;
26511   Value *CallOperandVal = info.CallOperandVal;
26512     // If we don't have a value, we can't do a match,
26513     // but allow it at the lowest weight.
26514   if (!CallOperandVal)
26515     return CW_Default;
26516   Type *type = CallOperandVal->getType();
26517   // Look at the constraint type.
26518   switch (*constraint) {
26519   default:
26520     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26521   case 'R':
26522   case 'q':
26523   case 'Q':
26524   case 'a':
26525   case 'b':
26526   case 'c':
26527   case 'd':
26528   case 'S':
26529   case 'D':
26530   case 'A':
26531     if (CallOperandVal->getType()->isIntegerTy())
26532       weight = CW_SpecificReg;
26533     break;
26534   case 'f':
26535   case 't':
26536   case 'u':
26537     if (type->isFloatingPointTy())
26538       weight = CW_SpecificReg;
26539     break;
26540   case 'y':
26541     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26542       weight = CW_SpecificReg;
26543     break;
26544   case 'x':
26545   case 'Y':
26546     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26547         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26548       weight = CW_Register;
26549     break;
26550   case 'I':
26551     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26552       if (C->getZExtValue() <= 31)
26553         weight = CW_Constant;
26554     }
26555     break;
26556   case 'J':
26557     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26558       if (C->getZExtValue() <= 63)
26559         weight = CW_Constant;
26560     }
26561     break;
26562   case 'K':
26563     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26564       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26565         weight = CW_Constant;
26566     }
26567     break;
26568   case 'L':
26569     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26570       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26571         weight = CW_Constant;
26572     }
26573     break;
26574   case 'M':
26575     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26576       if (C->getZExtValue() <= 3)
26577         weight = CW_Constant;
26578     }
26579     break;
26580   case 'N':
26581     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26582       if (C->getZExtValue() <= 0xff)
26583         weight = CW_Constant;
26584     }
26585     break;
26586   case 'G':
26587   case 'C':
26588     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26589       weight = CW_Constant;
26590     }
26591     break;
26592   case 'e':
26593     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26594       if ((C->getSExtValue() >= -0x80000000LL) &&
26595           (C->getSExtValue() <= 0x7fffffffLL))
26596         weight = CW_Constant;
26597     }
26598     break;
26599   case 'Z':
26600     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26601       if (C->getZExtValue() <= 0xffffffff)
26602         weight = CW_Constant;
26603     }
26604     break;
26605   }
26606   return weight;
26607 }
26608
26609 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26610 /// with another that has more specific requirements based on the type of the
26611 /// corresponding operand.
26612 const char *X86TargetLowering::
26613 LowerXConstraint(EVT ConstraintVT) const {
26614   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26615   // 'f' like normal targets.
26616   if (ConstraintVT.isFloatingPoint()) {
26617     if (Subtarget->hasSSE2())
26618       return "Y";
26619     if (Subtarget->hasSSE1())
26620       return "x";
26621   }
26622
26623   return TargetLowering::LowerXConstraint(ConstraintVT);
26624 }
26625
26626 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26627 /// vector.  If it is invalid, don't add anything to Ops.
26628 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26629                                                      std::string &Constraint,
26630                                                      std::vector<SDValue>&Ops,
26631                                                      SelectionDAG &DAG) const {
26632   SDValue Result;
26633
26634   // Only support length 1 constraints for now.
26635   if (Constraint.length() > 1) return;
26636
26637   char ConstraintLetter = Constraint[0];
26638   switch (ConstraintLetter) {
26639   default: break;
26640   case 'I':
26641     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26642       if (C->getZExtValue() <= 31) {
26643         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26644         break;
26645       }
26646     }
26647     return;
26648   case 'J':
26649     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26650       if (C->getZExtValue() <= 63) {
26651         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26652         break;
26653       }
26654     }
26655     return;
26656   case 'K':
26657     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26658       if (isInt<8>(C->getSExtValue())) {
26659         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26660         break;
26661       }
26662     }
26663     return;
26664   case 'L':
26665     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26666       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26667           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26668         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26669         break;
26670       }
26671     }
26672     return;
26673   case 'M':
26674     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26675       if (C->getZExtValue() <= 3) {
26676         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26677         break;
26678       }
26679     }
26680     return;
26681   case 'N':
26682     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26683       if (C->getZExtValue() <= 255) {
26684         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26685         break;
26686       }
26687     }
26688     return;
26689   case 'O':
26690     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26691       if (C->getZExtValue() <= 127) {
26692         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26693         break;
26694       }
26695     }
26696     return;
26697   case 'e': {
26698     // 32-bit signed value
26699     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26700       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26701                                            C->getSExtValue())) {
26702         // Widen to 64 bits here to get it sign extended.
26703         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26704         break;
26705       }
26706     // FIXME gcc accepts some relocatable values here too, but only in certain
26707     // memory models; it's complicated.
26708     }
26709     return;
26710   }
26711   case 'Z': {
26712     // 32-bit unsigned value
26713     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26714       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26715                                            C->getZExtValue())) {
26716         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26717         break;
26718       }
26719     }
26720     // FIXME gcc accepts some relocatable values here too, but only in certain
26721     // memory models; it's complicated.
26722     return;
26723   }
26724   case 'i': {
26725     // Literal immediates are always ok.
26726     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26727       // Widen to 64 bits here to get it sign extended.
26728       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26729       break;
26730     }
26731
26732     // In any sort of PIC mode addresses need to be computed at runtime by
26733     // adding in a register or some sort of table lookup.  These can't
26734     // be used as immediates.
26735     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26736       return;
26737
26738     // If we are in non-pic codegen mode, we allow the address of a global (with
26739     // an optional displacement) to be used with 'i'.
26740     GlobalAddressSDNode *GA = nullptr;
26741     int64_t Offset = 0;
26742
26743     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26744     while (1) {
26745       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26746         Offset += GA->getOffset();
26747         break;
26748       } else if (Op.getOpcode() == ISD::ADD) {
26749         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26750           Offset += C->getZExtValue();
26751           Op = Op.getOperand(0);
26752           continue;
26753         }
26754       } else if (Op.getOpcode() == ISD::SUB) {
26755         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26756           Offset += -C->getZExtValue();
26757           Op = Op.getOperand(0);
26758           continue;
26759         }
26760       }
26761
26762       // Otherwise, this isn't something we can handle, reject it.
26763       return;
26764     }
26765
26766     const GlobalValue *GV = GA->getGlobal();
26767     // If we require an extra load to get this address, as in PIC mode, we
26768     // can't accept it.
26769     if (isGlobalStubReference(
26770             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26771       return;
26772
26773     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26774                                         GA->getValueType(0), Offset);
26775     break;
26776   }
26777   }
26778
26779   if (Result.getNode()) {
26780     Ops.push_back(Result);
26781     return;
26782   }
26783   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26784 }
26785
26786 std::pair<unsigned, const TargetRegisterClass*>
26787 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26788                                                 MVT VT) const {
26789   // First, see if this is a constraint that directly corresponds to an LLVM
26790   // register class.
26791   if (Constraint.size() == 1) {
26792     // GCC Constraint Letters
26793     switch (Constraint[0]) {
26794     default: break;
26795       // TODO: Slight differences here in allocation order and leaving
26796       // RIP in the class. Do they matter any more here than they do
26797       // in the normal allocation?
26798     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26799       if (Subtarget->is64Bit()) {
26800         if (VT == MVT::i32 || VT == MVT::f32)
26801           return std::make_pair(0U, &X86::GR32RegClass);
26802         if (VT == MVT::i16)
26803           return std::make_pair(0U, &X86::GR16RegClass);
26804         if (VT == MVT::i8 || VT == MVT::i1)
26805           return std::make_pair(0U, &X86::GR8RegClass);
26806         if (VT == MVT::i64 || VT == MVT::f64)
26807           return std::make_pair(0U, &X86::GR64RegClass);
26808         break;
26809       }
26810       // 32-bit fallthrough
26811     case 'Q':   // Q_REGS
26812       if (VT == MVT::i32 || VT == MVT::f32)
26813         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26814       if (VT == MVT::i16)
26815         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26816       if (VT == MVT::i8 || VT == MVT::i1)
26817         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26818       if (VT == MVT::i64)
26819         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26820       break;
26821     case 'r':   // GENERAL_REGS
26822     case 'l':   // INDEX_REGS
26823       if (VT == MVT::i8 || VT == MVT::i1)
26824         return std::make_pair(0U, &X86::GR8RegClass);
26825       if (VT == MVT::i16)
26826         return std::make_pair(0U, &X86::GR16RegClass);
26827       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26828         return std::make_pair(0U, &X86::GR32RegClass);
26829       return std::make_pair(0U, &X86::GR64RegClass);
26830     case 'R':   // LEGACY_REGS
26831       if (VT == MVT::i8 || VT == MVT::i1)
26832         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26833       if (VT == MVT::i16)
26834         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26835       if (VT == MVT::i32 || !Subtarget->is64Bit())
26836         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26837       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26838     case 'f':  // FP Stack registers.
26839       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26840       // value to the correct fpstack register class.
26841       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26842         return std::make_pair(0U, &X86::RFP32RegClass);
26843       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26844         return std::make_pair(0U, &X86::RFP64RegClass);
26845       return std::make_pair(0U, &X86::RFP80RegClass);
26846     case 'y':   // MMX_REGS if MMX allowed.
26847       if (!Subtarget->hasMMX()) break;
26848       return std::make_pair(0U, &X86::VR64RegClass);
26849     case 'Y':   // SSE_REGS if SSE2 allowed
26850       if (!Subtarget->hasSSE2()) break;
26851       // FALL THROUGH.
26852     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26853       if (!Subtarget->hasSSE1()) break;
26854
26855       switch (VT.SimpleTy) {
26856       default: break;
26857       // Scalar SSE types.
26858       case MVT::f32:
26859       case MVT::i32:
26860         return std::make_pair(0U, &X86::FR32RegClass);
26861       case MVT::f64:
26862       case MVT::i64:
26863         return std::make_pair(0U, &X86::FR64RegClass);
26864       // Vector types.
26865       case MVT::v16i8:
26866       case MVT::v8i16:
26867       case MVT::v4i32:
26868       case MVT::v2i64:
26869       case MVT::v4f32:
26870       case MVT::v2f64:
26871         return std::make_pair(0U, &X86::VR128RegClass);
26872       // AVX types.
26873       case MVT::v32i8:
26874       case MVT::v16i16:
26875       case MVT::v8i32:
26876       case MVT::v4i64:
26877       case MVT::v8f32:
26878       case MVT::v4f64:
26879         return std::make_pair(0U, &X86::VR256RegClass);
26880       case MVT::v8f64:
26881       case MVT::v16f32:
26882       case MVT::v16i32:
26883       case MVT::v8i64:
26884         return std::make_pair(0U, &X86::VR512RegClass);
26885       }
26886       break;
26887     }
26888   }
26889
26890   // Use the default implementation in TargetLowering to convert the register
26891   // constraint into a member of a register class.
26892   std::pair<unsigned, const TargetRegisterClass*> Res;
26893   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26894
26895   // Not found as a standard register?
26896   if (!Res.second) {
26897     // Map st(0) -> st(7) -> ST0
26898     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26899         tolower(Constraint[1]) == 's' &&
26900         tolower(Constraint[2]) == 't' &&
26901         Constraint[3] == '(' &&
26902         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26903         Constraint[5] == ')' &&
26904         Constraint[6] == '}') {
26905
26906       Res.first = X86::FP0+Constraint[4]-'0';
26907       Res.second = &X86::RFP80RegClass;
26908       return Res;
26909     }
26910
26911     // GCC allows "st(0)" to be called just plain "st".
26912     if (StringRef("{st}").equals_lower(Constraint)) {
26913       Res.first = X86::FP0;
26914       Res.second = &X86::RFP80RegClass;
26915       return Res;
26916     }
26917
26918     // flags -> EFLAGS
26919     if (StringRef("{flags}").equals_lower(Constraint)) {
26920       Res.first = X86::EFLAGS;
26921       Res.second = &X86::CCRRegClass;
26922       return Res;
26923     }
26924
26925     // 'A' means EAX + EDX.
26926     if (Constraint == "A") {
26927       Res.first = X86::EAX;
26928       Res.second = &X86::GR32_ADRegClass;
26929       return Res;
26930     }
26931     return Res;
26932   }
26933
26934   // Otherwise, check to see if this is a register class of the wrong value
26935   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26936   // turn into {ax},{dx}.
26937   if (Res.second->hasType(VT))
26938     return Res;   // Correct type already, nothing to do.
26939
26940   // All of the single-register GCC register classes map their values onto
26941   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26942   // really want an 8-bit or 32-bit register, map to the appropriate register
26943   // class and return the appropriate register.
26944   if (Res.second == &X86::GR16RegClass) {
26945     if (VT == MVT::i8 || VT == MVT::i1) {
26946       unsigned DestReg = 0;
26947       switch (Res.first) {
26948       default: break;
26949       case X86::AX: DestReg = X86::AL; break;
26950       case X86::DX: DestReg = X86::DL; break;
26951       case X86::CX: DestReg = X86::CL; break;
26952       case X86::BX: DestReg = X86::BL; break;
26953       }
26954       if (DestReg) {
26955         Res.first = DestReg;
26956         Res.second = &X86::GR8RegClass;
26957       }
26958     } else if (VT == MVT::i32 || VT == MVT::f32) {
26959       unsigned DestReg = 0;
26960       switch (Res.first) {
26961       default: break;
26962       case X86::AX: DestReg = X86::EAX; break;
26963       case X86::DX: DestReg = X86::EDX; break;
26964       case X86::CX: DestReg = X86::ECX; break;
26965       case X86::BX: DestReg = X86::EBX; break;
26966       case X86::SI: DestReg = X86::ESI; break;
26967       case X86::DI: DestReg = X86::EDI; break;
26968       case X86::BP: DestReg = X86::EBP; break;
26969       case X86::SP: DestReg = X86::ESP; break;
26970       }
26971       if (DestReg) {
26972         Res.first = DestReg;
26973         Res.second = &X86::GR32RegClass;
26974       }
26975     } else if (VT == MVT::i64 || VT == MVT::f64) {
26976       unsigned DestReg = 0;
26977       switch (Res.first) {
26978       default: break;
26979       case X86::AX: DestReg = X86::RAX; break;
26980       case X86::DX: DestReg = X86::RDX; break;
26981       case X86::CX: DestReg = X86::RCX; break;
26982       case X86::BX: DestReg = X86::RBX; break;
26983       case X86::SI: DestReg = X86::RSI; break;
26984       case X86::DI: DestReg = X86::RDI; break;
26985       case X86::BP: DestReg = X86::RBP; break;
26986       case X86::SP: DestReg = X86::RSP; break;
26987       }
26988       if (DestReg) {
26989         Res.first = DestReg;
26990         Res.second = &X86::GR64RegClass;
26991       }
26992     }
26993   } else if (Res.second == &X86::FR32RegClass ||
26994              Res.second == &X86::FR64RegClass ||
26995              Res.second == &X86::VR128RegClass ||
26996              Res.second == &X86::VR256RegClass ||
26997              Res.second == &X86::FR32XRegClass ||
26998              Res.second == &X86::FR64XRegClass ||
26999              Res.second == &X86::VR128XRegClass ||
27000              Res.second == &X86::VR256XRegClass ||
27001              Res.second == &X86::VR512RegClass) {
27002     // Handle references to XMM physical registers that got mapped into the
27003     // wrong class.  This can happen with constraints like {xmm0} where the
27004     // target independent register mapper will just pick the first match it can
27005     // find, ignoring the required type.
27006
27007     if (VT == MVT::f32 || VT == MVT::i32)
27008       Res.second = &X86::FR32RegClass;
27009     else if (VT == MVT::f64 || VT == MVT::i64)
27010       Res.second = &X86::FR64RegClass;
27011     else if (X86::VR128RegClass.hasType(VT))
27012       Res.second = &X86::VR128RegClass;
27013     else if (X86::VR256RegClass.hasType(VT))
27014       Res.second = &X86::VR256RegClass;
27015     else if (X86::VR512RegClass.hasType(VT))
27016       Res.second = &X86::VR512RegClass;
27017   }
27018
27019   return Res;
27020 }
27021
27022 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
27023                                             Type *Ty) const {
27024   // Scaling factors are not free at all.
27025   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
27026   // will take 2 allocations in the out of order engine instead of 1
27027   // for plain addressing mode, i.e. inst (reg1).
27028   // E.g.,
27029   // vaddps (%rsi,%drx), %ymm0, %ymm1
27030   // Requires two allocations (one for the load, one for the computation)
27031   // whereas:
27032   // vaddps (%rsi), %ymm0, %ymm1
27033   // Requires just 1 allocation, i.e., freeing allocations for other operations
27034   // and having less micro operations to execute.
27035   //
27036   // For some X86 architectures, this is even worse because for instance for
27037   // stores, the complex addressing mode forces the instruction to use the
27038   // "load" ports instead of the dedicated "store" port.
27039   // E.g., on Haswell:
27040   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27041   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27042   if (isLegalAddressingMode(AM, Ty))
27043     // Scale represents reg2 * scale, thus account for 1
27044     // as soon as we use a second register.
27045     return AM.Scale != 0;
27046   return -1;
27047 }
27048
27049 bool X86TargetLowering::isTargetFTOL() const {
27050   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27051 }