lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
     // Statistic counter NumTailCalls, described as "Number of tail calls".  The
     // code that increments it is not visible in this part of the file.
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
     // Hidden flag, off by default (cl::init(false)).  Per its description it
     // selects widening instead of promotion for vector type legalization; the
     // code that reads the flag is not visible in this part of the file.
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
     // Hidden flag, on by default (cl::init(true)).  Per its description it
     // enables the experimental vector shuffle lowering code path; the code
     // that reads the flag is not visible in this part of the file.
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
     // Hidden flag, off by default (cl::init(false)).  Its description says it
     // should only be used together with the experimental shuffle lowering
     // (the "x86-experimental-vector-shuffle-lowering" flag declared above).
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
     // Integer option, default 1 (cl::init(1)); unlike the flags above it is
     // declared cl::NotHidden.  Per its description it is the number of
     // Newton-Raphson iterations applied to the hardware reciprocal estimate;
     // the code that reads it is not visible in this part of the file.
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
     // getMOVL is declared here ahead of its definition, which is not visible in
     // this part of the file; confirm the roles of V1 and V2 at the definition.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. IdxVal is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
     /// Vec must be a 256-bit or 512-bit vector (asserted); the work is delegated
     /// to ExtractSubVector with a vectorWidth of 128.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
     /// IdxVal is an element index inside the 256 bits we want; ExtractSubVector
     /// rounds it down to the first element of that 256-bit chunk.  Vec must be
     /// a 512-bit vector (asserted).
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  IdxVal is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
     /// Vec must be a 128-bit vector (asserted); the width of Result is not
     /// checked here.  The work is delegated to InsertSubVector with a
     /// vectorWidth of 128.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
     /// Generate a DAG to put the 256-bit vector Vec into Result.  IdxVal is an
     /// element index inside the 256 bits of Result being written;
     /// InsertSubVector rounds it down to the start of that 256-bit chunk.
     /// Vec must be a 256-bit vector (asserted); the width of Result is not
     /// checked here.
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
     /// V1 is inserted at element 0 and V2 at element NumElems/2 of an UNDEF
     /// vector of type VT.  NumElems is presumably the element count of VT;
     /// confirm against the callers, which are not visible here.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
     /// Concat two 256-bit vectors into a vector of type VT by inserting them
     /// into an UNDEF value: V1 at element 0 and V2 at element NumElems/2.
     /// NumElems is presumably the element count of VT; confirm against the
     /// callers, which are not visible here.
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
 574   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and
1134     // source memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::VSELECT);
1679   setTargetDAGCombine(ISD::SELECT);
1680   setTargetDAGCombine(ISD::SHL);
1681   setTargetDAGCombine(ISD::SRA);
1682   setTargetDAGCombine(ISD::SRL);
1683   setTargetDAGCombine(ISD::OR);
1684   setTargetDAGCombine(ISD::AND);
1685   setTargetDAGCombine(ISD::ADD);
1686   setTargetDAGCombine(ISD::FADD);
1687   setTargetDAGCombine(ISD::FSUB);
1688   setTargetDAGCombine(ISD::FMA);
1689   setTargetDAGCombine(ISD::SUB);
1690   setTargetDAGCombine(ISD::LOAD);
1691   setTargetDAGCombine(ISD::MLOAD);
1692   setTargetDAGCombine(ISD::STORE);
1693   setTargetDAGCombine(ISD::MSTORE);
1694   setTargetDAGCombine(ISD::ZERO_EXTEND);
1695   setTargetDAGCombine(ISD::ANY_EXTEND);
1696   setTargetDAGCombine(ISD::SIGN_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1698   setTargetDAGCombine(ISD::TRUNCATE);
1699   setTargetDAGCombine(ISD::SINT_TO_FP);
1700   setTargetDAGCombine(ISD::SETCC);
1701   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1702   setTargetDAGCombine(ISD::BUILD_VECTOR);
1703   setTargetDAGCombine(ISD::MUL);
1704   setTargetDAGCombine(ISD::XOR);
1705
1706   computeRegisterProperties();
1707
1708   // On Darwin, -Os means optimize for size without hurting performance,
1709   // do not reduce the limit.
1710   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1711   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1712   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1713   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1714   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1715   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1716   setPrefLoopAlignment(4); // 2^4 bytes.
1717
1718   // Predictable cmov don't hurt on atom because it's in-order.
1719   PredictableSelectIsExpensive = !Subtarget->isAtom();
1720   EnableExtLdPromotion = true;
1721   setPrefFunctionAlignment(4); // 2^4 bytes.
1722
1723   verifyIntrinsicTables();
1724 }
1725
1726 // This has so far only been implemented for 64-bit MachO.
1727 bool X86TargetLowering::useLoadStackGuardNode() const {
1728   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1729 }
1730
1731 TargetLoweringBase::LegalizeTypeAction
1732 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1733   if (ExperimentalVectorWideningLegalization &&
1734       VT.getVectorNumElements() != 1 &&
1735       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1736     return TypeWidenVector;
1737
1738   return TargetLoweringBase::getPreferredVectorAction(VT);
1739 }
1740
1741 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1742   if (!VT.isVector())
1743     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1744
1745   const unsigned NumElts = VT.getVectorNumElements();
1746   const EVT EltVT = VT.getVectorElementType();
1747   if (VT.is512BitVector()) {
1748     if (Subtarget->hasAVX512())
1749       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1750           EltVT == MVT::f32 || EltVT == MVT::f64)
1751         switch(NumElts) {
1752         case  8: return MVT::v8i1;
1753         case 16: return MVT::v16i1;
1754       }
1755     if (Subtarget->hasBWI())
1756       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1757         switch(NumElts) {
1758         case 32: return MVT::v32i1;
1759         case 64: return MVT::v64i1;
1760       }
1761   }
1762
1763   if (VT.is256BitVector() || VT.is128BitVector()) {
1764     if (Subtarget->hasVLX())
1765       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1766           EltVT == MVT::f32 || EltVT == MVT::f64)
1767         switch(NumElts) {
1768         case 2: return MVT::v2i1;
1769         case 4: return MVT::v4i1;
1770         case 8: return MVT::v8i1;
1771       }
1772     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1773       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1774         switch(NumElts) {
1775         case  8: return MVT::v8i1;
1776         case 16: return MVT::v16i1;
1777         case 32: return MVT::v32i1;
1778       }
1779   }
1780
1781   return VT.changeVectorElementTypeToInteger();
1782 }
1783
1784 /// Helper for getByValTypeAlignment to determine
1785 /// the desired ByVal argument alignment.
1786 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1787   if (MaxAlign == 16)
1788     return;
1789   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1790     if (VTy->getBitWidth() == 128)
1791       MaxAlign = 16;
1792   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1793     unsigned EltAlign = 0;
1794     getMaxByValAlign(ATy->getElementType(), EltAlign);
1795     if (EltAlign > MaxAlign)
1796       MaxAlign = EltAlign;
1797   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1798     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1799       unsigned EltAlign = 0;
1800       getMaxByValAlign(STy->getElementType(i), EltAlign);
1801       if (EltAlign > MaxAlign)
1802         MaxAlign = EltAlign;
1803       if (MaxAlign == 16)
1804         break;
1805     }
1806   }
1807 }
1808
1809 /// Return the desired alignment for ByVal aggregate
1810 /// function arguments in the caller parameter area. For X86, aggregates
1811 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1812 /// are at 4-byte boundaries.
1813 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1814   if (Subtarget->is64Bit()) {
1815     // Max of 8 and alignment of type.
1816     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1817     if (TyAlign > 8)
1818       return TyAlign;
1819     return 8;
1820   }
1821
1822   unsigned Align = 4;
1823   if (Subtarget->hasSSE1())
1824     getMaxByValAlign(Ty, Align);
1825   return Align;
1826 }
1827
1828 /// Returns the target specific optimal type for load
1829 /// and store operations as a result of memset, memcpy, and memmove
1830 /// lowering. If DstAlign is zero that means it's safe to destination
1831 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1832 /// means there isn't a need to check it against alignment requirement,
1833 /// probably because the source does not need to be loaded. If 'IsMemset' is
1834 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1835 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1836 /// source is constant so it does not need to be loaded.
1837 /// It returns EVT::Other if the type should be determined using generic
1838 /// target-independent logic.
1839 EVT
1840 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1841                                        unsigned DstAlign, unsigned SrcAlign,
1842                                        bool IsMemset, bool ZeroMemset,
1843                                        bool MemcpyStrSrc,
1844                                        MachineFunction &MF) const {
1845   const Function *F = MF.getFunction();
1846   if ((!IsMemset || ZeroMemset) &&
1847       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1848                                        Attribute::NoImplicitFloat)) {
1849     if (Size >= 16 &&
1850         (Subtarget->isUnalignedMemAccessFast() ||
1851          ((DstAlign == 0 || DstAlign >= 16) &&
1852           (SrcAlign == 0 || SrcAlign >= 16)))) {
1853       if (Size >= 32) {
1854         if (Subtarget->hasInt256())
1855           return MVT::v8i32;
1856         if (Subtarget->hasFp256())
1857           return MVT::v8f32;
1858       }
1859       if (Subtarget->hasSSE2())
1860         return MVT::v4i32;
1861       if (Subtarget->hasSSE1())
1862         return MVT::v4f32;
1863     } else if (!MemcpyStrSrc && Size >= 8 &&
1864                !Subtarget->is64Bit() &&
1865                Subtarget->hasSSE2()) {
1866       // Do not use f64 to lower memcpy if source is string constant. It's
1867       // better to use i32 to avoid the loads.
1868       return MVT::f64;
1869     }
1870   }
1871   if (Subtarget->is64Bit() && Size >= 8)
1872     return MVT::i64;
1873   return MVT::i32;
1874 }
1875
1876 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1877   if (VT == MVT::f32)
1878     return X86ScalarSSEf32;
1879   else if (VT == MVT::f64)
1880     return X86ScalarSSEf64;
1881   return true;
1882 }
1883
1884 bool
1885 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1886                                                   unsigned,
1887                                                   unsigned,
1888                                                   bool *Fast) const {
1889   if (Fast)
1890     *Fast = Subtarget->isUnalignedMemAccessFast();
1891   return true;
1892 }
1893
1894 /// Return the entry encoding for a jump table in the
1895 /// current function.  The returned value is a member of the
1896 /// MachineJumpTableInfo::JTEntryKind enum.
1897 unsigned X86TargetLowering::getJumpTableEncoding() const {
1898   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1899   // symbol.
1900   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1901       Subtarget->isPICStyleGOT())
1902     return MachineJumpTableInfo::EK_Custom32;
1903
1904   // Otherwise, use the normal jump table encoding heuristics.
1905   return TargetLowering::getJumpTableEncoding();
1906 }
1907
1908 const MCExpr *
1909 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1910                                              const MachineBasicBlock *MBB,
1911                                              unsigned uid,MCContext &Ctx) const{
1912   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1913          Subtarget->isPICStyleGOT());
1914   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1915   // entries.
1916   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1917                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1918 }
1919
1920 /// Returns relocation base for the given PIC jumptable.
1921 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1922                                                     SelectionDAG &DAG) const {
1923   if (!Subtarget->is64Bit())
1924     // This doesn't have SDLoc associated with it, but is not really the
1925     // same as a Register.
1926     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1927   return Table;
1928 }
1929
1930 /// This returns the relocation base for the given PIC jumptable,
1931 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1932 const MCExpr *X86TargetLowering::
1933 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1934                              MCContext &Ctx) const {
1935   // X86-64 uses RIP relative addressing based on the jump table label.
1936   if (Subtarget->isPICStyleRIPRel())
1937     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1938
1939   // Otherwise, the reference is relative to the PIC base.
1940   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1941 }
1942
1943 // FIXME: Why this routine is here? Move to RegInfo!
1944 std::pair<const TargetRegisterClass*, uint8_t>
1945 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1946   const TargetRegisterClass *RRC = nullptr;
1947   uint8_t Cost = 1;
1948   switch (VT.SimpleTy) {
1949   default:
1950     return TargetLowering::findRepresentativeClass(VT);
1951   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1952     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1953     break;
1954   case MVT::x86mmx:
1955     RRC = &X86::VR64RegClass;
1956     break;
1957   case MVT::f32: case MVT::f64:
1958   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1959   case MVT::v4f32: case MVT::v2f64:
1960   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1961   case MVT::v4f64:
1962     RRC = &X86::VR128RegClass;
1963     break;
1964   }
1965   return std::make_pair(RRC, Cost);
1966 }
1967
1968 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1969                                                unsigned &Offset) const {
1970   if (!Subtarget->isTargetLinux())
1971     return false;
1972
1973   if (Subtarget->is64Bit()) {
1974     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1975     Offset = 0x28;
1976     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1977       AddressSpace = 256;
1978     else
1979       AddressSpace = 257;
1980   } else {
1981     // %gs:0x14 on i386
1982     Offset = 0x14;
1983     AddressSpace = 256;
1984   }
1985   return true;
1986 }
1987
1988 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1989                                             unsigned DestAS) const {
1990   assert(SrcAS != DestAS && "Expected different address spaces!");
1991
1992   return SrcAS < 256 && DestAS < 256;
1993 }
1994
1995 //===----------------------------------------------------------------------===//
1996 //               Return Value Calling Convention Implementation
1997 //===----------------------------------------------------------------------===//
1998
1999 #include "X86GenCallingConv.inc"
2000
2001 bool
2002 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2003                                   MachineFunction &MF, bool isVarArg,
2004                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2005                         LLVMContext &Context) const {
2006   SmallVector<CCValAssign, 16> RVLocs;
2007   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2008   return CCInfo.CheckReturn(Outs, RetCC_X86);
2009 }
2010
2011 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2012   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2013   return ScratchRegs;
2014 }
2015
2016 SDValue
2017 X86TargetLowering::LowerReturn(SDValue Chain,
2018                                CallingConv::ID CallConv, bool isVarArg,
2019                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2020                                const SmallVectorImpl<SDValue> &OutVals,
2021                                SDLoc dl, SelectionDAG &DAG) const {
2022   MachineFunction &MF = DAG.getMachineFunction();
2023   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2024
2025   SmallVector<CCValAssign, 16> RVLocs;
2026   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2027   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2028
2029   SDValue Flag;
2030   SmallVector<SDValue, 6> RetOps;
2031   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2032   // Operand #1 = Bytes To Pop
2033   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2034                    MVT::i16));
2035
2036   // Copy the result values into the output registers.
2037   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2038     CCValAssign &VA = RVLocs[i];
2039     assert(VA.isRegLoc() && "Can only return in registers!");
2040     SDValue ValToCopy = OutVals[i];
2041     EVT ValVT = ValToCopy.getValueType();
2042
2043     // Promote values to the appropriate types.
2044     if (VA.getLocInfo() == CCValAssign::SExt)
2045       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2046     else if (VA.getLocInfo() == CCValAssign::ZExt)
2047       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2048     else if (VA.getLocInfo() == CCValAssign::AExt)
2049       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2050     else if (VA.getLocInfo() == CCValAssign::BCvt)
2051       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2052
2053     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2054            "Unexpected FP-extend for return value.");
2055
2056     // If this is x86-64, and we disabled SSE, we can't return FP values,
2057     // or SSE or MMX vectors.
2058     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2059          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2060           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2061       report_fatal_error("SSE register return with SSE disabled");
2062     }
2063     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2064     // llvm-gcc has never done it right and no one has noticed, so this
2065     // should be OK for now.
2066     if (ValVT == MVT::f64 &&
2067         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2068       report_fatal_error("SSE2 register return with SSE2 disabled");
2069
2070     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2071     // the RET instruction and handled by the FP Stackifier.
2072     if (VA.getLocReg() == X86::FP0 ||
2073         VA.getLocReg() == X86::FP1) {
2074       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2075       // change the value to the FP stack register class.
2076       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2077         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2078       RetOps.push_back(ValToCopy);
2079       // Don't emit a copytoreg.
2080       continue;
2081     }
2082
2083     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2084     // which is returned in RAX / RDX.
2085     if (Subtarget->is64Bit()) {
2086       if (ValVT == MVT::x86mmx) {
2087         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2088           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2089           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2090                                   ValToCopy);
2091           // If we don't have SSE2 available, convert to v4f32 so the generated
2092           // register is legal.
2093           if (!Subtarget->hasSSE2())
2094             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2095         }
2096       }
2097     }
2098
2099     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2100     Flag = Chain.getValue(1);
2101     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2102   }
2103
2104   // The x86-64 ABIs require that for returning structs by value we copy
2105   // the sret argument into %rax/%eax (depending on ABI) for the return.
2106   // Win32 requires us to put the sret argument to %eax as well.
2107   // We saved the argument into a virtual register in the entry block,
2108   // so now we copy the value out and into %rax/%eax.
2109   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2110       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2111     MachineFunction &MF = DAG.getMachineFunction();
2112     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2113     unsigned Reg = FuncInfo->getSRetReturnReg();
2114     assert(Reg &&
2115            "SRetReturnReg should have been set in LowerFormalArguments().");
2116     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2117
2118     unsigned RetValReg
2119         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2120           X86::RAX : X86::EAX;
2121     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2122     Flag = Chain.getValue(1);
2123
2124     // RAX/EAX now acts like a return value.
2125     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2126   }
2127
2128   RetOps[0] = Chain;  // Update chain.
2129
2130   // Add the flag if we have it.
2131   if (Flag.getNode())
2132     RetOps.push_back(Flag);
2133
2134   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2135 }
2136
2137 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2138   if (N->getNumValues() != 1)
2139     return false;
2140   if (!N->hasNUsesOfValue(1, 0))
2141     return false;
2142
2143   SDValue TCChain = Chain;
2144   SDNode *Copy = *N->use_begin();
2145   if (Copy->getOpcode() == ISD::CopyToReg) {
2146     // If the copy has a glue operand, we conservatively assume it isn't safe to
2147     // perform a tail call.
2148     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2149       return false;
2150     TCChain = Copy->getOperand(0);
2151   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2152     return false;
2153
2154   bool HasRet = false;
2155   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2156        UI != UE; ++UI) {
2157     if (UI->getOpcode() != X86ISD::RET_FLAG)
2158       return false;
2159     // If we are returning more than one value, we can definitely
2160     // not make a tail call see PR19530
2161     if (UI->getNumOperands() > 4)
2162       return false;
2163     if (UI->getNumOperands() == 4 &&
2164         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2165       return false;
2166     HasRet = true;
2167   }
2168
2169   if (!HasRet)
2170     return false;
2171
2172   Chain = TCChain;
2173   return true;
2174 }
2175
2176 EVT
2177 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2178                                             ISD::NodeType ExtendKind) const {
2179   MVT ReturnMVT;
2180   // TODO: Is this also valid on 32-bit?
2181   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2182     ReturnMVT = MVT::i8;
2183   else
2184     ReturnMVT = MVT::i32;
2185
2186   EVT MinVT = getRegisterType(Context, ReturnMVT);
2187   return VT.bitsLT(MinVT) ? MinVT : VT;
2188 }
2189
2190 /// Lower the result values of a call into the
2191 /// appropriate copies out of appropriate physical registers.
2192 ///
2193 SDValue
2194 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2195                                    CallingConv::ID CallConv, bool isVarArg,
2196                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2197                                    SDLoc dl, SelectionDAG &DAG,
2198                                    SmallVectorImpl<SDValue> &InVals) const {
2199
2200   // Assign locations to each value returned by this call.
2201   SmallVector<CCValAssign, 16> RVLocs;
2202   bool Is64Bit = Subtarget->is64Bit();
2203   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2204                  *DAG.getContext());
2205   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2206
2207   // Copy all of the result registers out of their specified physreg.
2208   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2209     CCValAssign &VA = RVLocs[i];
2210     EVT CopyVT = VA.getValVT();
2211
2212     // If this is x86-64, and we disabled SSE, we can't return FP values
2213     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2214         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2215       report_fatal_error("SSE register return with SSE disabled");
2216     }
2217
2218     // If we prefer to use the value in xmm registers, copy it out as f80 and
2219     // use a truncate to move it from fp stack reg to xmm reg.
2220     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2221         isScalarFPTypeInSSEReg(VA.getValVT()))
2222       CopyVT = MVT::f80;
2223
2224     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2225                                CopyVT, InFlag).getValue(1);
2226     SDValue Val = Chain.getValue(0);
2227
2228     if (CopyVT != VA.getValVT())
2229       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2230                         // This truncation won't change the value.
2231                         DAG.getIntPtrConstant(1));
2232
2233     InFlag = Chain.getValue(2);
2234     InVals.push_back(Val);
2235   }
2236
2237   return Chain;
2238 }
2239
2240 //===----------------------------------------------------------------------===//
2241 //                C & StdCall & Fast Calling Convention implementation
2242 //===----------------------------------------------------------------------===//
2243 //  StdCall calling convention seems to be standard for many Windows' API
2244 //  routines and around. It differs from C calling convention just a little:
2245 //  callee should clean up the stack, not caller. Symbols should be also
2246 //  decorated in some fancy way :) It doesn't support any vector arguments.
2247 //  For info on fast calling convention see Fast Calling Convention (tail call)
2248 //  implementation LowerX86_32FastCCCallTo.
2249
2250 /// CallIsStructReturn - Determines whether a call uses struct return
2251 /// semantics.
2252 enum StructReturnType {
2253   NotStructReturn,
2254   RegStructReturn,
2255   StackStructReturn
2256 };
2257 static StructReturnType
2258 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2259   if (Outs.empty())
2260     return NotStructReturn;
2261
2262   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2263   if (!Flags.isSRet())
2264     return NotStructReturn;
2265   if (Flags.isInReg())
2266     return RegStructReturn;
2267   return StackStructReturn;
2268 }
2269
2270 /// Determines whether a function uses struct return semantics.
2271 static StructReturnType
2272 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2273   if (Ins.empty())
2274     return NotStructReturn;
2275
2276   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2277   if (!Flags.isSRet())
2278     return NotStructReturn;
2279   if (Flags.isInReg())
2280     return RegStructReturn;
2281   return StackStructReturn;
2282 }
2283
2284 /// Make a copy of an aggregate at address specified by "Src" to address
2285 /// "Dst" with size and alignment information specified by the specific
2286 /// parameter attribute. The copy will be passed as a byval function parameter.
2287 static SDValue
2288 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2289                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2290                           SDLoc dl) {
2291   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2292
2293   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2294                        /*isVolatile*/false, /*AlwaysInline=*/true,
2295                        MachinePointerInfo(), MachinePointerInfo());
2296 }
2297
2298 /// Return true if the calling convention is one that
2299 /// supports tail call optimization.
2300 static bool IsTailCallConvention(CallingConv::ID CC) {
2301   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2302           CC == CallingConv::HiPE);
2303 }
2304
2305 /// \brief Return true if the calling convention is a C calling convention.
2306 static bool IsCCallConvention(CallingConv::ID CC) {
2307   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2308           CC == CallingConv::X86_64_SysV);
2309 }
2310
2311 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2312   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2313     return false;
2314
2315   CallSite CS(CI);
2316   CallingConv::ID CalleeCC = CS.getCallingConv();
2317   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2318     return false;
2319
2320   return true;
2321 }
2322
2323 /// Return true if the function is being made into
2324 /// a tailcall target by changing its ABI.
2325 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2326                                    bool GuaranteedTailCallOpt) {
2327   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2328 }
2329
2330 SDValue
2331 X86TargetLowering::LowerMemArgument(SDValue Chain,
2332                                     CallingConv::ID CallConv,
2333                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2334                                     SDLoc dl, SelectionDAG &DAG,
2335                                     const CCValAssign &VA,
2336                                     MachineFrameInfo *MFI,
2337                                     unsigned i) const {
2338   // Create the nodes corresponding to a load from this parameter slot.
2339   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2340   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2341       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2342   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2343   EVT ValVT;
2344
2345   // If value is passed by pointer we have address passed instead of the value
2346   // itself.
2347   if (VA.getLocInfo() == CCValAssign::Indirect)
2348     ValVT = VA.getLocVT();
2349   else
2350     ValVT = VA.getValVT();
2351
2352   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2353   // changed with more analysis.
2354   // In case of tail call optimization mark all arguments mutable. Since they
2355   // could be overwritten by lowering of arguments in case of a tail call.
2356   if (Flags.isByVal()) {
2357     unsigned Bytes = Flags.getByValSize();
2358     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2359     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2360     return DAG.getFrameIndex(FI, getPointerTy());
2361   } else {
2362     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2363                                     VA.getLocMemOffset(), isImmutable);
2364     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2365     return DAG.getLoad(ValVT, dl, Chain, FIN,
2366                        MachinePointerInfo::getFixedStack(FI),
2367                        false, false, false, 0);
2368   }
2369 }
2370
2371 // FIXME: Get this from tablegen.
2372 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2373                                                 const X86Subtarget *Subtarget) {
2374   assert(Subtarget->is64Bit());
2375
2376   if (Subtarget->isCallingConvWin64(CallConv)) {
2377     static const MCPhysReg GPR64ArgRegsWin64[] = {
2378       X86::RCX, X86::RDX, X86::R8,  X86::R9
2379     };
2380     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2381   }
2382
2383   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2384     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2385   };
2386   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2387 }
2388
2389 // FIXME: Get this from tablegen.
2390 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2391                                                 CallingConv::ID CallConv,
2392                                                 const X86Subtarget *Subtarget) {
2393   assert(Subtarget->is64Bit());
2394   if (Subtarget->isCallingConvWin64(CallConv)) {
2395     // The XMM registers which might contain var arg parameters are shadowed
2396     // in their paired GPR.  So we only need to save the GPR to their home
2397     // slots.
2398     // TODO: __vectorcall will change this.
2399     return None;
2400   }
2401
2402   const Function *Fn = MF.getFunction();
2403   bool NoImplicitFloatOps = Fn->getAttributes().
2404       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2405   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2406          "SSE register cannot be used when SSE is disabled!");
2407   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2408       !Subtarget->hasSSE1())
2409     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2410     // registers.
2411     return None;
2412
2413   static const MCPhysReg XMMArgRegs64Bit[] = {
2414     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2415     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2416   };
2417   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2418 }
2419
2420 SDValue
2421 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2422                                         CallingConv::ID CallConv,
2423                                         bool isVarArg,
2424                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2425                                         SDLoc dl,
2426                                         SelectionDAG &DAG,
2427                                         SmallVectorImpl<SDValue> &InVals)
2428                                           const {
2429   MachineFunction &MF = DAG.getMachineFunction();
2430   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2431
2432   const Function* Fn = MF.getFunction();
2433   if (Fn->hasExternalLinkage() &&
2434       Subtarget->isTargetCygMing() &&
2435       Fn->getName() == "main")
2436     FuncInfo->setForceFramePointer(true);
2437
2438   MachineFrameInfo *MFI = MF.getFrameInfo();
2439   bool Is64Bit = Subtarget->is64Bit();
2440   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2441
2442   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2443          "Var args not supported with calling convention fastcc, ghc or hipe");
2444
2445   // Assign locations to all of the incoming arguments.
2446   SmallVector<CCValAssign, 16> ArgLocs;
2447   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2448
2449   // Allocate shadow area for Win64
2450   if (IsWin64)
2451     CCInfo.AllocateStack(32, 8);
2452
2453   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2454
2455   unsigned LastVal = ~0U;
2456   SDValue ArgValue;
2457   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2458     CCValAssign &VA = ArgLocs[i];
2459     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2460     // places.
2461     assert(VA.getValNo() != LastVal &&
2462            "Don't support value assigned to multiple locs yet");
2463     (void)LastVal;
2464     LastVal = VA.getValNo();
2465
2466     if (VA.isRegLoc()) {
2467       EVT RegVT = VA.getLocVT();
2468       const TargetRegisterClass *RC;
2469       if (RegVT == MVT::i32)
2470         RC = &X86::GR32RegClass;
2471       else if (Is64Bit && RegVT == MVT::i64)
2472         RC = &X86::GR64RegClass;
2473       else if (RegVT == MVT::f32)
2474         RC = &X86::FR32RegClass;
2475       else if (RegVT == MVT::f64)
2476         RC = &X86::FR64RegClass;
2477       else if (RegVT.is512BitVector())
2478         RC = &X86::VR512RegClass;
2479       else if (RegVT.is256BitVector())
2480         RC = &X86::VR256RegClass;
2481       else if (RegVT.is128BitVector())
2482         RC = &X86::VR128RegClass;
2483       else if (RegVT == MVT::x86mmx)
2484         RC = &X86::VR64RegClass;
2485       else if (RegVT == MVT::i1)
2486         RC = &X86::VK1RegClass;
2487       else if (RegVT == MVT::v8i1)
2488         RC = &X86::VK8RegClass;
2489       else if (RegVT == MVT::v16i1)
2490         RC = &X86::VK16RegClass;
2491       else if (RegVT == MVT::v32i1)
2492         RC = &X86::VK32RegClass;
2493       else if (RegVT == MVT::v64i1)
2494         RC = &X86::VK64RegClass;
2495       else
2496         llvm_unreachable("Unknown argument type!");
2497
2498       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2499       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2500
2501       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2502       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2503       // right size.
2504       if (VA.getLocInfo() == CCValAssign::SExt)
2505         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2506                                DAG.getValueType(VA.getValVT()));
2507       else if (VA.getLocInfo() == CCValAssign::ZExt)
2508         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2509                                DAG.getValueType(VA.getValVT()));
2510       else if (VA.getLocInfo() == CCValAssign::BCvt)
2511         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2512
2513       if (VA.isExtInLoc()) {
2514         // Handle MMX values passed in XMM regs.
2515         if (RegVT.isVector())
2516           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2517         else
2518           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2519       }
2520     } else {
2521       assert(VA.isMemLoc());
2522       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2523     }
2524
2525     // If value is passed via pointer - do a load.
2526     if (VA.getLocInfo() == CCValAssign::Indirect)
2527       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2528                              MachinePointerInfo(), false, false, false, 0);
2529
2530     InVals.push_back(ArgValue);
2531   }
2532
2533   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2534     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2535       // The x86-64 ABIs require that for returning structs by value we copy
2536       // the sret argument into %rax/%eax (depending on ABI) for the return.
2537       // Win32 requires us to put the sret argument to %eax as well.
2538       // Save the argument into a virtual register so that we can access it
2539       // from the return points.
2540       if (Ins[i].Flags.isSRet()) {
2541         unsigned Reg = FuncInfo->getSRetReturnReg();
2542         if (!Reg) {
2543           MVT PtrTy = getPointerTy();
2544           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2545           FuncInfo->setSRetReturnReg(Reg);
2546         }
2547         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2548         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2549         break;
2550       }
2551     }
2552   }
2553
2554   unsigned StackSize = CCInfo.getNextStackOffset();
2555   // Align stack specially for tail calls.
2556   if (FuncIsMadeTailCallSafe(CallConv,
2557                              MF.getTarget().Options.GuaranteedTailCallOpt))
2558     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2559
2560   // If the function takes variable number of arguments, make a frame index for
2561   // the start of the first vararg value... for expansion of llvm.va_start. We
2562   // can skip this if there are no va_start calls.
2563   if (MFI->hasVAStart() &&
2564       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2565                    CallConv != CallingConv::X86_ThisCall))) {
2566     FuncInfo->setVarArgsFrameIndex(
2567         MFI->CreateFixedObject(1, StackSize, true));
2568   }
2569
2570   // Figure out if XMM registers are in use.
2571   assert(!(MF.getTarget().Options.UseSoftFloat &&
2572            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2573                                             Attribute::NoImplicitFloat)) &&
2574          "SSE register cannot be used when SSE is disabled!");
2575
2576   // 64-bit calling conventions support varargs and register parameters, so we
2577   // have to do extra work to spill them in the prologue.
2578   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2579     // Find the first unallocated argument registers.
2580     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2581     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2582     unsigned NumIntRegs =
2583         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2584     unsigned NumXMMRegs =
2585         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2586     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2587            "SSE register cannot be used when SSE is disabled!");
2588
2589     // Gather all the live in physical registers.
2590     SmallVector<SDValue, 6> LiveGPRs;
2591     SmallVector<SDValue, 8> LiveXMMRegs;
2592     SDValue ALVal;
2593     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2594       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2595       LiveGPRs.push_back(
2596           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2597     }
2598     if (!ArgXMMs.empty()) {
2599       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2600       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2601       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2602         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2603         LiveXMMRegs.push_back(
2604             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2605       }
2606     }
2607
2608     if (IsWin64) {
2609       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2610       // Get to the caller-allocated home save location.  Add 8 to account
2611       // for the return address.
2612       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2613       FuncInfo->setRegSaveFrameIndex(
2614           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2615       // Fixup to set vararg frame on shadow area (4 x i64).
2616       if (NumIntRegs < 4)
2617         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2618     } else {
2619       // For X86-64, if there are vararg parameters that are passed via
2620       // registers, then we must store them to their spots on the stack so
2621       // they may be loaded by deferencing the result of va_next.
2622       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2623       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2624       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2625           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2626     }
2627
2628     // Store the integer parameter registers.
2629     SmallVector<SDValue, 8> MemOps;
2630     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2631                                       getPointerTy());
2632     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2633     for (SDValue Val : LiveGPRs) {
2634       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2635                                 DAG.getIntPtrConstant(Offset));
2636       SDValue Store =
2637         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2638                      MachinePointerInfo::getFixedStack(
2639                        FuncInfo->getRegSaveFrameIndex(), Offset),
2640                      false, false, 0);
2641       MemOps.push_back(Store);
2642       Offset += 8;
2643     }
2644
2645     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2646       // Now store the XMM (fp + vector) parameter registers.
2647       SmallVector<SDValue, 12> SaveXMMOps;
2648       SaveXMMOps.push_back(Chain);
2649       SaveXMMOps.push_back(ALVal);
2650       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2651                              FuncInfo->getRegSaveFrameIndex()));
2652       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2653                              FuncInfo->getVarArgsFPOffset()));
2654       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2655                         LiveXMMRegs.end());
2656       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2657                                    MVT::Other, SaveXMMOps));
2658     }
2659
2660     if (!MemOps.empty())
2661       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2662   }
2663
2664   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2665     // Find the largest legal vector type.
2666     MVT VecVT = MVT::Other;
2667     // FIXME: Only some x86_32 calling conventions support AVX512.
2668     if (Subtarget->hasAVX512() &&
2669         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2670                      CallConv == CallingConv::Intel_OCL_BI)))
2671       VecVT = MVT::v16f32;
2672     else if (Subtarget->hasAVX())
2673       VecVT = MVT::v8f32;
2674     else if (Subtarget->hasSSE2())
2675       VecVT = MVT::v4f32;
2676
2677     // We forward some GPRs and some vector types.
2678     SmallVector<MVT, 2> RegParmTypes;
2679     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2680     RegParmTypes.push_back(IntVT);
2681     if (VecVT != MVT::Other)
2682       RegParmTypes.push_back(VecVT);
2683
2684     // Compute the set of forwarded registers. The rest are scratch.
2685     SmallVectorImpl<ForwardedRegister> &Forwards =
2686         FuncInfo->getForwardedMustTailRegParms();
2687     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2688
2689     // Conservatively forward AL on x86_64, since it might be used for varargs.
2690     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2691       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2692       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2693     }
2694
2695     // Copy all forwards from physical to virtual registers.
2696     for (ForwardedRegister &F : Forwards) {
2697       // FIXME: Can we use a less constrained schedule?
2698       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2699       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2700       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2701     }
2702   }
2703
2704   // Some CCs need callee pop.
2705   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2706                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2707     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2708   } else {
2709     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2710     // If this is an sret function, the return should pop the hidden pointer.
2711     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2712         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2713         argsAreStructReturn(Ins) == StackStructReturn)
2714       FuncInfo->setBytesToPopOnReturn(4);
2715   }
2716
2717   if (!Is64Bit) {
2718     // RegSaveFrameIndex is X86-64 only.
2719     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2720     if (CallConv == CallingConv::X86_FastCall ||
2721         CallConv == CallingConv::X86_ThisCall)
2722       // fastcc functions can't have varargs.
2723       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2724   }
2725
2726   FuncInfo->setArgumentStackSize(StackSize);
2727
2728   return Chain;
2729 }
2730
     /// Build the DAG nodes that write one outgoing call argument to its stack
     /// location.
     ///
     /// The destination address is StackPtr plus the memory offset recorded in
     /// VA. An argument whose Flags say byval is handed to
     /// CreateCopyOfByValArgument; any other argument is written with a single
     /// store chained on Chain. The SDValue produced by whichever path ran is
     /// returned.
2731 SDValue
2732 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2733                                     SDValue StackPtr, SDValue Arg,
2734                                     SDLoc dl, SelectionDAG &DAG,
2735                                     const CCValAssign &VA,
2736                                     ISD::ArgFlagsTy Flags) const {
       // Destination address = StackPtr + the argument's assigned memory offset.
2737   unsigned LocMemOffset = VA.getLocMemOffset();
2738   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2739   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
       // byval arguments go through the by-value copy helper instead of a store.
2740   if (Flags.isByVal())
2741     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2742
       // Everything else is stored directly to the computed stack address.
2743   return DAG.getStore(Chain, dl, Arg, PtrOff,
2744                       MachinePointerInfo::getStack(LocMemOffset),
2745                       false, false, 0);
2746 }
2747
2748 /// Emit a load of the current return address. LowerCall calls this when a
2749 /// tail call has to move the return address (isTailCall && FPDiff != 0).
     ///
     /// On return OutRetAddr holds the SDValue produced by DAG.getLoad, i.e. the
     /// loaded return address. The function result is result #1 of that same
     /// load node, which LowerCall installs as its new chain. IsTailCall, Is64Bit
     /// and FPDiff are accepted but not read by this implementation.
2750 SDValue
2751 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2752                                            SDValue &OutRetAddr, SDValue Chain,
2753                                            bool IsTailCall, bool Is64Bit,
2754                                            int FPDiff, SDLoc dl) const {
2755   // Address of the return address slot, from getReturnAddressFrameIndex.
2756   EVT VT = getPointerTy();
2757   OutRetAddr = getReturnAddressFrameIndex(DAG);
2758
2759   // Load the "old" Return address.
2760   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2761                            false, false, false, 0);
       // Hand back the second result of the load node for the caller to chain on.
2762   return SDValue(OutRetAddr.getNode(), 1);
2763 }
2764
2765 /// Emit a store of the return address if tail call
2766 /// optimization is performed and it is required (FPDiff!=0).
     ///
     /// When FPDiff is zero nothing is emitted and Chain is returned unchanged.
     /// Otherwise a fixed stack object of SlotSize bytes is created at offset
     /// FPDiff - SlotSize, RetAddrFrIdx is stored into it, and the store is
     /// returned as the new chain. Despite its name, the RetAddrFrIdx that
     /// LowerCall passes in is the value filled in by EmitTailCallLoadRetAddr,
     /// i.e. the loaded return address rather than a frame index.
2767 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2768                                         SDValue Chain, SDValue RetAddrFrIdx,
2769                                         EVT PtrVT, unsigned SlotSize,
2770                                         int FPDiff, SDLoc dl) {
2771   // Store the return address to the appropriate stack slot.
       // No displacement means the return address stays where it is.
2772   if (!FPDiff) return Chain;
2773   // Calculate the new stack slot for the return address.
       // FPDiff is widened to int64_t before SlotSize is subtracted.
2774   int NewReturnAddrFI =
2775     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2776                                          false);
2777   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2778   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2779                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2780                        false, false, 0);
2781   return Chain;
2782 }
2783
     /// Lower the outgoing call described by CLI into SelectionDAG nodes.
     ///
     /// Rough order of work, as laid out in the body below:
     ///   1. Decide whether the call is emitted as a tail call (musttail forces
     ///      it; otherwise IsEligibleForTailCallOptimization decides) and whether
     ///      it is a sibcall. The decision is written back to CLI.IsTailCall.
     ///   2. Assign locations to the operands with CC_X86 and derive NumBytes
     ///      and FPDiff from the result.
     ///   3. Emit CALLSEQ_START (skipped for sibcalls), extend/convert each
     ///      argument and route it to a register copy or a stack store.
     ///   4. Add the implicit register operands: EBX for GOT-style PIC, AL for
     ///      64-bit non-Win64 varargs, and the forwarded registers for musttail
     ///      varargs calls.
     ///   5. Rewrite the callee into a target node where applicable and build
     ///      the operand list of the call node.
     ///   6. Emit X86ISD::TC_RETURN for tail calls, or X86ISD::CALL followed by
     ///      CALLSEQ_END and LowerCallResult otherwise.
     ///
     /// Returns the TC_RETURN node for tail calls, otherwise whatever
     /// LowerCallResult returns; InVals is passed through to LowerCallResult.
2784 SDValue
2785 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2786                              SmallVectorImpl<SDValue> &InVals) const {
2787   SelectionDAG &DAG                     = CLI.DAG;
2788   SDLoc &dl                             = CLI.DL;
2789   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2790   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2791   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2792   SDValue Chain                         = CLI.Chain;
2793   SDValue Callee                        = CLI.Callee;
2794   CallingConv::ID CallConv              = CLI.CallConv;
       // Bound by reference: updates below are visible through CLI.IsTailCall.
2795   bool &isTailCall                      = CLI.IsTailCall;
2796   bool isVarArg                         = CLI.IsVarArg;
2797
2798   MachineFunction &MF = DAG.getMachineFunction();
2799   bool Is64Bit        = Subtarget->is64Bit();
2800   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2801   StructReturnType SR = callIsStructReturn(Outs);
2802   bool IsSibcall      = false;
2803   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2804
2805   if (MF.getTarget().Options.DisableTailCalls)
2806     isTailCall = false;
2807
2808   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2809   if (IsMustTail) {
2810     // Force this to be a tail call.  The verifier rules are enough to ensure
2811     // that we can lower this successfully without moving the return address
2812     // around.
2813     isTailCall = true;
2814   } else if (isTailCall) {
2815     // Check if it's really possible to do a tail call.
2816     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2817                     isVarArg, SR != NotStructReturn,
2818                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2819                     Outs, OutVals, Ins, DAG);
2820
2821     // Sibcalls are automatically detected tailcalls which do not require
2822     // ABI changes.
2823     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2824       IsSibcall = true;
2825
2826     if (isTailCall)
2827       ++NumTailCalls;
2828   }
2829
2830   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2831          "Var args not supported with calling convention fastcc, ghc or hipe");
2832
2833   // Analyze operands of the call, assigning locations to each operand.
2834   SmallVector<CCValAssign, 16> ArgLocs;
2835   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2836
2837   // Allocate shadow area for Win64
2838   if (IsWin64)
2839     CCInfo.AllocateStack(32, 8);
2840
2841   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2842
2843   // Get a count of how many bytes are to be pushed on the stack.
2844   unsigned NumBytes = CCInfo.getNextStackOffset();
2845   if (IsSibcall)
2846     // This is a sibcall. The memory operands are available in caller's
2847     // own caller's stack.
2848     NumBytes = 0;
2849   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2850            IsTailCallConvention(CallConv))
2851     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2852
       // FPDiff stays zero for sibcalls, musttail calls and ordinary calls.
2853   int FPDiff = 0;
2854   if (isTailCall && !IsSibcall && !IsMustTail) {
2855     // Lower arguments at fp - stackoffset + fpdiff.
2856     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2857
2858     FPDiff = NumBytesCallerPushed - NumBytes;
2859
2860     // Set the delta of movement of the returnaddr stackslot.
2861     // But only set if delta is greater than previous delta.
         // (The stored delta is replaced only by a numerically smaller FPDiff.)
2862     if (FPDiff < X86Info->getTCReturnAddrDelta())
2863       X86Info->setTCReturnAddrDelta(FPDiff);
2864   }
2865
2866   unsigned NumBytesToPush = NumBytes;
2867   unsigned NumBytesToPop = NumBytes;
2868
2869   // If we have an inalloca argument, all stack space has already been allocated
2870   // for us and is right at the top of the stack.  We don't support multiple
2871   // arguments passed in memory when using inalloca.
2872   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2873     NumBytesToPush = 0;
2874     if (!ArgLocs.back().isMemLoc())
2875       report_fatal_error("cannot use inalloca attribute on a register "
2876                          "parameter");
2877     if (ArgLocs.back().getLocMemOffset() != 0)
2878       report_fatal_error("any parameter with the inalloca attribute must be "
2879                          "the only memory argument");
2880   }
2881
2882   if (!IsSibcall)
2883     Chain = DAG.getCALLSEQ_START(
2884         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2885
2886   SDValue RetAddrFrIdx;
2887   // Load return address for tail calls.
2888   if (isTailCall && FPDiff)
2889     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2890                                     Is64Bit, FPDiff, dl);
2891
2892   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2893   SmallVector<SDValue, 8> MemOpChains;
2894   SDValue StackPtr;
2895
2896   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2897   // of tail call optimization arguments are handled later.
2898   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2899   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2900     // Skip inalloca arguments, they have already been written.
2901     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2902     if (Flags.isInAlloca())
2903       continue;
2904
2905     CCValAssign &VA = ArgLocs[i];
2906     EVT RegVT = VA.getLocVT();
2907     SDValue Arg = OutVals[i];
2908     bool isByVal = Flags.isByVal();
2909
2910     // Promote the value if needed.
2911     switch (VA.getLocInfo()) {
2912     default: llvm_unreachable("Unknown loc info!");
2913     case CCValAssign::Full: break;
2914     case CCValAssign::SExt:
2915       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2916       break;
2917     case CCValAssign::ZExt:
2918       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2919       break;
2920     case CCValAssign::AExt:
2921       if (RegVT.is128BitVector()) {
2922         // Special case: passing MMX values in XMM registers.
2923         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2924         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2925         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2926       } else
2927         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2928       break;
2929     case CCValAssign::BCvt:
2930       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2931       break;
2932     case CCValAssign::Indirect: {
2933       // Store the argument.
           // The spill slot's address then replaces the argument value itself.
2934       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2935       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2936       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2937                            MachinePointerInfo::getFixedStack(FI),
2938                            false, false, 0);
2939       Arg = SpillSlot;
2940       break;
2941     }
2942     }
2943
2944     if (VA.isRegLoc()) {
2945       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2946       if (isVarArg && IsWin64) {
2947         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2948         // shadow reg if callee is a varargs function.
2949         unsigned ShadowReg = 0;
2950         switch (VA.getLocReg()) {
2951         case X86::XMM0: ShadowReg = X86::RCX; break;
2952         case X86::XMM1: ShadowReg = X86::RDX; break;
2953         case X86::XMM2: ShadowReg = X86::R8; break;
2954         case X86::XMM3: ShadowReg = X86::R9; break;
2955         }
2956         if (ShadowReg)
2957           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2958       }
2959     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Memory arguments of non-sibcalls are stored here unless this is a
           // tail call; byval arguments are stored here even for tail calls.
2960       assert(VA.isMemLoc());
2961       if (!StackPtr.getNode())
2962         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2963                                       getPointerTy());
2964       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2965                                              dl, DAG, VA, Flags));
2966     }
2967   }
2968
2969   if (!MemOpChains.empty())
2970     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2971
2972   if (Subtarget->isPICStyleGOT()) {
2973     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2974     // GOT pointer.
2975     if (!isTailCall) {
2976       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2977                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2978     } else {
2979       // If we are tail calling and generating PIC/GOT style code load the
2980       // address of the callee into ECX. The value in ecx is used as target of
2981       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2982       // for tail calls on PIC/GOT architectures. Normally we would just put the
2983       // address of GOT into ebx and then call target@PLT. But for tail calls
2984       // ebx would be restored (since ebx is callee saved) before jumping to the
2985       // target@PLT.
2986
2987       // Note: The actual moving to ECX is done further down.
2988       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2989       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2990           !G->getGlobal()->hasProtectedVisibility())
2991         Callee = LowerGlobalAddress(Callee, DAG);
2992       else if (isa<ExternalSymbolSDNode>(Callee))
2993         Callee = LowerExternalSymbol(Callee, DAG);
2994     }
2995   }
2996
2997   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2998     // From AMD64 ABI document:
2999     // For calls that may call functions that use varargs or stdargs
3000     // (prototype-less calls or calls to functions containing ellipsis (...) in
3001     // the declaration) %al is used as hidden argument to specify the number
3002     // of SSE registers used. The contents of %al do not need to match exactly
3003     // the number of registers, but must be an upper bound on the number of SSE
3004     // registers used and is in the range 0 - 8 inclusive.
3005
3006     // Count the number of XMM registers allocated.
3007     static const MCPhysReg XMMArgRegs[] = {
3008       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3009       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3010     };
3011     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3012     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3013            && "SSE registers cannot be used when SSE is disabled");
3014
3015     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3016                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3017   }
3018
       // musttail varargs calls pass on the registers recorded in
       // X86Info->getForwardedMustTailRegParms().
3019   if (isVarArg && IsMustTail) {
3020     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3021     for (const auto &F : Forwards) {
3022       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3023       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3024     }
3025   }
3026
3027   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3028   // don't need this because the eligibility check rejects calls that require
3029   // shuffling arguments passed in memory.
3030   if (!IsSibcall && isTailCall) {
3031     // Force all the incoming stack arguments to be loaded from the stack
3032     // before any new outgoing arguments are stored to the stack, because the
3033     // outgoing stack slots may alias the incoming argument stack slots, and
3034     // the alias isn't otherwise explicit. This is slightly more conservative
3035     // than necessary, because it means that each store effectively depends
3036     // on every argument instead of just those arguments it would clobber.
3037     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3038
3039     SmallVector<SDValue, 8> MemOpChains2;
3040     SDValue FIN;
3041     int FI = 0;
3042     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3043       CCValAssign &VA = ArgLocs[i];
3044       if (VA.isRegLoc())
3045         continue;
3046       assert(VA.isMemLoc());
3047       SDValue Arg = OutVals[i];
3048       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3049       // Skip inalloca arguments.  They don't require any work.
3050       if (Flags.isInAlloca())
3051         continue;
3052       // Create frame index.
           // The fixed object sits at the argument's offset shifted by FPDiff; its
           // size is the location type's bit width rounded up to whole bytes.
3053       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3054       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3055       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3056       FIN = DAG.getFrameIndex(FI, getPointerTy());
3057
3058       if (Flags.isByVal()) {
3059         // Copy relative to framepointer.
3060         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3061         if (!StackPtr.getNode())
3062           StackPtr = DAG.getCopyFromReg(Chain, dl,
3063                                         RegInfo->getStackRegister(),
3064                                         getPointerTy());
3065         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3066
3067         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3068                                                          ArgChain,
3069                                                          Flags, DAG, dl));
3070       } else {
3071         // Store relative to framepointer.
3072         MemOpChains2.push_back(
3073           DAG.getStore(ArgChain, dl, Arg, FIN,
3074                        MachinePointerInfo::getFixedStack(FI),
3075                        false, false, 0));
3076       }
3077     }
3078
3079     if (!MemOpChains2.empty())
3080       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3081
3082     // Store the return address to the appropriate stack slot.
         // (EmitTailCallStoreRetAddr returns Chain unchanged when FPDiff is 0.)
3083     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3084                                      getPointerTy(), RegInfo->getSlotSize(),
3085                                      FPDiff, dl);
3086   }
3087
3088   // Build a sequence of copy-to-reg nodes chained together with token chain
3089   // and flag operands which copy the outgoing args into registers.
3090   SDValue InFlag;
3091   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3092     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3093                              RegsToPass[i].second, InFlag);
3094     InFlag = Chain.getValue(1);
3095   }
3096
3097   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3098     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3099     // In the 64-bit large code model, we have to make all calls
3100     // through a register, since the call instruction's 32-bit
3101     // pc-relative offset may not be large enough to hold the whole
3102     // address.
3103   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3104     // If the callee is a GlobalAddress node (quite common, every direct call
3105     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3106     // it.
3107     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3108
3109     // We should use extra load for direct calls to dllimported functions in
3110     // non-JIT mode.
3111     const GlobalValue *GV = G->getGlobal();
3112     if (!GV->hasDLLImportStorageClass()) {
3113       unsigned char OpFlags = 0;
3114       bool ExtraLoad = false;
3115       unsigned WrapperKind = ISD::DELETED_NODE;
3116
3117       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3118       // external symbols must go through the PLT in PIC mode.  If the symbol
3119       // has hidden or protected visibility, or if it is static or local, then
3120       // we don't need to use the PLT - we can directly call it.
3121       if (Subtarget->isTargetELF() &&
3122           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3123           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3124         OpFlags = X86II::MO_PLT;
3125       } else if (Subtarget->isPICStyleStubAny() &&
3126                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3127                  (!Subtarget->getTargetTriple().isMacOSX() ||
3128                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3129         // PC-relative references to external symbols should go through $stub,
3130         // unless we're building with the leopard linker or later, which
3131         // automatically synthesizes these stubs.
3132         OpFlags = X86II::MO_DARWIN_STUB;
3133       } else if (Subtarget->isPICStyleRIPRel() &&
3134                  isa<Function>(GV) &&
3135                  cast<Function>(GV)->getAttributes().
3136                    hasAttribute(AttributeSet::FunctionIndex,
3137                                 Attribute::NonLazyBind)) {
3138         // If the function is marked as non-lazy, generate an indirect call
3139         // which loads from the GOT directly. This avoids runtime overhead
3140         // at the cost of eager binding (and one extra byte of encoding).
3141         OpFlags = X86II::MO_GOTPCREL;
3142         WrapperKind = X86ISD::WrapperRIP;
3143         ExtraLoad = true;
3144       }
3145
3146       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3147                                           G->getOffset(), OpFlags);
3148
3149       // Add a wrapper if needed.
           // (WrapperKind is only ever set to X86ISD::WrapperRIP above, so the
           // node is built with that opcode directly.)
3150       if (WrapperKind != ISD::DELETED_NODE)
3151         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3152       // Add extra indirection if needed.
3153       if (ExtraLoad)
3154         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3155                              MachinePointerInfo::getGOT(),
3156                              false, false, false, 0);
3157     }
3158   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3159     unsigned char OpFlags = 0;
3160
3161     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3162     // external symbols should go through the PLT.
3163     if (Subtarget->isTargetELF() &&
3164         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3165       OpFlags = X86II::MO_PLT;
3166     } else if (Subtarget->isPICStyleStubAny() &&
3167                (!Subtarget->getTargetTriple().isMacOSX() ||
3168                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3169       // PC-relative references to external symbols should go through $stub,
3170       // unless we're building with the leopard linker or later, which
3171       // automatically synthesizes these stubs.
3172       OpFlags = X86II::MO_DARWIN_STUB;
3173     }
3174
3175     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3176                                          OpFlags);
3177   } else if (Subtarget->isTarget64BitILP32() &&
3178              Callee->getValueType(0) == MVT::i32) {
3179     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3180     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3181   }
3182
3183   // Returns a chain & a flag for retval copy to use.
3184   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3185   SmallVector<SDValue, 8> Ops;
3186
       // For non-sibcall tail calls the call sequence is closed here, before the
       // TC_RETURN node is created.
3187   if (!IsSibcall && isTailCall) {
3188     Chain = DAG.getCALLSEQ_END(Chain,
3189                                DAG.getIntPtrConstant(NumBytesToPop, true),
3190                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3191     InFlag = Chain.getValue(1);
3192   }
3193
3194   Ops.push_back(Chain);
3195   Ops.push_back(Callee);
3196
       // Tail calls carry FPDiff as an extra i32 constant operand.
3197   if (isTailCall)
3198     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3199
3200   // Add argument registers to the end of the list so that they are known live
3201   // into the call.
3202   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3203     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3204                                   RegsToPass[i].second.getValueType()));
3205
3206   // Add a register mask operand representing the call-preserved registers.
3207   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3208   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3209   assert(Mask && "Missing call preserved mask for calling convention");
3210   Ops.push_back(DAG.getRegisterMask(Mask));
3211
3212   if (InFlag.getNode())
3213     Ops.push_back(InFlag);
3214
3215   if (isTailCall) {
3216     // We used to do:
3217     //// If this is the first return lowered for this function, add the regs
3218     //// to the liveout set for the function.
3219     // This isn't right, although it's probably harmless on x86; liveouts
3220     // should be computed from returns not tail calls.  Consider a void
3221     // function making a tail call to a function returning int.
3222     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3223   }
3224
3225   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3226   InFlag = Chain.getValue(1);
3227
3228   // Create the CALLSEQ_END node.
3229   unsigned NumBytesForCalleeToPop;
3230   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3231                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3232     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3233   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3234            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3235            SR == StackStructReturn)
3236     // If this is a call to a struct-return function, the callee
3237     // pops the hidden struct pointer, so we have to push it back.
3238     // This is common for Darwin/X86, Linux & Mingw32 targets.
3239     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3240     NumBytesForCalleeToPop = 4;
3241   else
3242     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3243
3244   // Returns a flag for retval copy to use.
3245   if (!IsSibcall) {
3246     Chain = DAG.getCALLSEQ_END(Chain,
3247                                DAG.getIntPtrConstant(NumBytesToPop, true),
3248                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3249                                                      true),
3250                                InFlag, dl);
3251     InFlag = Chain.getValue(1);
3252   }
3253
3254   // Handle result values, copying them out of physregs into vregs that we
3255   // return.
3256   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3257                          Ins, dl, DAG, InVals);
3258 }
3259
3260 //===----------------------------------------------------------------------===//
3261 //                Fast Calling Convention (tail call) implementation
3262 //===----------------------------------------------------------------------===//
3263
//  Like stdcall, this is a callee-cleans-arguments convention, except that ECX
//  is reserved for storing the tail called function address. Only 2 registers
//  are free for argument passing (inreg). Tail call optimization is performed
//  provided:
3268 //                * tailcallopt is enabled
3269 //                * caller/callee are fastcc
3270 //  On X86_64 architecture with GOT-style position independent code only local
3271 //  (within module) calls are supported at the moment.
3272 //  To keep the stack aligned according to platform abi the function
3273 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3274 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3275 //  If a tail called function callee has more arguments than the caller the
3276 //  caller needs to make sure that there is room to move the RETADDR to. This is
3277 //  achieved by reserving an area the size of the argument delta right after the
3278 //  original RETADDR, but before the saved framepointer or the spilled registers
3279 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3280 //  stack layout:
3281 //    arg1
3282 //    arg2
3283 //    RETADDR
3284 //    [ new RETADDR
3285 //      move area ]
3286 //    (possible EBP)
3287 //    ESI
3288 //    EDI
3289 //    local1 ..
3290
3291 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3292 /// for a 16 byte align requirement.
3293 unsigned
3294 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3295                                                SelectionDAG& DAG) const {
3296   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3297   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3298   unsigned StackAlignment = TFI.getStackAlignment();
3299   uint64_t AlignMask = StackAlignment - 1;
3300   int64_t Offset = StackSize;
3301   unsigned SlotSize = RegInfo->getSlotSize();
3302   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3303     // Number smaller than 12 so just add the difference.
3304     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3305   } else {
3306     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3307     Offset = ((~AlignMask) & Offset) + StackAlignment +
3308       (StackAlignment-SlotSize);
3309   }
3310   return Offset;
3311 }
3312
3313 /// MatchingStackOffset - Return true if the given stack call argument is
3314 /// already available in the same position (relatively) of the caller's
3315 /// incoming argument stack.
3316 static
3317 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3318                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3319                          const X86InstrInfo *TII) {
3320   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3321   int FI = INT_MAX;
3322   if (Arg.getOpcode() == ISD::CopyFromReg) {
3323     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3324     if (!TargetRegisterInfo::isVirtualRegister(VR))
3325       return false;
3326     MachineInstr *Def = MRI->getVRegDef(VR);
3327     if (!Def)
3328       return false;
3329     if (!Flags.isByVal()) {
3330       if (!TII->isLoadFromStackSlot(Def, FI))
3331         return false;
3332     } else {
3333       unsigned Opcode = Def->getOpcode();
3334       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3335            Opcode == X86::LEA64_32r) &&
3336           Def->getOperand(1).isFI()) {
3337         FI = Def->getOperand(1).getIndex();
3338         Bytes = Flags.getByValSize();
3339       } else
3340         return false;
3341     }
3342   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3343     if (Flags.isByVal())
3344       // ByVal argument is passed in as a pointer but it's now being
3345       // dereferenced. e.g.
3346       // define @foo(%struct.X* %A) {
3347       //   tail call @bar(%struct.X* byval %A)
3348       // }
3349       return false;
3350     SDValue Ptr = Ld->getBasePtr();
3351     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3352     if (!FINode)
3353       return false;
3354     FI = FINode->getIndex();
3355   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3356     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3357     FI = FINode->getIndex();
3358     Bytes = Flags.getByValSize();
3359   } else
3360     return false;
3361
3362   assert(FI != INT_MAX);
3363   if (!MFI->isFixedObjectIndex(FI))
3364     return false;
3365   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3366 }
3367
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// When GuaranteedTailCallOpt is set, the call is eligible exactly when the
/// callee uses a tail call convention and the caller uses the same convention.
/// Otherwise the call is only accepted as a sibcall, i.e. when none of the
/// checks below rejects it: dynamic stack realignment, struct-return on either
/// side, a stdcall/thiscall caller with a different callee convention, vararg
/// operands that are not all in registers, unused results living on the x87
/// stack, result locations that differ between the two conventions, stack
/// arguments that are not already in place, and (32-bit only) too many
/// argument registers taken from EAX/EDX/ECX.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only tail call conventions and the C convention are considered at all.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // If -tailcallopt is specified, make fastcc functions tail-callable. In
  // that mode nothing else is accepted: the sibcall checks below are skipped.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: reject if any result is assigned to
    // FP0 / FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // RVLocs1: result locations under the callee's convention.
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // RVLocs2: result locations under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Every result must have the same kind of location, the same LocInfo and
    // the same register / memory offset under both conventions.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      // Note: this non-const MF intentionally shadows the const reference
      // declared at the top of the function.
      MachineFunction &MF = DAG.getMachineFunction();
      // Reject if the caller itself pops bytes on return.
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count arguments assigned to EAX/EDX/ECX; reaching MaxInRegs means no
      // register is left over for the call address.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3571
3572 FastISel *
3573 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3574                                   const TargetLibraryInfo *libInfo) const {
3575   return X86::createFastISel(funcInfo, libInfo);
3576 }
3577
3578 //===----------------------------------------------------------------------===//
3579 //                           Other Lowering Hooks
3580 //===----------------------------------------------------------------------===//
3581
3582 static bool MayFoldLoad(SDValue Op) {
3583   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3584 }
3585
3586 static bool MayFoldIntoStore(SDValue Op) {
3587   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3588 }
3589
3590 static bool isTargetShuffle(unsigned Opcode) {
3591   switch(Opcode) {
3592   default: return false;
3593   case X86ISD::BLENDI:
3594   case X86ISD::PSHUFB:
3595   case X86ISD::PSHUFD:
3596   case X86ISD::PSHUFHW:
3597   case X86ISD::PSHUFLW:
3598   case X86ISD::SHUFP:
3599   case X86ISD::PALIGNR:
3600   case X86ISD::MOVLHPS:
3601   case X86ISD::MOVLHPD:
3602   case X86ISD::MOVHLPS:
3603   case X86ISD::MOVLPS:
3604   case X86ISD::MOVLPD:
3605   case X86ISD::MOVSHDUP:
3606   case X86ISD::MOVSLDUP:
3607   case X86ISD::MOVDDUP:
3608   case X86ISD::MOVSS:
3609   case X86ISD::MOVSD:
3610   case X86ISD::UNPCKL:
3611   case X86ISD::UNPCKH:
3612   case X86ISD::VPERMILPI:
3613   case X86ISD::VPERM2X128:
3614   case X86ISD::VPERMI:
3615     return true;
3616   }
3617 }
3618
3619 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3620                                     SDValue V1, SelectionDAG &DAG) {
3621   switch(Opc) {
3622   default: llvm_unreachable("Unknown x86 shuffle node");
3623   case X86ISD::MOVSHDUP:
3624   case X86ISD::MOVSLDUP:
3625   case X86ISD::MOVDDUP:
3626     return DAG.getNode(Opc, dl, VT, V1);
3627   }
3628 }
3629
3630 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3631                                     SDValue V1, unsigned TargetMask,
3632                                     SelectionDAG &DAG) {
3633   switch(Opc) {
3634   default: llvm_unreachable("Unknown x86 shuffle node");
3635   case X86ISD::PSHUFD:
3636   case X86ISD::PSHUFHW:
3637   case X86ISD::PSHUFLW:
3638   case X86ISD::VPERMILPI:
3639   case X86ISD::VPERMI:
3640     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3641   }
3642 }
3643
3644 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3645                                     SDValue V1, SDValue V2, unsigned TargetMask,
3646                                     SelectionDAG &DAG) {
3647   switch(Opc) {
3648   default: llvm_unreachable("Unknown x86 shuffle node");
3649   case X86ISD::PALIGNR:
3650   case X86ISD::VALIGN:
3651   case X86ISD::SHUFP:
3652   case X86ISD::VPERM2X128:
3653     return DAG.getNode(Opc, dl, VT, V1, V2,
3654                        DAG.getConstant(TargetMask, MVT::i8));
3655   }
3656 }
3657
3658 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3659                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3660   switch(Opc) {
3661   default: llvm_unreachable("Unknown x86 shuffle node");
3662   case X86ISD::MOVLHPS:
3663   case X86ISD::MOVLHPD:
3664   case X86ISD::MOVHLPS:
3665   case X86ISD::MOVLPS:
3666   case X86ISD::MOVLPD:
3667   case X86ISD::MOVSS:
3668   case X86ISD::MOVSD:
3669   case X86ISD::UNPCKL:
3670   case X86ISD::UNPCKH:
3671     return DAG.getNode(Opc, dl, VT, V1, V2);
3672   }
3673 }
3674
3675 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3676   MachineFunction &MF = DAG.getMachineFunction();
3677   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3678   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3679   int ReturnAddrIndex = FuncInfo->getRAIndex();
3680
3681   if (ReturnAddrIndex == 0) {
3682     // Set up a frame object for the return address.
3683     unsigned SlotSize = RegInfo->getSlotSize();
3684     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3685                                                            -(int64_t)SlotSize,
3686                                                            false);
3687     FuncInfo->setRAIndex(ReturnAddrIndex);
3688   }
3689
3690   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3691 }
3692
3693 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3694                                        bool hasSymbolicDisplacement) {
3695   // Offset should fit into 32 bit immediate field.
3696   if (!isInt<32>(Offset))
3697     return false;
3698
3699   // If we don't have a symbolic displacement - we don't have any extra
3700   // restrictions.
3701   if (!hasSymbolicDisplacement)
3702     return true;
3703
3704   // FIXME: Some tweaks might be needed for medium code model.
3705   if (M != CodeModel::Small && M != CodeModel::Kernel)
3706     return false;
3707
3708   // For small code model we assume that latest object is 16MB before end of 31
3709   // bits boundary. We may also accept pretty large negative constants knowing
3710   // that all objects are in the positive half of address space.
3711   if (M == CodeModel::Small && Offset < 16*1024*1024)
3712     return true;
3713
3714   // For kernel code model we know that all object resist in the negative half
3715   // of 32bits address space. We may not accept negative offsets, since they may
3716   // be just off and we may accept pretty large positive ones.
3717   if (M == CodeModel::Kernel && Offset >= 0)
3718     return true;
3719
3720   return false;
3721 }
3722
3723 /// isCalleePop - Determines whether the callee is required to pop its
3724 /// own arguments. Callee pop is necessary to support tail calls.
3725 bool X86::isCalleePop(CallingConv::ID CallingConv,
3726                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3727   switch (CallingConv) {
3728   default:
3729     return false;
3730   case CallingConv::X86_StdCall:
3731   case CallingConv::X86_FastCall:
3732   case CallingConv::X86_ThisCall:
3733     return !is64Bit;
3734   case CallingConv::Fast:
3735   case CallingConv::GHC:
3736   case CallingConv::HiPE:
3737     if (IsVarArg)
3738       return false;
3739     return TailCallOpt;
3740   }
3741 }
3742
3743 /// \brief Return true if the condition is an unsigned comparison operation.
3744 static bool isX86CCUnsigned(unsigned X86CC) {
3745   switch (X86CC) {
3746   default: llvm_unreachable("Invalid integer condition!");
3747   case X86::COND_E:     return true;
3748   case X86::COND_G:     return false;
3749   case X86::COND_GE:    return false;
3750   case X86::COND_L:     return false;
3751   case X86::COND_LE:    return false;
3752   case X86::COND_NE:    return true;
3753   case X86::COND_B:     return true;
3754   case X86::COND_A:     return true;
3755   case X86::COND_BE:    return true;
3756   case X86::COND_AE:    return true;
3757   }
3758   llvm_unreachable("covered switch fell through?!");
3759 }
3760
3761 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3762 /// specific condition code, returning the condition code and the LHS/RHS of the
3763 /// comparison to make.
3764 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3765                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3766   if (!isFP) {
3767     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3768       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3769         // X > -1   -> X == 0, jump !sign.
3770         RHS = DAG.getConstant(0, RHS.getValueType());
3771         return X86::COND_NS;
3772       }
3773       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3774         // X < 0   -> X == 0, jump on sign.
3775         return X86::COND_S;
3776       }
3777       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3778         // X < 1   -> X <= 0
3779         RHS = DAG.getConstant(0, RHS.getValueType());
3780         return X86::COND_LE;
3781       }
3782     }
3783
3784     switch (SetCCOpcode) {
3785     default: llvm_unreachable("Invalid integer condition!");
3786     case ISD::SETEQ:  return X86::COND_E;
3787     case ISD::SETGT:  return X86::COND_G;
3788     case ISD::SETGE:  return X86::COND_GE;
3789     case ISD::SETLT:  return X86::COND_L;
3790     case ISD::SETLE:  return X86::COND_LE;
3791     case ISD::SETNE:  return X86::COND_NE;
3792     case ISD::SETULT: return X86::COND_B;
3793     case ISD::SETUGT: return X86::COND_A;
3794     case ISD::SETULE: return X86::COND_BE;
3795     case ISD::SETUGE: return X86::COND_AE;
3796     }
3797   }
3798
3799   // First determine if it is required or is profitable to flip the operands.
3800
3801   // If LHS is a foldable load, but RHS is not, flip the condition.
3802   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3803       !ISD::isNON_EXTLoad(RHS.getNode())) {
3804     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3805     std::swap(LHS, RHS);
3806   }
3807
3808   switch (SetCCOpcode) {
3809   default: break;
3810   case ISD::SETOLT:
3811   case ISD::SETOLE:
3812   case ISD::SETUGT:
3813   case ISD::SETUGE:
3814     std::swap(LHS, RHS);
3815     break;
3816   }
3817
3818   // On a floating point condition, the flags are set as follows:
3819   // ZF  PF  CF   op
3820   //  0 | 0 | 0 | X > Y
3821   //  0 | 0 | 1 | X < Y
3822   //  1 | 0 | 0 | X == Y
3823   //  1 | 1 | 1 | unordered
3824   switch (SetCCOpcode) {
3825   default: llvm_unreachable("Condcode should be pre-legalized away");
3826   case ISD::SETUEQ:
3827   case ISD::SETEQ:   return X86::COND_E;
3828   case ISD::SETOLT:              // flipped
3829   case ISD::SETOGT:
3830   case ISD::SETGT:   return X86::COND_A;
3831   case ISD::SETOLE:              // flipped
3832   case ISD::SETOGE:
3833   case ISD::SETGE:   return X86::COND_AE;
3834   case ISD::SETUGT:              // flipped
3835   case ISD::SETULT:
3836   case ISD::SETLT:   return X86::COND_B;
3837   case ISD::SETUGE:              // flipped
3838   case ISD::SETULE:
3839   case ISD::SETLE:   return X86::COND_BE;
3840   case ISD::SETONE:
3841   case ISD::SETNE:   return X86::COND_NE;
3842   case ISD::SETUO:   return X86::COND_P;
3843   case ISD::SETO:    return X86::COND_NP;
3844   case ISD::SETOEQ:
3845   case ISD::SETUNE:  return X86::COND_INVALID;
3846   }
3847 }
3848
3849 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3850 /// code. Current x86 isa includes the following FP cmov instructions:
3851 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3852 static bool hasFPCMov(unsigned X86CC) {
3853   switch (X86CC) {
3854   default:
3855     return false;
3856   case X86::COND_B:
3857   case X86::COND_BE:
3858   case X86::COND_E:
3859   case X86::COND_P:
3860   case X86::COND_A:
3861   case X86::COND_AE:
3862   case X86::COND_NE:
3863   case X86::COND_NP:
3864     return true;
3865   }
3866 }
3867
3868 /// isFPImmLegal - Returns true if the target can instruction select the
3869 /// specified FP immediate natively. If false, the legalizer will
3870 /// materialize the FP immediate as a load from a constant pool.
3871 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3872   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3873     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3874       return true;
3875   }
3876   return false;
3877 }
3878
3879 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3880                                               ISD::LoadExtType ExtTy,
3881                                               EVT NewVT) const {
3882   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3883   // relocation target a movq or addq instruction: don't let the load shrink.
3884   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3885   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3886     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3887       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3888   return true;
3889 }
3890
3891 /// \brief Returns true if it is beneficial to convert a load of a constant
3892 /// to just the constant itself.
3893 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3894                                                           Type *Ty) const {
3895   assert(Ty->isIntegerTy());
3896
3897   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3898   if (BitSize == 0 || BitSize > 64)
3899     return false;
3900   return true;
3901 }
3902
3903 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3904                                                 unsigned Index) const {
3905   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3906     return false;
3907
3908   return (Index == 0 || Index == ResVT.getVectorNumElements());
3909 }
3910
3911 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3912   // Speculate cttz only if we can directly use TZCNT.
3913   return Subtarget->hasBMI();
3914 }
3915
3916 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3917   // Speculate ctlz only if we can directly use LZCNT.
3918   return Subtarget->hasLZCNT();
3919 }
3920
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3926
/// isUndefOrEqual - Return true if Val is undef (less than zero) or equal to
/// CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3932
3933 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3934 /// from position Pos and ending in Pos+Size, falls within the specified
3935 /// sequential range (Low, Low+Size]. or is undef.
3936 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3937                                        unsigned Pos, unsigned Size, int Low) {
3938   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3939     if (!isUndefOrEqual(Mask[i], Low))
3940       return false;
3941   return true;
3942 }
3943
3944 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3945 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3946 /// operand - by default will match for first operand.
3947 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3948                          bool TestSecondOperand = false) {
3949   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3950       VT != MVT::v2f64 && VT != MVT::v2i64)
3951     return false;
3952
3953   unsigned NumElems = VT.getVectorNumElements();
3954   unsigned Lo = TestSecondOperand ? NumElems : 0;
3955   unsigned Hi = Lo + NumElems;
3956
3957   for (unsigned i = 0; i < NumElems; ++i)
3958     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3959       return false;
3960
3961   return true;
3962 }
3963
3964 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3965 /// is suitable for input to PSHUFHW.
3966 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3967   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3968     return false;
3969
3970   // Lower quadword copied in order or undef.
3971   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3972     return false;
3973
3974   // Upper quadword shuffled.
3975   for (unsigned i = 4; i != 8; ++i)
3976     if (!isUndefOrInRange(Mask[i], 4, 8))
3977       return false;
3978
3979   if (VT == MVT::v16i16) {
3980     // Lower quadword copied in order or undef.
3981     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3982       return false;
3983
3984     // Upper quadword shuffled.
3985     for (unsigned i = 12; i != 16; ++i)
3986       if (!isUndefOrInRange(Mask[i], 12, 16))
3987         return false;
3988   }
3989
3990   return true;
3991 }
3992
3993 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3994 /// is suitable for input to PSHUFLW.
3995 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3996   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3997     return false;
3998
3999   // Upper quadword copied in order.
4000   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4001     return false;
4002
4003   // Lower quadword shuffled.
4004   for (unsigned i = 0; i != 4; ++i)
4005     if (!isUndefOrInRange(Mask[i], 0, 4))
4006       return false;
4007
4008   if (VT == MVT::v16i16) {
4009     // Upper quadword copied in order.
4010     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4011       return false;
4012
4013     // Lower quadword shuffled.
4014     for (unsigned i = 8; i != 12; ++i)
4015       if (!isUndefOrInRange(Mask[i], 8, 12))
4016         return false;
4017   }
4018
4019   return true;
4020 }
4021
/// \brief Return true if the mask specifies a shuffle of elements that is
/// suitable for input to intralane (palignr) or interlane (valign) vector
/// right-shift.
///
/// \param Mask      Shuffle mask; negative entries are treated as undef.
/// \param VT        Vector type the mask applies to.
/// \param InterLane When true the whole vector is treated as a single lane;
///                  otherwise the vector is processed in 128-bit lanes.
static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  // l is the index of the first element of the lane being checked.
  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first defined (non-undef) element of this lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    // Rebase Start so that it is the index expected at lane position 0; the
    // element at lane position i is then expected to be Start+i.
    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      // Same second-source correction as applied to Start above.
      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}
4090
4091 /// \brief Return true if the node specifies a shuffle of elements that is
4092 /// suitable for input to PALIGNR.
4093 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4094                           const X86Subtarget *Subtarget) {
4095   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4096       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4097       VT.is512BitVector())
4098     // FIXME: Add AVX512BW.
4099     return false;
4100
4101   return isAlignrMask(Mask, VT, false);
4102 }
4103
4104 /// \brief Return true if the node specifies a shuffle of elements that is
4105 /// suitable for input to VALIGN.
4106 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4107                           const X86Subtarget *Subtarget) {
4108   // FIXME: Add AVX512VL.
4109   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4110     return false;
4111   return isAlignrMask(Mask, VT, true);
4112 }
4113
4114 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4115 /// the two vector operands have swapped position.
4116 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4117                                      unsigned NumElems) {
4118   for (unsigned i = 0; i != NumElems; ++i) {
4119     int idx = Mask[i];
4120     if (idx < 0)
4121       continue;
4122     else if (idx < (int)NumElems)
4123       Mask[i] = idx + NumElems;
4124     else
4125       Mask[i] = idx - NumElems;
4126   }
4127 }
4128
4129 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4130 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4131 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4132 /// reverse of what x86 shuffles want.
4133 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4134
4135   unsigned NumElems = VT.getVectorNumElements();
4136   unsigned NumLanes = VT.getSizeInBits()/128;
4137   unsigned NumLaneElems = NumElems/NumLanes;
4138
4139   if (NumLaneElems != 2 && NumLaneElems != 4)
4140     return false;
4141
4142   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4143   bool symetricMaskRequired =
4144     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4145
4146   // VSHUFPSY divides the resulting vector into 4 chunks.
4147   // The sources are also splitted into 4 chunks, and each destination
4148   // chunk must come from a different source chunk.
4149   //
4150   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4151   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4152   //
4153   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4154   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4155   //
4156   // VSHUFPDY divides the resulting vector into 4 chunks.
4157   // The sources are also splitted into 4 chunks, and each destination
4158   // chunk must come from a different source chunk.
4159   //
4160   //  SRC1 =>      X3       X2       X1       X0
4161   //  SRC2 =>      Y3       Y2       Y1       Y0
4162   //
4163   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4164   //
4165   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4166   unsigned HalfLaneElems = NumLaneElems/2;
4167   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4168     for (unsigned i = 0; i != NumLaneElems; ++i) {
4169       int Idx = Mask[i+l];
4170       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4171       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4172         return false;
4173       // For VSHUFPSY, the mask of the second half must be the same as the
4174       // first but with the appropriate offsets. This works in the same way as
4175       // VPERMILPS works with masks.
4176       if (!symetricMaskRequired || Idx < 0)
4177         continue;
4178       if (MaskVal[i] < 0) {
4179         MaskVal[i] = Idx - l;
4180         continue;
4181       }
4182       if ((signed)(Idx - l) != MaskVal[i])
4183         return false;
4184     }
4185   }
4186
4187   return true;
4188 }
4189
4190 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4191 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4192 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4193   if (!VT.is128BitVector())
4194     return false;
4195
4196   unsigned NumElems = VT.getVectorNumElements();
4197
4198   if (NumElems != 4)
4199     return false;
4200
4201   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4202   return isUndefOrEqual(Mask[0], 6) &&
4203          isUndefOrEqual(Mask[1], 7) &&
4204          isUndefOrEqual(Mask[2], 2) &&
4205          isUndefOrEqual(Mask[3], 3);
4206 }
4207
4208 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4209 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4210 /// <2, 3, 2, 3>
4211 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4212   if (!VT.is128BitVector())
4213     return false;
4214
4215   unsigned NumElems = VT.getVectorNumElements();
4216
4217   if (NumElems != 4)
4218     return false;
4219
4220   return isUndefOrEqual(Mask[0], 2) &&
4221          isUndefOrEqual(Mask[1], 3) &&
4222          isUndefOrEqual(Mask[2], 2) &&
4223          isUndefOrEqual(Mask[3], 3);
4224 }
4225
4226 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4227 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4228 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4229   if (!VT.is128BitVector())
4230     return false;
4231
4232   unsigned NumElems = VT.getVectorNumElements();
4233
4234   if (NumElems != 2 && NumElems != 4)
4235     return false;
4236
4237   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4238     if (!isUndefOrEqual(Mask[i], i + NumElems))
4239       return false;
4240
4241   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4242     if (!isUndefOrEqual(Mask[i], i))
4243       return false;
4244
4245   return true;
4246 }
4247
4248 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4249 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4250 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4251   if (!VT.is128BitVector())
4252     return false;
4253
4254   unsigned NumElems = VT.getVectorNumElements();
4255
4256   if (NumElems != 2 && NumElems != 4)
4257     return false;
4258
4259   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4260     if (!isUndefOrEqual(Mask[i], i))
4261       return false;
4262
4263   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4264     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4265       return false;
4266
4267   return true;
4268 }
4269
4270 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4271 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4272 /// i. e: If all but one element come from the same vector.
4273 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4274   // TODO: Deal with AVX's VINSERTPS
4275   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4276     return false;
4277
4278   unsigned CorrectPosV1 = 0;
4279   unsigned CorrectPosV2 = 0;
4280   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4281     if (Mask[i] == -1) {
4282       ++CorrectPosV1;
4283       ++CorrectPosV2;
4284       continue;
4285     }
4286
4287     if (Mask[i] == i)
4288       ++CorrectPosV1;
4289     else if (Mask[i] == i + 4)
4290       ++CorrectPosV2;
4291   }
4292
4293   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4294     // We have 3 elements (undefs count as elements from any vector) from one
4295     // vector, and one from another.
4296     return true;
4297
4298   return false;
4299 }
4300
4301 //
4302 // Some special combinations that can be optimized.
4303 //
4304 static
4305 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4306                                SelectionDAG &DAG) {
4307   MVT VT = SVOp->getSimpleValueType(0);
4308   SDLoc dl(SVOp);
4309
4310   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4311     return SDValue();
4312
4313   ArrayRef<int> Mask = SVOp->getMask();
4314
4315   // These are the special masks that may be optimized.
4316   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4317   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4318   bool MatchEvenMask = true;
4319   bool MatchOddMask  = true;
4320   for (int i=0; i<8; ++i) {
4321     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4322       MatchEvenMask = false;
4323     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4324       MatchOddMask = false;
4325   }
4326
4327   if (!MatchEvenMask && !MatchOddMask)
4328     return SDValue();
4329
4330   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4331
4332   SDValue Op0 = SVOp->getOperand(0);
4333   SDValue Op1 = SVOp->getOperand(1);
4334
4335   if (MatchEvenMask) {
4336     // Shift the second operand right to 32 bits.
4337     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4338     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4339   } else {
4340     // Shift the first operand left to 32 bits.
4341     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4342     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4343   }
4344   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4345   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4346 }
4347
4348 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4349 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4350 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4351                          bool HasInt256, bool V2IsSplat = false) {
4352
4353   assert(VT.getSizeInBits() >= 128 &&
4354          "Unsupported vector type for unpckl");
4355
4356   unsigned NumElts = VT.getVectorNumElements();
4357   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4358       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4359     return false;
4360
4361   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4362          "Unsupported vector type for unpckh");
4363
4364   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4365   unsigned NumLanes = VT.getSizeInBits()/128;
4366   unsigned NumLaneElts = NumElts/NumLanes;
4367
4368   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4369     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4370       int BitI  = Mask[l+i];
4371       int BitI1 = Mask[l+i+1];
4372       if (!isUndefOrEqual(BitI, j))
4373         return false;
4374       if (V2IsSplat) {
4375         if (!isUndefOrEqual(BitI1, NumElts))
4376           return false;
4377       } else {
4378         if (!isUndefOrEqual(BitI1, j + NumElts))
4379           return false;
4380       }
4381     }
4382   }
4383
4384   return true;
4385 }
4386
4387 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4388 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4389 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4390                          bool HasInt256, bool V2IsSplat = false) {
4391   assert(VT.getSizeInBits() >= 128 &&
4392          "Unsupported vector type for unpckh");
4393
4394   unsigned NumElts = VT.getVectorNumElements();
4395   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4396       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4397     return false;
4398
4399   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4400          "Unsupported vector type for unpckh");
4401
4402   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4403   unsigned NumLanes = VT.getSizeInBits()/128;
4404   unsigned NumLaneElts = NumElts/NumLanes;
4405
4406   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4407     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4408       int BitI  = Mask[l+i];
4409       int BitI1 = Mask[l+i+1];
4410       if (!isUndefOrEqual(BitI, j))
4411         return false;
4412       if (V2IsSplat) {
4413         if (isUndefOrEqual(BitI1, NumElts))
4414           return false;
4415       } else {
4416         if (!isUndefOrEqual(BitI1, j+NumElts))
4417           return false;
4418       }
4419     }
4420   }
4421   return true;
4422 }
4423
4424 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4425 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4426 /// <0, 0, 1, 1>
4427 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4428   unsigned NumElts = VT.getVectorNumElements();
4429   bool Is256BitVec = VT.is256BitVector();
4430
4431   if (VT.is512BitVector())
4432     return false;
4433   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4434          "Unsupported vector type for unpckh");
4435
4436   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4437       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4438     return false;
4439
4440   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4441   // FIXME: Need a better way to get rid of this, there's no latency difference
4442   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4443   // the former later. We should also remove the "_undef" special mask.
4444   if (NumElts == 4 && Is256BitVec)
4445     return false;
4446
4447   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4448   // independently on 128-bit lanes.
4449   unsigned NumLanes = VT.getSizeInBits()/128;
4450   unsigned NumLaneElts = NumElts/NumLanes;
4451
4452   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4453     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4454       int BitI  = Mask[l+i];
4455       int BitI1 = Mask[l+i+1];
4456
4457       if (!isUndefOrEqual(BitI, j))
4458         return false;
4459       if (!isUndefOrEqual(BitI1, j))
4460         return false;
4461     }
4462   }
4463
4464   return true;
4465 }
4466
4467 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4468 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4469 /// <2, 2, 3, 3>
4470 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4471   unsigned NumElts = VT.getVectorNumElements();
4472
4473   if (VT.is512BitVector())
4474     return false;
4475
4476   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4477          "Unsupported vector type for unpckh");
4478
4479   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4480       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4481     return false;
4482
4483   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4484   // independently on 128-bit lanes.
4485   unsigned NumLanes = VT.getSizeInBits()/128;
4486   unsigned NumLaneElts = NumElts/NumLanes;
4487
4488   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4489     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4490       int BitI  = Mask[l+i];
4491       int BitI1 = Mask[l+i+1];
4492       if (!isUndefOrEqual(BitI, j))
4493         return false;
4494       if (!isUndefOrEqual(BitI1, j))
4495         return false;
4496     }
4497   }
4498   return true;
4499 }
4500
4501 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4502 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4503 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4504   if (!VT.is512BitVector())
4505     return false;
4506
4507   unsigned NumElts = VT.getVectorNumElements();
4508   unsigned HalfSize = NumElts/2;
4509   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4510     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4511       *Imm = 1;
4512       return true;
4513     }
4514   }
4515   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4516     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4517       *Imm = 0;
4518       return true;
4519     }
4520   }
4521   return false;
4522 }
4523
4524 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4525 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4526 /// MOVSD, and MOVD, i.e. setting the lowest element.
4527 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4528   if (VT.getVectorElementType().getSizeInBits() < 32)
4529     return false;
4530   if (!VT.is128BitVector())
4531     return false;
4532
4533   unsigned NumElts = VT.getVectorNumElements();
4534
4535   if (!isUndefOrEqual(Mask[0], NumElts))
4536     return false;
4537
4538   for (unsigned i = 1; i != NumElts; ++i)
4539     if (!isUndefOrEqual(Mask[i], i))
4540       return false;
4541
4542   return true;
4543 }
4544
4545 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4546 /// as permutations between 128-bit chunks or halves. As an example: this
4547 /// shuffle bellow:
4548 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4549 /// The first half comes from the second half of V1 and the second half from the
4550 /// the second half of V2.
4551 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4552   if (!HasFp256 || !VT.is256BitVector())
4553     return false;
4554
4555   // The shuffle result is divided into half A and half B. In total the two
4556   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4557   // B must come from C, D, E or F.
4558   unsigned HalfSize = VT.getVectorNumElements()/2;
4559   bool MatchA = false, MatchB = false;
4560
4561   // Check if A comes from one of C, D, E, F.
4562   for (unsigned Half = 0; Half != 4; ++Half) {
4563     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4564       MatchA = true;
4565       break;
4566     }
4567   }
4568
4569   // Check if B comes from one of C, D, E, F.
4570   for (unsigned Half = 0; Half != 4; ++Half) {
4571     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4572       MatchB = true;
4573       break;
4574     }
4575   }
4576
4577   return MatchA && MatchB;
4578 }
4579
4580 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4581 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4582 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4583   MVT VT = SVOp->getSimpleValueType(0);
4584
4585   unsigned HalfSize = VT.getVectorNumElements()/2;
4586
4587   unsigned FstHalf = 0, SndHalf = 0;
4588   for (unsigned i = 0; i < HalfSize; ++i) {
4589     if (SVOp->getMaskElt(i) > 0) {
4590       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4591       break;
4592     }
4593   }
4594   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4595     if (SVOp->getMaskElt(i) > 0) {
4596       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4597       break;
4598     }
4599   }
4600
4601   return (FstHalf | (SndHalf << 4));
4602 }
4603
4604 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4605 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4606   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4607   if (EltSize < 32)
4608     return false;
4609
4610   unsigned NumElts = VT.getVectorNumElements();
4611   Imm8 = 0;
4612   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4613     for (unsigned i = 0; i != NumElts; ++i) {
4614       if (Mask[i] < 0)
4615         continue;
4616       Imm8 |= Mask[i] << (i*2);
4617     }
4618     return true;
4619   }
4620
4621   unsigned LaneSize = 4;
4622   SmallVector<int, 4> MaskVal(LaneSize, -1);
4623
4624   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4625     for (unsigned i = 0; i != LaneSize; ++i) {
4626       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4627         return false;
4628       if (Mask[i+l] < 0)
4629         continue;
4630       if (MaskVal[i] < 0) {
4631         MaskVal[i] = Mask[i+l] - l;
4632         Imm8 |= MaskVal[i] << (i*2);
4633         continue;
4634       }
4635       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4636         return false;
4637     }
4638   }
4639   return true;
4640 }
4641
4642 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4643 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4644 /// Note that VPERMIL mask matching is different depending whether theunderlying
4645 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4646 /// to the same elements of the low, but to the higher half of the source.
4647 /// In VPERMILPD the two lanes could be shuffled independently of each other
4648 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4649 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4650   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4651   if (VT.getSizeInBits() < 256 || EltSize < 32)
4652     return false;
4653   bool symetricMaskRequired = (EltSize == 32);
4654   unsigned NumElts = VT.getVectorNumElements();
4655
4656   unsigned NumLanes = VT.getSizeInBits()/128;
4657   unsigned LaneSize = NumElts/NumLanes;
4658   // 2 or 4 elements in one lane
4659
4660   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4661   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4662     for (unsigned i = 0; i != LaneSize; ++i) {
4663       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4664         return false;
4665       if (symetricMaskRequired) {
4666         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4667           ExpectedMaskVal[i] = Mask[i+l] - l;
4668           continue;
4669         }
4670         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4671           return false;
4672       }
4673     }
4674   }
4675   return true;
4676 }
4677
4678 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4679 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4680 /// element of vector 2 and the other elements to come from vector 1 in order.
4681 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4682                                bool V2IsSplat = false, bool V2IsUndef = false) {
4683   if (!VT.is128BitVector())
4684     return false;
4685
4686   unsigned NumOps = VT.getVectorNumElements();
4687   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4688     return false;
4689
4690   if (!isUndefOrEqual(Mask[0], 0))
4691     return false;
4692
4693   for (unsigned i = 1; i != NumOps; ++i)
4694     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4695           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4696           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4697       return false;
4698
4699   return true;
4700 }
4701
4702 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4703 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4704 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4705 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4706                            const X86Subtarget *Subtarget) {
4707   if (!Subtarget->hasSSE3())
4708     return false;
4709
4710   unsigned NumElems = VT.getVectorNumElements();
4711
4712   if ((VT.is128BitVector() && NumElems != 4) ||
4713       (VT.is256BitVector() && NumElems != 8) ||
4714       (VT.is512BitVector() && NumElems != 16))
4715     return false;
4716
4717   // "i+1" is the value the indexed mask element must have
4718   for (unsigned i = 0; i != NumElems; i += 2)
4719     if (!isUndefOrEqual(Mask[i], i+1) ||
4720         !isUndefOrEqual(Mask[i+1], i+1))
4721       return false;
4722
4723   return true;
4724 }
4725
4726 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4727 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4728 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4729 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4730                            const X86Subtarget *Subtarget) {
4731   if (!Subtarget->hasSSE3())
4732     return false;
4733
4734   unsigned NumElems = VT.getVectorNumElements();
4735
4736   if ((VT.is128BitVector() && NumElems != 4) ||
4737       (VT.is256BitVector() && NumElems != 8) ||
4738       (VT.is512BitVector() && NumElems != 16))
4739     return false;
4740
4741   // "i" is the value the indexed mask element must have
4742   for (unsigned i = 0; i != NumElems; i += 2)
4743     if (!isUndefOrEqual(Mask[i], i) ||
4744         !isUndefOrEqual(Mask[i+1], i))
4745       return false;
4746
4747   return true;
4748 }
4749
4750 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4751 /// specifies a shuffle of elements that is suitable for input to 256-bit
4752 /// version of MOVDDUP.
4753 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4754   if (!HasFp256 || !VT.is256BitVector())
4755     return false;
4756
4757   unsigned NumElts = VT.getVectorNumElements();
4758   if (NumElts != 4)
4759     return false;
4760
4761   for (unsigned i = 0; i != NumElts/2; ++i)
4762     if (!isUndefOrEqual(Mask[i], 0))
4763       return false;
4764   for (unsigned i = NumElts/2; i != NumElts; ++i)
4765     if (!isUndefOrEqual(Mask[i], NumElts/2))
4766       return false;
4767   return true;
4768 }
4769
4770 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4771 /// specifies a shuffle of elements that is suitable for input to 128-bit
4772 /// version of MOVDDUP.
4773 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4774   if (!VT.is128BitVector())
4775     return false;
4776
4777   unsigned e = VT.getVectorNumElements() / 2;
4778   for (unsigned i = 0; i != e; ++i)
4779     if (!isUndefOrEqual(Mask[i], i))
4780       return false;
4781   for (unsigned i = 0; i != e; ++i)
4782     if (!isUndefOrEqual(Mask[e+i], i))
4783       return false;
4784   return true;
4785 }
4786
4787 /// isVEXTRACTIndex - Return true if the specified
4788 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4789 /// suitable for instruction that extract 128 or 256 bit vectors
4790 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4791   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4792   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4793     return false;
4794
4795   // The index should be aligned on a vecWidth-bit boundary.
4796   uint64_t Index =
4797     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4798
4799   MVT VT = N->getSimpleValueType(0);
4800   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4801   bool Result = (Index * ElSize) % vecWidth == 0;
4802
4803   return Result;
4804 }
4805
4806 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4807 /// operand specifies a subvector insert that is suitable for input to
4808 /// insertion of 128 or 256-bit subvectors
4809 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4810   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4811   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4812     return false;
4813   // The index should be aligned on a vecWidth-bit boundary.
4814   uint64_t Index =
4815     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4816
4817   MVT VT = N->getSimpleValueType(0);
4818   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4819   bool Result = (Index * ElSize) % vecWidth == 0;
4820
4821   return Result;
4822 }
4823
4824 bool X86::isVINSERT128Index(SDNode *N) {
4825   return isVINSERTIndex(N, 128);
4826 }
4827
4828 bool X86::isVINSERT256Index(SDNode *N) {
4829   return isVINSERTIndex(N, 256);
4830 }
4831
4832 bool X86::isVEXTRACT128Index(SDNode *N) {
4833   return isVEXTRACTIndex(N, 128);
4834 }
4835
4836 bool X86::isVEXTRACT256Index(SDNode *N) {
4837   return isVEXTRACTIndex(N, 256);
4838 }
4839
4840 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4841 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4842 /// Handles 128-bit and 256-bit.
4843 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4844   MVT VT = N->getSimpleValueType(0);
4845
4846   assert((VT.getSizeInBits() >= 128) &&
4847          "Unsupported vector type for PSHUF/SHUFP");
4848
4849   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4850   // independently on 128-bit lanes.
4851   unsigned NumElts = VT.getVectorNumElements();
4852   unsigned NumLanes = VT.getSizeInBits()/128;
4853   unsigned NumLaneElts = NumElts/NumLanes;
4854
4855   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4856          "Only supports 2, 4 or 8 elements per lane");
4857
4858   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4859   unsigned Mask = 0;
4860   for (unsigned i = 0; i != NumElts; ++i) {
4861     int Elt = N->getMaskElt(i);
4862     if (Elt < 0) continue;
4863     Elt &= NumLaneElts - 1;
4864     unsigned ShAmt = (i << Shift) % 8;
4865     Mask |= Elt << ShAmt;
4866   }
4867
4868   return Mask;
4869 }
4870
4871 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4872 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4873 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4874   MVT VT = N->getSimpleValueType(0);
4875
4876   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4877          "Unsupported vector type for PSHUFHW");
4878
4879   unsigned NumElts = VT.getVectorNumElements();
4880
4881   unsigned Mask = 0;
4882   for (unsigned l = 0; l != NumElts; l += 8) {
4883     // 8 nodes per lane, but we only care about the last 4.
4884     for (unsigned i = 0; i < 4; ++i) {
4885       int Elt = N->getMaskElt(l+i+4);
4886       if (Elt < 0) continue;
4887       Elt &= 0x3; // only 2-bits.
4888       Mask |= Elt << (i * 2);
4889     }
4890   }
4891
4892   return Mask;
4893 }
4894
4895 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4896 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4897 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4898   MVT VT = N->getSimpleValueType(0);
4899
4900   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4901          "Unsupported vector type for PSHUFHW");
4902
4903   unsigned NumElts = VT.getVectorNumElements();
4904
4905   unsigned Mask = 0;
4906   for (unsigned l = 0; l != NumElts; l += 8) {
4907     // 8 nodes per lane, but we only care about the first 4.
4908     for (unsigned i = 0; i < 4; ++i) {
4909       int Elt = N->getMaskElt(l+i);
4910       if (Elt < 0) continue;
4911       Elt &= 0x3; // only 2-bits
4912       Mask |= Elt << (i * 2);
4913     }
4914   }
4915
4916   return Mask;
4917 }
4918
4919 /// \brief Return the appropriate immediate to shuffle the specified
4920 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4921 /// VALIGN (if Interlane is true) instructions.
4922 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4923                                            bool InterLane) {
4924   MVT VT = SVOp->getSimpleValueType(0);
4925   unsigned EltSize = InterLane ? 1 :
4926     VT.getVectorElementType().getSizeInBits() >> 3;
4927
4928   unsigned NumElts = VT.getVectorNumElements();
4929   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4930   unsigned NumLaneElts = NumElts/NumLanes;
4931
4932   int Val = 0;
4933   unsigned i;
4934   for (i = 0; i != NumElts; ++i) {
4935     Val = SVOp->getMaskElt(i);
4936     if (Val >= 0)
4937       break;
4938   }
4939   if (Val >= (int)NumElts)
4940     Val -= NumElts - NumLaneElts;
4941
4942   assert(Val - i > 0 && "PALIGNR imm should be positive");
4943   return (Val - i) * EltSize;
4944 }
4945
4946 /// \brief Return the appropriate immediate to shuffle the specified
4947 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4948 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4949   return getShuffleAlignrImmediate(SVOp, false);
4950 }
4951
4952 /// \brief Return the appropriate immediate to shuffle the specified
4953 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4954 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4955   return getShuffleAlignrImmediate(SVOp, true);
4956 }
4957
4958
4959 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4960   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4961   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4962     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4963
4964   uint64_t Index =
4965     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4966
4967   MVT VecVT = N->getOperand(0).getSimpleValueType();
4968   MVT ElVT = VecVT.getVectorElementType();
4969
4970   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4971   return Index / NumElemsPerChunk;
4972 }
4973
4974 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4975   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4976   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4977     llvm_unreachable("Illegal insert subvector for VINSERT");
4978
4979   uint64_t Index =
4980     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4981
4982   MVT VecVT = N->getSimpleValueType(0);
4983   MVT ElVT = VecVT.getVectorElementType();
4984
4985   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4986   return Index / NumElemsPerChunk;
4987 }
4988
4989 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4990 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4991 /// and VINSERTI128 instructions.
4992 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4993   return getExtractVEXTRACTImmediate(N, 128);
4994 }
4995
4996 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4997 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4998 /// and VINSERTI64x4 instructions.
4999 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5000   return getExtractVEXTRACTImmediate(N, 256);
5001 }
5002
5003 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5004 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5005 /// and VINSERTI128 instructions.
5006 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5007   return getInsertVINSERTImmediate(N, 128);
5008 }
5009
5010 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5011 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5012 /// and VINSERTI64x4 instructions.
5013 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5014   return getInsertVINSERTImmediate(N, 256);
5015 }
5016
5017 /// isZero - Returns true if Elt is a constant integer zero
5018 static bool isZero(SDValue V) {
5019   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5020   return C && C->isNullValue();
5021 }
5022
5023 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5024 /// constant +0.0.
5025 bool X86::isZeroNode(SDValue Elt) {
5026   if (isZero(Elt))
5027     return true;
5028   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5029     return CFP->getValueAPF().isPosZero();
5030   return false;
5031 }
5032
5033 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5034 /// match movhlps. The lower half elements should come from upper half of
5035 /// V1 (and in order), and the upper half elements should come from the upper
5036 /// half of V2 (and in order).
5037 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5038   if (!VT.is128BitVector())
5039     return false;
5040   if (VT.getVectorNumElements() != 4)
5041     return false;
5042   for (unsigned i = 0, e = 2; i != e; ++i)
5043     if (!isUndefOrEqual(Mask[i], i+2))
5044       return false;
5045   for (unsigned i = 2; i != 4; ++i)
5046     if (!isUndefOrEqual(Mask[i], i+4))
5047       return false;
5048   return true;
5049 }
5050
5051 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5052 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5053 /// required.
5054 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5055   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5056     return false;
5057   N = N->getOperand(0).getNode();
5058   if (!ISD::isNON_EXTLoad(N))
5059     return false;
5060   if (LD)
5061     *LD = cast<LoadSDNode>(N);
5062   return true;
5063 }
5064
5065 // Test whether the given value is a vector value which will be legalized
5066 // into a load.
5067 static bool WillBeConstantPoolLoad(SDNode *N) {
5068   if (N->getOpcode() != ISD::BUILD_VECTOR)
5069     return false;
5070
5071   // Check for any non-constant elements.
5072   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5073     switch (N->getOperand(i).getNode()->getOpcode()) {
5074     case ISD::UNDEF:
5075     case ISD::ConstantFP:
5076     case ISD::Constant:
5077       break;
5078     default:
5079       return false;
5080     }
5081
5082   // Vectors of all-zeros and all-ones are materialized with special
5083   // instructions rather than being loaded.
5084   return !ISD::isBuildVectorAllZeros(N) &&
5085          !ISD::isBuildVectorAllOnes(N);
5086 }
5087
5088 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5089 /// match movlp{s|d}. The lower half elements should come from lower half of
5090 /// V1 (and in order), and the upper half elements should come from the upper
5091 /// half of V2 (and in order). And since V1 will become the source of the
5092 /// MOVLP, it must be either a vector load or a scalar load to vector.
5093 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5094                                ArrayRef<int> Mask, MVT VT) {
5095   if (!VT.is128BitVector())
5096     return false;
5097
5098   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5099     return false;
5100   // Is V2 is a vector load, don't do this transformation. We will try to use
5101   // load folding shufps op.
5102   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5103     return false;
5104
5105   unsigned NumElems = VT.getVectorNumElements();
5106
5107   if (NumElems != 2 && NumElems != 4)
5108     return false;
5109   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5110     if (!isUndefOrEqual(Mask[i], i))
5111       return false;
5112   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5113     if (!isUndefOrEqual(Mask[i], i+NumElems))
5114       return false;
5115   return true;
5116 }
5117
5118 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5119 /// to an zero vector.
5120 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5121 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5122   SDValue V1 = N->getOperand(0);
5123   SDValue V2 = N->getOperand(1);
5124   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5125   for (unsigned i = 0; i != NumElems; ++i) {
5126     int Idx = N->getMaskElt(i);
5127     if (Idx >= (int)NumElems) {
5128       unsigned Opc = V2.getOpcode();
5129       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5130         continue;
5131       if (Opc != ISD::BUILD_VECTOR ||
5132           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5133         return false;
5134     } else if (Idx >= 0) {
5135       unsigned Opc = V1.getOpcode();
5136       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5137         continue;
5138       if (Opc != ISD::BUILD_VECTOR ||
5139           !X86::isZeroNode(V1.getOperand(Idx)))
5140         return false;
5141     }
5142   }
5143   return true;
5144 }
5145
5146 /// getZeroVector - Returns a vector of specified type with all zero elements.
5147 ///
5148 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5149                              SelectionDAG &DAG, SDLoc dl) {
5150   assert(VT.isVector() && "Expected a vector type");
5151
5152   // Always build SSE zero vectors as <4 x i32> bitcasted
5153   // to their dest type. This ensures they get CSE'd.
5154   SDValue Vec;
5155   if (VT.is128BitVector()) {  // SSE
5156     if (Subtarget->hasSSE2()) {  // SSE2
5157       SDValue Cst = DAG.getConstant(0, MVT::i32);
5158       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5159     } else { // SSE1
5160       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5161       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5162     }
5163   } else if (VT.is256BitVector()) { // AVX
5164     if (Subtarget->hasInt256()) { // AVX2
5165       SDValue Cst = DAG.getConstant(0, MVT::i32);
5166       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5167       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5168     } else {
5169       // 256-bit logic and arithmetic instructions in AVX are all
5170       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5171       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5172       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5173       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5174     }
5175   } else if (VT.is512BitVector()) { // AVX-512
5176       SDValue Cst = DAG.getConstant(0, MVT::i32);
5177       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5178                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5179       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5180   } else if (VT.getScalarType() == MVT::i1) {
5181     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5182     SDValue Cst = DAG.getConstant(0, MVT::i1);
5183     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5184     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5185   } else
5186     llvm_unreachable("Unexpected vector type");
5187
5188   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5189 }
5190
5191 /// getOnesVector - Returns a vector of specified type with all bits set.
5192 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5193 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5194 /// Then bitcast to their original type, ensuring they get CSE'd.
5195 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5196                              SDLoc dl) {
5197   assert(VT.isVector() && "Expected a vector type");
5198
5199   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5200   SDValue Vec;
5201   if (VT.is256BitVector()) {
5202     if (HasInt256) { // AVX2
5203       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5204       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5205     } else { // AVX
5206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5207       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5208     }
5209   } else if (VT.is128BitVector()) {
5210     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5211   } else
5212     llvm_unreachable("Unexpected vector type");
5213
5214   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5215 }
5216
5217 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5218 /// that point to V2 points to its first element.
5219 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5220   for (unsigned i = 0; i != NumElems; ++i) {
5221     if (Mask[i] > (int)NumElems) {
5222       Mask[i] = NumElems;
5223     }
5224   }
5225 }
5226
5227 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5228 /// operation of specified width.
5229 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5230                        SDValue V2) {
5231   unsigned NumElems = VT.getVectorNumElements();
5232   SmallVector<int, 8> Mask;
5233   Mask.push_back(NumElems);
5234   for (unsigned i = 1; i != NumElems; ++i)
5235     Mask.push_back(i);
5236   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5237 }
5238
5239 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5240 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5241                           SDValue V2) {
5242   unsigned NumElems = VT.getVectorNumElements();
5243   SmallVector<int, 8> Mask;
5244   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5245     Mask.push_back(i);
5246     Mask.push_back(i + NumElems);
5247   }
5248   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5249 }
5250
5251 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5252 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5253                           SDValue V2) {
5254   unsigned NumElems = VT.getVectorNumElements();
5255   SmallVector<int, 8> Mask;
5256   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5257     Mask.push_back(i + Half);
5258     Mask.push_back(i + NumElems + Half);
5259   }
5260   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5261 }
5262
5263 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5264 // a generic shuffle instruction because the target has no such instructions.
5265 // Generate shuffles which repeat i16 and i8 several times until they can be
5266 // represented by v4f32 and then be manipulated by target suported shuffles.
5267 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5268   MVT VT = V.getSimpleValueType();
5269   int NumElems = VT.getVectorNumElements();
5270   SDLoc dl(V);
5271
5272   while (NumElems > 4) {
5273     if (EltNo < NumElems/2) {
5274       V = getUnpackl(DAG, dl, VT, V, V);
5275     } else {
5276       V = getUnpackh(DAG, dl, VT, V, V);
5277       EltNo -= NumElems/2;
5278     }
5279     NumElems >>= 1;
5280   }
5281   return V;
5282 }
5283
5284 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5285 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5286   MVT VT = V.getSimpleValueType();
5287   SDLoc dl(V);
5288
5289   if (VT.is128BitVector()) {
5290     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5291     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5292     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5293                              &SplatMask[0]);
5294   } else if (VT.is256BitVector()) {
5295     // To use VPERMILPS to splat scalars, the second half of indicies must
5296     // refer to the higher part, which is a duplication of the lower one,
5297     // because VPERMILPS can only handle in-lane permutations.
5298     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5299                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5300
5301     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5302     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5303                              &SplatMask[0]);
5304   } else
5305     llvm_unreachable("Vector size not supported");
5306
5307   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5308 }
5309
5310 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5311 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5312   MVT SrcVT = SV->getSimpleValueType(0);
5313   SDValue V1 = SV->getOperand(0);
5314   SDLoc dl(SV);
5315
5316   int EltNo = SV->getSplatIndex();
5317   int NumElems = SrcVT.getVectorNumElements();
5318   bool Is256BitVec = SrcVT.is256BitVector();
5319
5320   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5321          "Unknown how to promote splat for type");
5322
5323   // Extract the 128-bit part containing the splat element and update
5324   // the splat element index when it refers to the higher register.
5325   if (Is256BitVec) {
5326     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5327     if (EltNo >= NumElems/2)
5328       EltNo -= NumElems/2;
5329   }
5330
5331   // All i16 and i8 vector types can't be used directly by a generic shuffle
5332   // instruction because the target has no such instruction. Generate shuffles
5333   // which repeat i16 and i8 several times until they fit in i32, and then can
5334   // be manipulated by target suported shuffles.
5335   MVT EltVT = SrcVT.getVectorElementType();
5336   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5337     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5338
5339   // Recreate the 256-bit vector and place the same 128-bit vector
5340   // into the low and high part. This is necessary because we want
5341   // to use VPERM* to shuffle the vectors
5342   if (Is256BitVec) {
5343     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5344   }
5345
5346   return getLegalSplat(DAG, V1, EltNo);
5347 }
5348
5349 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5350 /// vector of zero or undef vector.  This produces a shuffle where the low
5351 /// element of V2 is swizzled into the zero/undef vector, landing at element
5352 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5353 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5354                                            bool IsZero,
5355                                            const X86Subtarget *Subtarget,
5356                                            SelectionDAG &DAG) {
5357   MVT VT = V2.getSimpleValueType();
5358   SDValue V1 = IsZero
5359     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5360   unsigned NumElems = VT.getVectorNumElements();
5361   SmallVector<int, 16> MaskVec;
5362   for (unsigned i = 0; i != NumElems; ++i)
5363     // If this is the insertion idx, put the low elt of V2 here.
5364     MaskVec.push_back(i == Idx ? NumElems : i);
5365   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5366 }
5367
5368 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5369 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5370 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5371 /// shuffles which use a single input multiple times, and in those cases it will
5372 /// adjust the mask to only have indices within that single input.
5373 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5374                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5375   unsigned NumElems = VT.getVectorNumElements();
5376   SDValue ImmN;
5377
5378   IsUnary = false;
5379   bool IsFakeUnary = false;
5380   switch(N->getOpcode()) {
5381   case X86ISD::BLENDI:
5382     ImmN = N->getOperand(N->getNumOperands()-1);
5383     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5384     break;
5385   case X86ISD::SHUFP:
5386     ImmN = N->getOperand(N->getNumOperands()-1);
5387     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5388     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5389     break;
5390   case X86ISD::UNPCKH:
5391     DecodeUNPCKHMask(VT, Mask);
5392     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5393     break;
5394   case X86ISD::UNPCKL:
5395     DecodeUNPCKLMask(VT, Mask);
5396     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5397     break;
5398   case X86ISD::MOVHLPS:
5399     DecodeMOVHLPSMask(NumElems, Mask);
5400     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5401     break;
5402   case X86ISD::MOVLHPS:
5403     DecodeMOVLHPSMask(NumElems, Mask);
5404     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5405     break;
5406   case X86ISD::PALIGNR:
5407     ImmN = N->getOperand(N->getNumOperands()-1);
5408     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5409     break;
5410   case X86ISD::PSHUFD:
5411   case X86ISD::VPERMILPI:
5412     ImmN = N->getOperand(N->getNumOperands()-1);
5413     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5414     IsUnary = true;
5415     break;
5416   case X86ISD::PSHUFHW:
5417     ImmN = N->getOperand(N->getNumOperands()-1);
5418     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5419     IsUnary = true;
5420     break;
5421   case X86ISD::PSHUFLW:
5422     ImmN = N->getOperand(N->getNumOperands()-1);
5423     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5424     IsUnary = true;
5425     break;
5426   case X86ISD::PSHUFB: {
5427     IsUnary = true;
5428     SDValue MaskNode = N->getOperand(1);
5429     while (MaskNode->getOpcode() == ISD::BITCAST)
5430       MaskNode = MaskNode->getOperand(0);
5431
5432     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5433       // If we have a build-vector, then things are easy.
5434       EVT VT = MaskNode.getValueType();
5435       assert(VT.isVector() &&
5436              "Can't produce a non-vector with a build_vector!");
5437       if (!VT.isInteger())
5438         return false;
5439
5440       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5441
5442       SmallVector<uint64_t, 32> RawMask;
5443       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5444         SDValue Op = MaskNode->getOperand(i);
5445         if (Op->getOpcode() == ISD::UNDEF) {
5446           RawMask.push_back((uint64_t)SM_SentinelUndef);
5447           continue;
5448         }
5449         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5450         if (!CN)
5451           return false;
5452         APInt MaskElement = CN->getAPIntValue();
5453
5454         // We now have to decode the element which could be any integer size and
5455         // extract each byte of it.
5456         for (int j = 0; j < NumBytesPerElement; ++j) {
5457           // Note that this is x86 and so always little endian: the low byte is
5458           // the first byte of the mask.
5459           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5460           MaskElement = MaskElement.lshr(8);
5461         }
5462       }
5463       DecodePSHUFBMask(RawMask, Mask);
5464       break;
5465     }
5466
5467     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5468     if (!MaskLoad)
5469       return false;
5470
5471     SDValue Ptr = MaskLoad->getBasePtr();
5472     if (Ptr->getOpcode() == X86ISD::Wrapper)
5473       Ptr = Ptr->getOperand(0);
5474
5475     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5476     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5477       return false;
5478
5479     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5480       DecodePSHUFBMask(C, Mask);
5481       break;
5482     }
5483
5484     return false;
5485   }
5486   case X86ISD::VPERMI:
5487     ImmN = N->getOperand(N->getNumOperands()-1);
5488     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5489     IsUnary = true;
5490     break;
5491   case X86ISD::MOVSS:
5492   case X86ISD::MOVSD:
5493     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5494     break;
5495   case X86ISD::VPERM2X128:
5496     ImmN = N->getOperand(N->getNumOperands()-1);
5497     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5498     if (Mask.empty()) return false;
5499     break;
5500   case X86ISD::MOVSLDUP:
5501     DecodeMOVSLDUPMask(VT, Mask);
5502     IsUnary = true;
5503     break;
5504   case X86ISD::MOVSHDUP:
5505     DecodeMOVSHDUPMask(VT, Mask);
5506     IsUnary = true;
5507     break;
5508   case X86ISD::MOVDDUP:
5509     DecodeMOVDDUPMask(VT, Mask);
5510     IsUnary = true;
5511     break;
5512   case X86ISD::MOVLHPD:
5513   case X86ISD::MOVLPD:
5514   case X86ISD::MOVLPS:
5515     // Not yet implemented
5516     return false;
5517   default: llvm_unreachable("unknown target shuffle node");
5518   }
5519
5520   // If we have a fake unary shuffle, the shuffle mask is spread across two
5521   // inputs that are actually the same node. Re-map the mask to always point
5522   // into the first input.
5523   if (IsFakeUnary)
5524     for (int &M : Mask)
5525       if (M >= (int)Mask.size())
5526         M -= Mask.size();
5527
5528   return true;
5529 }
5530
5531 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5532 /// element of the result of the vector shuffle.
5533 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5534                                    unsigned Depth) {
5535   if (Depth == 6)
5536     return SDValue();  // Limit search depth.
5537
5538   SDValue V = SDValue(N, 0);
5539   EVT VT = V.getValueType();
5540   unsigned Opcode = V.getOpcode();
5541
5542   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5543   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5544     int Elt = SV->getMaskElt(Index);
5545
5546     if (Elt < 0)
5547       return DAG.getUNDEF(VT.getVectorElementType());
5548
5549     unsigned NumElems = VT.getVectorNumElements();
5550     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5551                                          : SV->getOperand(1);
5552     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5553   }
5554
5555   // Recurse into target specific vector shuffles to find scalars.
5556   if (isTargetShuffle(Opcode)) {
5557     MVT ShufVT = V.getSimpleValueType();
5558     unsigned NumElems = ShufVT.getVectorNumElements();
5559     SmallVector<int, 16> ShuffleMask;
5560     bool IsUnary;
5561
5562     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5563       return SDValue();
5564
5565     int Elt = ShuffleMask[Index];
5566     if (Elt < 0)
5567       return DAG.getUNDEF(ShufVT.getVectorElementType());
5568
5569     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5570                                          : N->getOperand(1);
5571     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5572                                Depth+1);
5573   }
5574
5575   // Actual nodes that may contain scalar elements
5576   if (Opcode == ISD::BITCAST) {
5577     V = V.getOperand(0);
5578     EVT SrcVT = V.getValueType();
5579     unsigned NumElems = VT.getVectorNumElements();
5580
5581     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5582       return SDValue();
5583   }
5584
5585   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5586     return (Index == 0) ? V.getOperand(0)
5587                         : DAG.getUNDEF(VT.getVectorElementType());
5588
5589   if (V.getOpcode() == ISD::BUILD_VECTOR)
5590     return V.getOperand(Index);
5591
5592   return SDValue();
5593 }
5594
5595 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5596 /// shuffle operation which come from a consecutively from a zero. The
5597 /// search can start in two different directions, from left or right.
5598 /// We count undefs as zeros until PreferredNum is reached.
5599 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5600                                          unsigned NumElems, bool ZerosFromLeft,
5601                                          SelectionDAG &DAG,
5602                                          unsigned PreferredNum = -1U) {
5603   unsigned NumZeros = 0;
5604   for (unsigned i = 0; i != NumElems; ++i) {
5605     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5606     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5607     if (!Elt.getNode())
5608       break;
5609
5610     if (X86::isZeroNode(Elt))
5611       ++NumZeros;
5612     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5613       NumZeros = std::min(NumZeros + 1, PreferredNum);
5614     else
5615       break;
5616   }
5617
5618   return NumZeros;
5619 }
5620
5621 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5622 /// correspond consecutively to elements from one of the vector operands,
5623 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5624 static
5625 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5626                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5627                               unsigned NumElems, unsigned &OpNum) {
5628   bool SeenV1 = false;
5629   bool SeenV2 = false;
5630
5631   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5632     int Idx = SVOp->getMaskElt(i);
5633     // Ignore undef indicies
5634     if (Idx < 0)
5635       continue;
5636
5637     if (Idx < (int)NumElems)
5638       SeenV1 = true;
5639     else
5640       SeenV2 = true;
5641
5642     // Only accept consecutive elements from the same vector
5643     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5644       return false;
5645   }
5646
5647   OpNum = SeenV1 ? 0 : 1;
5648   return true;
5649 }
5650
5651 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5652 /// logical left shift of a vector.
5653 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5654                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5655   unsigned NumElems =
5656     SVOp->getSimpleValueType(0).getVectorNumElements();
5657   unsigned NumZeros = getNumOfConsecutiveZeros(
5658       SVOp, NumElems, false /* check zeros from right */, DAG,
5659       SVOp->getMaskElt(0));
5660   unsigned OpSrc;
5661
5662   if (!NumZeros)
5663     return false;
5664
5665   // Considering the elements in the mask that are not consecutive zeros,
5666   // check if they consecutively come from only one of the source vectors.
5667   //
5668   //               V1 = {X, A, B, C}     0
5669   //                         \  \  \    /
5670   //   vector_shuffle V1, V2 <1, 2, 3, X>
5671   //
5672   if (!isShuffleMaskConsecutive(SVOp,
5673             0,                   // Mask Start Index
5674             NumElems-NumZeros,   // Mask End Index(exclusive)
5675             NumZeros,            // Where to start looking in the src vector
5676             NumElems,            // Number of elements in vector
5677             OpSrc))              // Which source operand ?
5678     return false;
5679
5680   isLeft = false;
5681   ShAmt = NumZeros;
5682   ShVal = SVOp->getOperand(OpSrc);
5683   return true;
5684 }
5685
5686 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5687 /// logical left shift of a vector.
5688 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5689                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5690   unsigned NumElems =
5691     SVOp->getSimpleValueType(0).getVectorNumElements();
5692   unsigned NumZeros = getNumOfConsecutiveZeros(
5693       SVOp, NumElems, true /* check zeros from left */, DAG,
5694       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5695   unsigned OpSrc;
5696
5697   if (!NumZeros)
5698     return false;
5699
5700   // Considering the elements in the mask that are not consecutive zeros,
5701   // check if they consecutively come from only one of the source vectors.
5702   //
5703   //                           0    { A, B, X, X } = V2
5704   //                          / \    /  /
5705   //   vector_shuffle V1, V2 <X, X, 4, 5>
5706   //
5707   if (!isShuffleMaskConsecutive(SVOp,
5708             NumZeros,     // Mask Start Index
5709             NumElems,     // Mask End Index(exclusive)
5710             0,            // Where to start looking in the src vector
5711             NumElems,     // Number of elements in vector
5712             OpSrc))       // Which source operand ?
5713     return false;
5714
5715   isLeft = true;
5716   ShAmt = NumZeros;
5717   ShVal = SVOp->getOperand(OpSrc);
5718   return true;
5719 }
5720
5721 /// isVectorShift - Returns true if the shuffle can be implemented as a
5722 /// logical left or right shift of a vector.
5723 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5724                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5725   // Although the logic below support any bitwidth size, there are no
5726   // shift instructions which handle more than 128-bit vectors.
5727   if (!SVOp->getSimpleValueType(0).is128BitVector())
5728     return false;
5729
5730   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5731       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5732     return true;
5733
5734   return false;
5735 }
5736
5737 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5738 ///
5739 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5740                                        unsigned NumNonZero, unsigned NumZero,
5741                                        SelectionDAG &DAG,
5742                                        const X86Subtarget* Subtarget,
5743                                        const TargetLowering &TLI) {
5744   if (NumNonZero > 8)
5745     return SDValue();
5746
5747   SDLoc dl(Op);
5748   SDValue V;
5749   bool First = true;
5750   for (unsigned i = 0; i < 16; ++i) {
5751     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5752     if (ThisIsNonZero && First) {
5753       if (NumZero)
5754         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5755       else
5756         V = DAG.getUNDEF(MVT::v8i16);
5757       First = false;
5758     }
5759
5760     if ((i & 1) != 0) {
5761       SDValue ThisElt, LastElt;
5762       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5763       if (LastIsNonZero) {
5764         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5765                               MVT::i16, Op.getOperand(i-1));
5766       }
5767       if (ThisIsNonZero) {
5768         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5769         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5770                               ThisElt, DAG.getConstant(8, MVT::i8));
5771         if (LastIsNonZero)
5772           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5773       } else
5774         ThisElt = LastElt;
5775
5776       if (ThisElt.getNode())
5777         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5778                         DAG.getIntPtrConstant(i/2));
5779     }
5780   }
5781
5782   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5783 }
5784
5785 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5786 ///
5787 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5788                                      unsigned NumNonZero, unsigned NumZero,
5789                                      SelectionDAG &DAG,
5790                                      const X86Subtarget* Subtarget,
5791                                      const TargetLowering &TLI) {
5792   if (NumNonZero > 4)
5793     return SDValue();
5794
5795   SDLoc dl(Op);
5796   SDValue V;
5797   bool First = true;
5798   for (unsigned i = 0; i < 8; ++i) {
5799     bool isNonZero = (NonZeros & (1 << i)) != 0;
5800     if (isNonZero) {
5801       if (First) {
5802         if (NumZero)
5803           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5804         else
5805           V = DAG.getUNDEF(MVT::v8i16);
5806         First = false;
5807       }
5808       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5809                       MVT::v8i16, V, Op.getOperand(i),
5810                       DAG.getIntPtrConstant(i));
5811     }
5812   }
5813
5814   return V;
5815 }
5816
5817 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5818 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5819                                      const X86Subtarget *Subtarget,
5820                                      const TargetLowering &TLI) {
5821   // Find all zeroable elements.
5822   bool Zeroable[4];
5823   for (int i=0; i < 4; ++i) {
5824     SDValue Elt = Op->getOperand(i);
5825     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5826   }
5827   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5828                        [](bool M) { return !M; }) > 1 &&
5829          "We expect at least two non-zero elements!");
5830
5831   // We only know how to deal with build_vector nodes where elements are either
5832   // zeroable or extract_vector_elt with constant index.
5833   SDValue FirstNonZero;
5834   unsigned FirstNonZeroIdx;
5835   for (unsigned i=0; i < 4; ++i) {
5836     if (Zeroable[i])
5837       continue;
5838     SDValue Elt = Op->getOperand(i);
5839     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5840         !isa<ConstantSDNode>(Elt.getOperand(1)))
5841       return SDValue();
5842     // Make sure that this node is extracting from a 128-bit vector.
5843     MVT VT = Elt.getOperand(0).getSimpleValueType();
5844     if (!VT.is128BitVector())
5845       return SDValue();
5846     if (!FirstNonZero.getNode()) {
5847       FirstNonZero = Elt;
5848       FirstNonZeroIdx = i;
5849     }
5850   }
5851
5852   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5853   SDValue V1 = FirstNonZero.getOperand(0);
5854   MVT VT = V1.getSimpleValueType();
5855
5856   // See if this build_vector can be lowered as a blend with zero.
5857   SDValue Elt;
5858   unsigned EltMaskIdx, EltIdx;
5859   int Mask[4];
5860   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5861     if (Zeroable[EltIdx]) {
5862       // The zero vector will be on the right hand side.
5863       Mask[EltIdx] = EltIdx+4;
5864       continue;
5865     }
5866
5867     Elt = Op->getOperand(EltIdx);
5868     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5869     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5870     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5871       break;
5872     Mask[EltIdx] = EltIdx;
5873   }
5874
5875   if (EltIdx == 4) {
5876     // Let the shuffle legalizer deal with blend operations.
5877     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5878     if (V1.getSimpleValueType() != VT)
5879       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5880     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5881   }
5882
5883   // See if we can lower this build_vector to a INSERTPS.
5884   if (!Subtarget->hasSSE41())
5885     return SDValue();
5886
5887   SDValue V2 = Elt.getOperand(0);
5888   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5889     V1 = SDValue();
5890
5891   bool CanFold = true;
5892   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5893     if (Zeroable[i])
5894       continue;
5895
5896     SDValue Current = Op->getOperand(i);
5897     SDValue SrcVector = Current->getOperand(0);
5898     if (!V1.getNode())
5899       V1 = SrcVector;
5900     CanFold = SrcVector == V1 &&
5901       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5902   }
5903
5904   if (!CanFold)
5905     return SDValue();
5906
5907   assert(V1.getNode() && "Expected at least two non-zero elements!");
5908   if (V1.getSimpleValueType() != MVT::v4f32)
5909     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5910   if (V2.getSimpleValueType() != MVT::v4f32)
5911     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5912
5913   // Ok, we can emit an INSERTPS instruction.
5914   unsigned ZMask = 0;
5915   for (int i = 0; i < 4; ++i)
5916     if (Zeroable[i])
5917       ZMask |= 1 << i;
5918
5919   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5920   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5921   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5922                                DAG.getIntPtrConstant(InsertPSMask));
5923   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5924 }
5925
5926 /// Return a vector logical shift node.
5927 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5928                          unsigned NumBits, SelectionDAG &DAG,
5929                          const TargetLowering &TLI, SDLoc dl) {
5930   assert(VT.is128BitVector() && "Unknown type for VShift");
5931   MVT ShVT = MVT::v2i64;
5932   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5933   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5934   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5935   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5936   return DAG.getNode(ISD::BITCAST, dl, VT,
5937                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5938 }
5939
5940 static SDValue
5941 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5942
5943   // Check if the scalar load can be widened into a vector load. And if
5944   // the address is "base + cst" see if the cst can be "absorbed" into
5945   // the shuffle mask.
5946   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5947     SDValue Ptr = LD->getBasePtr();
5948     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5949       return SDValue();
5950     EVT PVT = LD->getValueType(0);
5951     if (PVT != MVT::i32 && PVT != MVT::f32)
5952       return SDValue();
5953
5954     int FI = -1;
5955     int64_t Offset = 0;
5956     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5957       FI = FINode->getIndex();
5958       Offset = 0;
5959     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5960                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5961       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5962       Offset = Ptr.getConstantOperandVal(1);
5963       Ptr = Ptr.getOperand(0);
5964     } else {
5965       return SDValue();
5966     }
5967
5968     // FIXME: 256-bit vector instructions don't require a strict alignment,
5969     // improve this code to support it better.
5970     unsigned RequiredAlign = VT.getSizeInBits()/8;
5971     SDValue Chain = LD->getChain();
5972     // Make sure the stack object alignment is at least 16 or 32.
5973     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5974     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5975       if (MFI->isFixedObjectIndex(FI)) {
5976         // Can't change the alignment. FIXME: It's possible to compute
5977         // the exact stack offset and reference FI + adjust offset instead.
5978         // If someone *really* cares about this. That's the way to implement it.
5979         return SDValue();
5980       } else {
5981         MFI->setObjectAlignment(FI, RequiredAlign);
5982       }
5983     }
5984
5985     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5986     // Ptr + (Offset & ~15).
5987     if (Offset < 0)
5988       return SDValue();
5989     if ((Offset % RequiredAlign) & 3)
5990       return SDValue();
5991     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5992     if (StartOffset)
5993       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5994                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5995
5996     int EltNo = (Offset - StartOffset) >> 2;
5997     unsigned NumElems = VT.getVectorNumElements();
5998
5999     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6000     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6001                              LD->getPointerInfo().getWithOffset(StartOffset),
6002                              false, false, false, 0);
6003
6004     SmallVector<int, 8> Mask;
6005     for (unsigned i = 0; i != NumElems; ++i)
6006       Mask.push_back(EltNo);
6007
6008     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6009   }
6010
6011   return SDValue();
6012 }
6013
6014 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6015 /// elements can be replaced by a single large load which has the same value as
6016 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6017 ///
6018 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6019 ///
6020 /// FIXME: we'd also like to handle the case where the last elements are zero
6021 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6022 /// There's even a handy isZeroNode for that purpose.
6023 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6024                                         SDLoc &DL, SelectionDAG &DAG,
6025                                         bool isAfterLegalize) {
6026   unsigned NumElems = Elts.size();
6027
6028   LoadSDNode *LDBase = nullptr;
6029   unsigned LastLoadedElt = -1U;
6030
6031   // For each element in the initializer, see if we've found a load or an undef.
6032   // If we don't find an initial load element, or later load elements are
6033   // non-consecutive, bail out.
6034   for (unsigned i = 0; i < NumElems; ++i) {
6035     SDValue Elt = Elts[i];
6036     // Look through a bitcast.
6037     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6038       Elt = Elt.getOperand(0);
6039     if (!Elt.getNode() ||
6040         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6041       return SDValue();
6042     if (!LDBase) {
6043       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6044         return SDValue();
6045       LDBase = cast<LoadSDNode>(Elt.getNode());
6046       LastLoadedElt = i;
6047       continue;
6048     }
6049     if (Elt.getOpcode() == ISD::UNDEF)
6050       continue;
6051
6052     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6053     EVT LdVT = Elt.getValueType();
6054     // Each loaded element must be the correct fractional portion of the
6055     // requested vector load.
6056     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6057       return SDValue();
6058     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6059       return SDValue();
6060     LastLoadedElt = i;
6061   }
6062
6063   // If we have found an entire vector of loads and undefs, then return a large
6064   // load of the entire vector width starting at the base pointer.  If we found
6065   // consecutive loads for the low half, generate a vzext_load node.
6066   if (LastLoadedElt == NumElems - 1) {
6067     assert(LDBase && "Did not find base load for merging consecutive loads");
6068     EVT EltVT = LDBase->getValueType(0);
6069     // Ensure that the input vector size for the merged loads matches the
6070     // cumulative size of the input elements.
6071     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6072       return SDValue();
6073
6074     if (isAfterLegalize &&
6075         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6076       return SDValue();
6077
6078     SDValue NewLd = SDValue();
6079
6080     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6081                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6082                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6083                         LDBase->getAlignment());
6084
6085     if (LDBase->hasAnyUseOfValue(1)) {
6086       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6087                                      SDValue(LDBase, 1),
6088                                      SDValue(NewLd.getNode(), 1));
6089       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6090       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6091                              SDValue(NewLd.getNode(), 1));
6092     }
6093
6094     return NewLd;
6095   }
6096
6097   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6098   //of a v4i32 / v4f32. It's probably worth generalizing.
6099   EVT EltVT = VT.getVectorElementType();
6100   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6101       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6102     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6103     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6104     SDValue ResNode =
6105         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6106                                 LDBase->getPointerInfo(),
6107                                 LDBase->getAlignment(),
6108                                 false/*isVolatile*/, true/*ReadMem*/,
6109                                 false/*WriteMem*/);
6110
6111     // Make sure the newly-created LOAD is in the same position as LDBase in
6112     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6113     // update uses of LDBase's output chain to use the TokenFactor.
6114     if (LDBase->hasAnyUseOfValue(1)) {
6115       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6116                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6117       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6118       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6119                              SDValue(ResNode.getNode(), 1));
6120     }
6121
6122     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6123   }
6124   return SDValue();
6125 }
6126
6127 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6128 /// to generate a splat value for the following cases:
6129 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6130 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6131 /// a scalar load, or a constant.
6132 /// The VBROADCAST node is returned when a pattern is found,
6133 /// or SDValue() otherwise.
6134 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6135                                     SelectionDAG &DAG) {
6136   // VBROADCAST requires AVX.
6137   // TODO: Splats could be generated for non-AVX CPUs using SSE
6138   // instructions, but there's less potential gain for only 128-bit vectors.
6139   if (!Subtarget->hasAVX())
6140     return SDValue();
6141
6142   MVT VT = Op.getSimpleValueType();
6143   SDLoc dl(Op);
6144
6145   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6146          "Unsupported vector type for broadcast.");
6147
6148   SDValue Ld;
6149   bool ConstSplatVal;
6150
6151   switch (Op.getOpcode()) {
6152     default:
6153       // Unknown pattern found.
6154       return SDValue();
6155
6156     case ISD::BUILD_VECTOR: {
6157       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6158       BitVector UndefElements;
6159       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6160
6161       // We need a splat of a single value to use broadcast, and it doesn't
6162       // make any sense if the value is only in one element of the vector.
6163       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6164         return SDValue();
6165
6166       Ld = Splat;
6167       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6168                        Ld.getOpcode() == ISD::ConstantFP);
6169
6170       // Make sure that all of the users of a non-constant load are from the
6171       // BUILD_VECTOR node.
6172       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6173         return SDValue();
6174       break;
6175     }
6176
6177     case ISD::VECTOR_SHUFFLE: {
6178       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6179
6180       // Shuffles must have a splat mask where the first element is
6181       // broadcasted.
6182       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6183         return SDValue();
6184
6185       SDValue Sc = Op.getOperand(0);
6186       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6187           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6188
6189         if (!Subtarget->hasInt256())
6190           return SDValue();
6191
6192         // Use the register form of the broadcast instruction available on AVX2.
6193         if (VT.getSizeInBits() >= 256)
6194           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6195         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6196       }
6197
6198       Ld = Sc.getOperand(0);
6199       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6200                        Ld.getOpcode() == ISD::ConstantFP);
6201
6202       // The scalar_to_vector node and the suspected
6203       // load node must have exactly one user.
6204       // Constants may have multiple users.
6205
6206       // AVX-512 has register version of the broadcast
6207       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6208         Ld.getValueType().getSizeInBits() >= 32;
6209       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6210           !hasRegVer))
6211         return SDValue();
6212       break;
6213     }
6214   }
6215
6216   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6217   bool IsGE256 = (VT.getSizeInBits() >= 256);
6218
6219   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6220   // instruction to save 8 or more bytes of constant pool data.
6221   // TODO: If multiple splats are generated to load the same constant,
6222   // it may be detrimental to overall size. There needs to be a way to detect
6223   // that condition to know if this is truly a size win.
6224   const Function *F = DAG.getMachineFunction().getFunction();
6225   bool OptForSize = F->getAttributes().
6226     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6227
6228   // Handle broadcasting a single constant scalar from the constant pool
6229   // into a vector.
6230   // On Sandybridge (no AVX2), it is still better to load a constant vector
6231   // from the constant pool and not to broadcast it from a scalar.
6232   // But override that restriction when optimizing for size.
6233   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6234   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6235     EVT CVT = Ld.getValueType();
6236     assert(!CVT.isVector() && "Must not broadcast a vector type");
6237
6238     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6239     // For size optimization, also splat v2f64 and v2i64, and for size opt
6240     // with AVX2, also splat i8 and i16.
6241     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6242     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6243         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6244       const Constant *C = nullptr;
6245       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6246         C = CI->getConstantIntValue();
6247       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6248         C = CF->getConstantFPValue();
6249
6250       assert(C && "Invalid constant type");
6251
6252       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6253       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6254       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6255       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6256                        MachinePointerInfo::getConstantPool(),
6257                        false, false, false, Alignment);
6258
6259       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6260     }
6261   }
6262
6263   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6264
6265   // Handle AVX2 in-register broadcasts.
6266   if (!IsLoad && Subtarget->hasInt256() &&
6267       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6268     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6269
6270   // The scalar source must be a normal load.
6271   if (!IsLoad)
6272     return SDValue();
6273
6274   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6275       (Subtarget->hasVLX() && ScalarSize == 64))
6276     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6277
6278   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6279   // double since there is no vbroadcastsd xmm
6280   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6281     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6282       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6283   }
6284
6285   // Unsupported broadcast.
6286   return SDValue();
6287 }
6288
6289 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6290 /// underlying vector and index.
6291 ///
6292 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6293 /// index.
6294 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6295                                          SDValue ExtIdx) {
6296   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6297   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6298     return Idx;
6299
6300   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6301   // lowered this:
6302   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6303   // to:
6304   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6305   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6306   //                           undef)
6307   //                       Constant<0>)
6308   // In this case the vector is the extract_subvector expression and the index
6309   // is 2, as specified by the shuffle.
6310   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6311   SDValue ShuffleVec = SVOp->getOperand(0);
6312   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6313   assert(ShuffleVecVT.getVectorElementType() ==
6314          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6315
6316   int ShuffleIdx = SVOp->getMaskElt(Idx);
6317   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6318     ExtractedFromVec = ShuffleVec;
6319     return ShuffleIdx;
6320   }
6321   return Idx;
6322 }
6323
6324 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6325   MVT VT = Op.getSimpleValueType();
6326
6327   // Skip if insert_vec_elt is not supported.
6328   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6329   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6330     return SDValue();
6331
6332   SDLoc DL(Op);
6333   unsigned NumElems = Op.getNumOperands();
6334
6335   SDValue VecIn1;
6336   SDValue VecIn2;
6337   SmallVector<unsigned, 4> InsertIndices;
6338   SmallVector<int, 8> Mask(NumElems, -1);
6339
6340   for (unsigned i = 0; i != NumElems; ++i) {
6341     unsigned Opc = Op.getOperand(i).getOpcode();
6342
6343     if (Opc == ISD::UNDEF)
6344       continue;
6345
6346     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6347       // Quit if more than 1 elements need inserting.
6348       if (InsertIndices.size() > 1)
6349         return SDValue();
6350
6351       InsertIndices.push_back(i);
6352       continue;
6353     }
6354
6355     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6356     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6357     // Quit if non-constant index.
6358     if (!isa<ConstantSDNode>(ExtIdx))
6359       return SDValue();
6360     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6361
6362     // Quit if extracted from vector of different type.
6363     if (ExtractedFromVec.getValueType() != VT)
6364       return SDValue();
6365
6366     if (!VecIn1.getNode())
6367       VecIn1 = ExtractedFromVec;
6368     else if (VecIn1 != ExtractedFromVec) {
6369       if (!VecIn2.getNode())
6370         VecIn2 = ExtractedFromVec;
6371       else if (VecIn2 != ExtractedFromVec)
6372         // Quit if more than 2 vectors to shuffle
6373         return SDValue();
6374     }
6375
6376     if (ExtractedFromVec == VecIn1)
6377       Mask[i] = Idx;
6378     else if (ExtractedFromVec == VecIn2)
6379       Mask[i] = Idx + NumElems;
6380   }
6381
6382   if (!VecIn1.getNode())
6383     return SDValue();
6384
6385   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6386   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6387   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6388     unsigned Idx = InsertIndices[i];
6389     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6390                      DAG.getIntPtrConstant(Idx));
6391   }
6392
6393   return NV;
6394 }
6395
6396 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6397 SDValue
6398 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6399
6400   MVT VT = Op.getSimpleValueType();
6401   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6402          "Unexpected type in LowerBUILD_VECTORvXi1!");
6403
6404   SDLoc dl(Op);
6405   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6406     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6407     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6408     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6409   }
6410
6411   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6412     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6413     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6414     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6415   }
6416
6417   bool AllContants = true;
6418   uint64_t Immediate = 0;
6419   int NonConstIdx = -1;
6420   bool IsSplat = true;
6421   unsigned NumNonConsts = 0;
6422   unsigned NumConsts = 0;
6423   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6424     SDValue In = Op.getOperand(idx);
6425     if (In.getOpcode() == ISD::UNDEF)
6426       continue;
6427     if (!isa<ConstantSDNode>(In)) {
6428       AllContants = false;
6429       NonConstIdx = idx;
6430       NumNonConsts++;
6431     } else {
6432       NumConsts++;
6433       if (cast<ConstantSDNode>(In)->getZExtValue())
6434       Immediate |= (1ULL << idx);
6435     }
6436     if (In != Op.getOperand(0))
6437       IsSplat = false;
6438   }
6439
6440   if (AllContants) {
6441     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6442       DAG.getConstant(Immediate, MVT::i16));
6443     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6444                        DAG.getIntPtrConstant(0));
6445   }
6446
6447   if (NumNonConsts == 1 && NonConstIdx != 0) {
6448     SDValue DstVec;
6449     if (NumConsts) {
6450       SDValue VecAsImm = DAG.getConstant(Immediate,
6451                                          MVT::getIntegerVT(VT.getSizeInBits()));
6452       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6453     }
6454     else
6455       DstVec = DAG.getUNDEF(VT);
6456     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6457                        Op.getOperand(NonConstIdx),
6458                        DAG.getIntPtrConstant(NonConstIdx));
6459   }
6460   if (!IsSplat && (NonConstIdx != 0))
6461     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6462   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6463   SDValue Select;
6464   if (IsSplat)
6465     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6466                           DAG.getConstant(-1, SelectVT),
6467                           DAG.getConstant(0, SelectVT));
6468   else
6469     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6470                          DAG.getConstant((Immediate | 1), SelectVT),
6471                          DAG.getConstant(Immediate, SelectVT));
6472   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6473 }
6474
6475 /// \brief Return true if \p N implements a horizontal binop and return the
6476 /// operands for the horizontal binop into V0 and V1.
6477 ///
6478 /// This is a helper function of PerformBUILD_VECTORCombine.
6479 /// This function checks that the build_vector \p N in input implements a
6480 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6481 /// operation to match.
6482 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6483 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6484 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6485 /// arithmetic sub.
6486 ///
6487 /// This function only analyzes elements of \p N whose indices are
6488 /// in range [BaseIdx, LastIdx).
6489 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6490                               SelectionDAG &DAG,
6491                               unsigned BaseIdx, unsigned LastIdx,
6492                               SDValue &V0, SDValue &V1) {
6493   EVT VT = N->getValueType(0);
6494
6495   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6496   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6497          "Invalid Vector in input!");
6498
6499   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6500   bool CanFold = true;
6501   unsigned ExpectedVExtractIdx = BaseIdx;
6502   unsigned NumElts = LastIdx - BaseIdx;
6503   V0 = DAG.getUNDEF(VT);
6504   V1 = DAG.getUNDEF(VT);
6505
6506   // Check if N implements a horizontal binop.
6507   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6508     SDValue Op = N->getOperand(i + BaseIdx);
6509
6510     // Skip UNDEFs.
6511     if (Op->getOpcode() == ISD::UNDEF) {
6512       // Update the expected vector extract index.
6513       if (i * 2 == NumElts)
6514         ExpectedVExtractIdx = BaseIdx;
6515       ExpectedVExtractIdx += 2;
6516       continue;
6517     }
6518
6519     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6520
6521     if (!CanFold)
6522       break;
6523
6524     SDValue Op0 = Op.getOperand(0);
6525     SDValue Op1 = Op.getOperand(1);
6526
6527     // Try to match the following pattern:
6528     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6529     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6530         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6531         Op0.getOperand(0) == Op1.getOperand(0) &&
6532         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6533         isa<ConstantSDNode>(Op1.getOperand(1)));
6534     if (!CanFold)
6535       break;
6536
6537     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6538     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6539
6540     if (i * 2 < NumElts) {
6541       if (V0.getOpcode() == ISD::UNDEF)
6542         V0 = Op0.getOperand(0);
6543     } else {
6544       if (V1.getOpcode() == ISD::UNDEF)
6545         V1 = Op0.getOperand(0);
6546       if (i * 2 == NumElts)
6547         ExpectedVExtractIdx = BaseIdx;
6548     }
6549
6550     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6551     if (I0 == ExpectedVExtractIdx)
6552       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6553     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6554       // Try to match the following dag sequence:
6555       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6556       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6557     } else
6558       CanFold = false;
6559
6560     ExpectedVExtractIdx += 2;
6561   }
6562
6563   return CanFold;
6564 }
6565
6566 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6567 /// a concat_vector.
6568 ///
6569 /// This is a helper function of PerformBUILD_VECTORCombine.
6570 /// This function expects two 256-bit vectors called V0 and V1.
6571 /// At first, each vector is split into two separate 128-bit vectors.
6572 /// Then, the resulting 128-bit vectors are used to implement two
6573 /// horizontal binary operations.
6574 ///
6575 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6576 ///
6577 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6578 /// the two new horizontal binop.
6579 /// When Mode is set, the first horizontal binop dag node would take as input
6580 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6581 /// horizontal binop dag node would take as input the lower 128-bit of V1
6582 /// and the upper 128-bit of V1.
6583 ///   Example:
6584 ///     HADD V0_LO, V0_HI
6585 ///     HADD V1_LO, V1_HI
6586 ///
6587 /// Otherwise, the first horizontal binop dag node takes as input the lower
6588 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6589 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6590 ///   Example:
6591 ///     HADD V0_LO, V1_LO
6592 ///     HADD V0_HI, V1_HI
6593 ///
6594 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6595 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6596 /// the upper 128-bits of the result.
6597 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6598                                      SDLoc DL, SelectionDAG &DAG,
6599                                      unsigned X86Opcode, bool Mode,
6600                                      bool isUndefLO, bool isUndefHI) {
6601   EVT VT = V0.getValueType();
6602   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6603          "Invalid nodes in input!");
6604
6605   unsigned NumElts = VT.getVectorNumElements();
6606   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6607   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6608   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6609   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6610   EVT NewVT = V0_LO.getValueType();
6611
6612   SDValue LO = DAG.getUNDEF(NewVT);
6613   SDValue HI = DAG.getUNDEF(NewVT);
6614
6615   if (Mode) {
6616     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6617     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6618       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6619     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6620       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6621   } else {
6622     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6623     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6624                        V1_LO->getOpcode() != ISD::UNDEF))
6625       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6626
6627     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6628                        V1_HI->getOpcode() != ISD::UNDEF))
6629       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6630   }
6631
6632   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6633 }
6634
6635 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6636 /// sequence of 'vadd + vsub + blendi'.
6637 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6638                            const X86Subtarget *Subtarget) {
6639   SDLoc DL(BV);
6640   EVT VT = BV->getValueType(0);
6641   unsigned NumElts = VT.getVectorNumElements();
6642   SDValue InVec0 = DAG.getUNDEF(VT);
6643   SDValue InVec1 = DAG.getUNDEF(VT);
6644
6645   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6646           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6647
6648   // Odd-numbered elements in the input build vector are obtained from
6649   // adding two integer/float elements.
6650   // Even-numbered elements in the input build vector are obtained from
6651   // subtracting two integer/float elements.
6652   unsigned ExpectedOpcode = ISD::FSUB;
6653   unsigned NextExpectedOpcode = ISD::FADD;
6654   bool AddFound = false;
6655   bool SubFound = false;
6656
6657   for (unsigned i = 0, e = NumElts; i != e; i++) {
6658     SDValue Op = BV->getOperand(i);
6659
6660     // Skip 'undef' values.
6661     unsigned Opcode = Op.getOpcode();
6662     if (Opcode == ISD::UNDEF) {
6663       std::swap(ExpectedOpcode, NextExpectedOpcode);
6664       continue;
6665     }
6666
6667     // Early exit if we found an unexpected opcode.
6668     if (Opcode != ExpectedOpcode)
6669       return SDValue();
6670
6671     SDValue Op0 = Op.getOperand(0);
6672     SDValue Op1 = Op.getOperand(1);
6673
6674     // Try to match the following pattern:
6675     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6676     // Early exit if we cannot match that sequence.
6677     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6678         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6679         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6680         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6681         Op0.getOperand(1) != Op1.getOperand(1))
6682       return SDValue();
6683
6684     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6685     if (I0 != i)
6686       return SDValue();
6687
6688     // We found a valid add/sub node. Update the information accordingly.
6689     if (i & 1)
6690       AddFound = true;
6691     else
6692       SubFound = true;
6693
6694     // Update InVec0 and InVec1.
6695     if (InVec0.getOpcode() == ISD::UNDEF)
6696       InVec0 = Op0.getOperand(0);
6697     if (InVec1.getOpcode() == ISD::UNDEF)
6698       InVec1 = Op1.getOperand(0);
6699
6700     // Make sure that operands in input to each add/sub node always
6701     // come from a same pair of vectors.
6702     if (InVec0 != Op0.getOperand(0)) {
6703       if (ExpectedOpcode == ISD::FSUB)
6704         return SDValue();
6705
6706       // FADD is commutable. Try to commute the operands
6707       // and then test again.
6708       std::swap(Op0, Op1);
6709       if (InVec0 != Op0.getOperand(0))
6710         return SDValue();
6711     }
6712
6713     if (InVec1 != Op1.getOperand(0))
6714       return SDValue();
6715
6716     // Update the pair of expected opcodes.
6717     std::swap(ExpectedOpcode, NextExpectedOpcode);
6718   }
6719
6720   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6721   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6722       InVec1.getOpcode() != ISD::UNDEF)
6723     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6724
6725   return SDValue();
6726 }
6727
6728 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6729                                           const X86Subtarget *Subtarget) {
6730   SDLoc DL(N);
6731   EVT VT = N->getValueType(0);
6732   unsigned NumElts = VT.getVectorNumElements();
6733   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6734   SDValue InVec0, InVec1;
6735
6736   // Try to match an ADDSUB.
6737   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6738       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6739     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6740     if (Value.getNode())
6741       return Value;
6742   }
6743
6744   // Try to match horizontal ADD/SUB.
6745   unsigned NumUndefsLO = 0;
6746   unsigned NumUndefsHI = 0;
6747   unsigned Half = NumElts/2;
6748
6749   // Count the number of UNDEF operands in the build_vector in input.
6750   for (unsigned i = 0, e = Half; i != e; ++i)
6751     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6752       NumUndefsLO++;
6753
6754   for (unsigned i = Half, e = NumElts; i != e; ++i)
6755     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6756       NumUndefsHI++;
6757
6758   // Early exit if this is either a build_vector of all UNDEFs or all the
6759   // operands but one are UNDEF.
6760   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6761     return SDValue();
6762
6763   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6764     // Try to match an SSE3 float HADD/HSUB.
6765     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6766       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6767
6768     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6769       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6770   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6771     // Try to match an SSSE3 integer HADD/HSUB.
6772     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6773       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6774
6775     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6776       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6777   }
6778
6779   if (!Subtarget->hasAVX())
6780     return SDValue();
6781
6782   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6783     // Try to match an AVX horizontal add/sub of packed single/double
6784     // precision floating point values from 256-bit vectors.
6785     SDValue InVec2, InVec3;
6786     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6787         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6788         ((InVec0.getOpcode() == ISD::UNDEF ||
6789           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6790         ((InVec1.getOpcode() == ISD::UNDEF ||
6791           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6792       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6793
6794     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6795         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6796         ((InVec0.getOpcode() == ISD::UNDEF ||
6797           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6798         ((InVec1.getOpcode() == ISD::UNDEF ||
6799           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6800       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6801   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6802     // Try to match an AVX2 horizontal add/sub of signed integers.
6803     SDValue InVec2, InVec3;
6804     unsigned X86Opcode;
6805     bool CanFold = true;
6806
6807     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6808         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6809         ((InVec0.getOpcode() == ISD::UNDEF ||
6810           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6811         ((InVec1.getOpcode() == ISD::UNDEF ||
6812           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6813       X86Opcode = X86ISD::HADD;
6814     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6815         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6816         ((InVec0.getOpcode() == ISD::UNDEF ||
6817           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6818         ((InVec1.getOpcode() == ISD::UNDEF ||
6819           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6820       X86Opcode = X86ISD::HSUB;
6821     else
6822       CanFold = false;
6823
6824     if (CanFold) {
6825       // Fold this build_vector into a single horizontal add/sub.
6826       // Do this only if the target has AVX2.
6827       if (Subtarget->hasAVX2())
6828         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6829
6830       // Do not try to expand this build_vector into a pair of horizontal
6831       // add/sub if we can emit a pair of scalar add/sub.
6832       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6833         return SDValue();
6834
6835       // Convert this build_vector into a pair of horizontal binop followed by
6836       // a concat vector.
6837       bool isUndefLO = NumUndefsLO == Half;
6838       bool isUndefHI = NumUndefsHI == Half;
6839       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6840                                    isUndefLO, isUndefHI);
6841     }
6842   }
6843
6844   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6845        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6846     unsigned X86Opcode;
6847     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6848       X86Opcode = X86ISD::HADD;
6849     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6850       X86Opcode = X86ISD::HSUB;
6851     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6852       X86Opcode = X86ISD::FHADD;
6853     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6854       X86Opcode = X86ISD::FHSUB;
6855     else
6856       return SDValue();
6857
6858     // Don't try to expand this build_vector into a pair of horizontal add/sub
6859     // if we can simply emit a pair of scalar add/sub.
6860     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6861       return SDValue();
6862
6863     // Convert this build_vector into two horizontal add/sub followed by
6864     // a concat vector.
6865     bool isUndefLO = NumUndefsLO == Half;
6866     bool isUndefHI = NumUndefsHI == Half;
6867     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6868                                  isUndefLO, isUndefHI);
6869   }
6870
6871   return SDValue();
6872 }
6873
6874 SDValue
6875 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6876   SDLoc dl(Op);
6877
6878   MVT VT = Op.getSimpleValueType();
6879   MVT ExtVT = VT.getVectorElementType();
6880   unsigned NumElems = Op.getNumOperands();
6881
6882   // Generate vectors for predicate vectors.
6883   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6884     return LowerBUILD_VECTORvXi1(Op, DAG);
6885
6886   // Vectors containing all zeros can be matched by pxor and xorps later
6887   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6888     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6889     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6890     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6891       return Op;
6892
6893     return getZeroVector(VT, Subtarget, DAG, dl);
6894   }
6895
6896   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6897   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6898   // vpcmpeqd on 256-bit vectors.
6899   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6900     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6901       return Op;
6902
6903     if (!VT.is512BitVector())
6904       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6905   }
6906
6907   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6908   if (Broadcast.getNode())
6909     return Broadcast;
6910
6911   unsigned EVTBits = ExtVT.getSizeInBits();
6912
6913   unsigned NumZero  = 0;
6914   unsigned NumNonZero = 0;
6915   unsigned NonZeros = 0;
6916   bool IsAllConstants = true;
6917   SmallSet<SDValue, 8> Values;
6918   for (unsigned i = 0; i < NumElems; ++i) {
6919     SDValue Elt = Op.getOperand(i);
6920     if (Elt.getOpcode() == ISD::UNDEF)
6921       continue;
6922     Values.insert(Elt);
6923     if (Elt.getOpcode() != ISD::Constant &&
6924         Elt.getOpcode() != ISD::ConstantFP)
6925       IsAllConstants = false;
6926     if (X86::isZeroNode(Elt))
6927       NumZero++;
6928     else {
6929       NonZeros |= (1 << i);
6930       NumNonZero++;
6931     }
6932   }
6933
6934   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6935   if (NumNonZero == 0)
6936     return DAG.getUNDEF(VT);
6937
6938   // Special case for single non-zero, non-undef, element.
6939   if (NumNonZero == 1) {
6940     unsigned Idx = countTrailingZeros(NonZeros);
6941     SDValue Item = Op.getOperand(Idx);
6942
6943     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6944     // the value are obviously zero, truncate the value to i32 and do the
6945     // insertion that way.  Only do this if the value is non-constant or if the
6946     // value is a constant being inserted into element 0.  It is cheaper to do
6947     // a constant pool load than it is to do a movd + shuffle.
6948     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6949         (!IsAllConstants || Idx == 0)) {
6950       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6951         // Handle SSE only.
6952         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6953         EVT VecVT = MVT::v4i32;
6954         unsigned VecElts = 4;
6955
6956         // Truncate the value (which may itself be a constant) to i32, and
6957         // convert it to a vector with movd (S2V+shuffle to zero extend).
6958         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6959         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6960
6961         // If using the new shuffle lowering, just directly insert this.
6962         if (ExperimentalVectorShuffleLowering)
6963           return DAG.getNode(
6964               ISD::BITCAST, dl, VT,
6965               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6966
6967         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6968
6969         // Now we have our 32-bit value zero extended in the low element of
6970         // a vector.  If Idx != 0, swizzle it into place.
6971         if (Idx != 0) {
6972           SmallVector<int, 4> Mask;
6973           Mask.push_back(Idx);
6974           for (unsigned i = 1; i != VecElts; ++i)
6975             Mask.push_back(i);
6976           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6977                                       &Mask[0]);
6978         }
6979         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6980       }
6981     }
6982
6983     // If we have a constant or non-constant insertion into the low element of
6984     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6985     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6986     // depending on what the source datatype is.
6987     if (Idx == 0) {
6988       if (NumZero == 0)
6989         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6990
6991       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6992           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6993         if (VT.is256BitVector() || VT.is512BitVector()) {
6994           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6995           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6996                              Item, DAG.getIntPtrConstant(0));
6997         }
6998         assert(VT.is128BitVector() && "Expected an SSE value type!");
6999         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7000         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7001         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7002       }
7003
7004       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7005         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7006         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7007         if (VT.is256BitVector()) {
7008           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7009           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7010         } else {
7011           assert(VT.is128BitVector() && "Expected an SSE value type!");
7012           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7013         }
7014         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7015       }
7016     }
7017
7018     // Is it a vector logical left shift?
7019     if (NumElems == 2 && Idx == 1 &&
7020         X86::isZeroNode(Op.getOperand(0)) &&
7021         !X86::isZeroNode(Op.getOperand(1))) {
7022       unsigned NumBits = VT.getSizeInBits();
7023       return getVShift(true, VT,
7024                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7025                                    VT, Op.getOperand(1)),
7026                        NumBits/2, DAG, *this, dl);
7027     }
7028
7029     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7030       return SDValue();
7031
7032     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7033     // is a non-constant being inserted into an element other than the low one,
7034     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7035     // movd/movss) to move this into the low element, then shuffle it into
7036     // place.
7037     if (EVTBits == 32) {
7038       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7039
7040       // If using the new shuffle lowering, just directly insert this.
7041       if (ExperimentalVectorShuffleLowering)
7042         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7043
7044       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7045       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7046       SmallVector<int, 8> MaskVec;
7047       for (unsigned i = 0; i != NumElems; ++i)
7048         MaskVec.push_back(i == Idx ? 0 : 1);
7049       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7050     }
7051   }
7052
7053   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7054   if (Values.size() == 1) {
7055     if (EVTBits == 32) {
7056       // Instead of a shuffle like this:
7057       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7058       // Check if it's possible to issue this instead.
7059       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7060       unsigned Idx = countTrailingZeros(NonZeros);
7061       SDValue Item = Op.getOperand(Idx);
7062       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7063         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7064     }
7065     return SDValue();
7066   }
7067
7068   // A vector full of immediates; various special cases are already
7069   // handled, so this is best done with a single constant-pool load.
7070   if (IsAllConstants)
7071     return SDValue();
7072
7073   // For AVX-length vectors, see if we can use a vector load to get all of the
7074   // elements, otherwise build the individual 128-bit pieces and use
7075   // shuffles to put them in place.
7076   if (VT.is256BitVector() || VT.is512BitVector()) {
7077     SmallVector<SDValue, 64> V;
7078     for (unsigned i = 0; i != NumElems; ++i)
7079       V.push_back(Op.getOperand(i));
7080
7081     // Check for a build vector of consecutive loads.
7082     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7083       return LD;
7084
7085     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7086
7087     // Build both the lower and upper subvector.
7088     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7089                                 makeArrayRef(&V[0], NumElems/2));
7090     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7091                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7092
7093     // Recreate the wider vector with the lower and upper part.
7094     if (VT.is256BitVector())
7095       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7096     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7097   }
7098
7099   // Let legalizer expand 2-wide build_vectors.
7100   if (EVTBits == 64) {
7101     if (NumNonZero == 1) {
7102       // One half is zero or undef.
7103       unsigned Idx = countTrailingZeros(NonZeros);
7104       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7105                                  Op.getOperand(Idx));
7106       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7107     }
7108     return SDValue();
7109   }
7110
7111   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7112   if (EVTBits == 8 && NumElems == 16) {
7113     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7114                                         Subtarget, *this);
7115     if (V.getNode()) return V;
7116   }
7117
7118   if (EVTBits == 16 && NumElems == 8) {
7119     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7120                                       Subtarget, *this);
7121     if (V.getNode()) return V;
7122   }
7123
7124   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7125   if (EVTBits == 32 && NumElems == 4) {
7126     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7127     if (V.getNode())
7128       return V;
7129   }
7130
7131   // If element VT is == 32 bits, turn it into a number of shuffles.
7132   SmallVector<SDValue, 8> V(NumElems);
7133   if (NumElems == 4 && NumZero > 0) {
7134     for (unsigned i = 0; i < 4; ++i) {
7135       bool isZero = !(NonZeros & (1 << i));
7136       if (isZero)
7137         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7138       else
7139         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7140     }
7141
7142     for (unsigned i = 0; i < 2; ++i) {
7143       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7144         default: break;
7145         case 0:
7146           V[i] = V[i*2];  // Must be a zero vector.
7147           break;
7148         case 1:
7149           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7150           break;
7151         case 2:
7152           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7153           break;
7154         case 3:
7155           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7156           break;
7157       }
7158     }
7159
7160     bool Reverse1 = (NonZeros & 0x3) == 2;
7161     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7162     int MaskVec[] = {
7163       Reverse1 ? 1 : 0,
7164       Reverse1 ? 0 : 1,
7165       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7166       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7167     };
7168     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7169   }
7170
7171   if (Values.size() > 1 && VT.is128BitVector()) {
7172     // Check for a build vector of consecutive loads.
7173     for (unsigned i = 0; i < NumElems; ++i)
7174       V[i] = Op.getOperand(i);
7175
7176     // Check for elements which are consecutive loads.
7177     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7178     if (LD.getNode())
7179       return LD;
7180
7181     // Check for a build vector from mostly shuffle plus few inserting.
7182     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7183     if (Sh.getNode())
7184       return Sh;
7185
7186     // For SSE 4.1, use insertps to put the high elements into the low element.
7187     if (Subtarget->hasSSE41()) {
7188       SDValue Result;
7189       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7190         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7191       else
7192         Result = DAG.getUNDEF(VT);
7193
7194       for (unsigned i = 1; i < NumElems; ++i) {
7195         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7196         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7197                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7198       }
7199       return Result;
7200     }
7201
7202     // Otherwise, expand into a number of unpckl*, start by extending each of
7203     // our (non-undef) elements to the full vector width with the element in the
7204     // bottom slot of the vector (which generates no code for SSE).
7205     for (unsigned i = 0; i < NumElems; ++i) {
7206       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7207         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7208       else
7209         V[i] = DAG.getUNDEF(VT);
7210     }
7211
7212     // Next, we iteratively mix elements, e.g. for v4f32:
7213     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7214     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7215     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7216     unsigned EltStride = NumElems >> 1;
7217     while (EltStride != 0) {
7218       for (unsigned i = 0; i < EltStride; ++i) {
7219         // If V[i+EltStride] is undef and this is the first round of mixing,
7220         // then it is safe to just drop this shuffle: V[i] is already in the
7221         // right place, the one element (since it's the first round) being
7222         // inserted as undef can be dropped.  This isn't safe for successive
7223         // rounds because they will permute elements within both vectors.
7224         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7225             EltStride == NumElems/2)
7226           continue;
7227
7228         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7229       }
7230       EltStride >>= 1;
7231     }
7232     return V[0];
7233   }
7234   return SDValue();
7235 }
7236
7237 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7238 // to create 256-bit vectors from two other 128-bit ones.
7239 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7240   SDLoc dl(Op);
7241   MVT ResVT = Op.getSimpleValueType();
7242
7243   assert((ResVT.is256BitVector() ||
7244           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7245
7246   SDValue V1 = Op.getOperand(0);
7247   SDValue V2 = Op.getOperand(1);
7248   unsigned NumElems = ResVT.getVectorNumElements();
7249   if(ResVT.is256BitVector())
7250     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7251
7252   if (Op.getNumOperands() == 4) {
7253     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7254                                 ResVT.getVectorNumElements()/2);
7255     SDValue V3 = Op.getOperand(2);
7256     SDValue V4 = Op.getOperand(3);
7257     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7258       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7259   }
7260   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7261 }
7262
7263 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7264   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7265   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7266          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7267           Op.getNumOperands() == 4)));
7268
7269   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7270   // from two other 128-bit ones.
7271
7272   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7273   return LowerAVXCONCAT_VECTORS(Op, DAG);
7274 }
7275
7276
7277 //===----------------------------------------------------------------------===//
7278 // Vector shuffle lowering
7279 //
7280 // This is an experimental code path for lowering vector shuffles on x86. It is
7281 // designed to handle arbitrary vector shuffles and blends, gracefully
7282 // degrading performance as necessary. It works hard to recognize idiomatic
7283 // shuffles and lower them to optimal instruction patterns without leaving
7284 // a framework that allows reasonably efficient handling of all vector shuffle
7285 // patterns.
7286 //===----------------------------------------------------------------------===//
7287
7288 /// \brief Tiny helper function to identify a no-op mask.
7289 ///
7290 /// This is a somewhat boring predicate function. It checks whether the mask
7291 /// array input, which is assumed to be a single-input shuffle mask of the kind
7292 /// used by the X86 shuffle instructions (not a fully general
7293 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7294 /// in-place shuffle are 'no-op's.
7295 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7296   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7297     if (Mask[i] != -1 && Mask[i] != i)
7298       return false;
7299   return true;
7300 }
7301
7302 /// \brief Helper function to classify a mask as a single-input mask.
7303 ///
7304 /// This isn't a generic single-input test because in the vector shuffle
7305 /// lowering we canonicalize single inputs to be the first input operand. This
7306 /// means we can more quickly test for a single input by only checking whether
7307 /// an input from the second operand exists. We also assume that the size of
7308 /// mask corresponds to the size of the input vectors which isn't true in the
7309 /// fully general case.
7310 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7311   for (int M : Mask)
7312     if (M >= (int)Mask.size())
7313       return false;
7314   return true;
7315 }
7316
7317 /// \brief Test whether there are elements crossing 128-bit lanes in this
7318 /// shuffle mask.
7319 ///
7320 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7321 /// and we routinely test for these.
7322 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7323   int LaneSize = 128 / VT.getScalarSizeInBits();
7324   int Size = Mask.size();
7325   for (int i = 0; i < Size; ++i)
7326     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7327       return true;
7328   return false;
7329 }
7330
7331 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7332 ///
7333 /// This checks a shuffle mask to see if it is performing the same
7334 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7335 /// that it is also not lane-crossing. It may however involve a blend from the
7336 /// same lane of a second vector.
7337 ///
7338 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7339 /// non-trivial to compute in the face of undef lanes. The representation is
7340 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7341 /// entries from both V1 and V2 inputs to the wider mask.
7342 static bool
7343 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7344                                 SmallVectorImpl<int> &RepeatedMask) {
7345   int LaneSize = 128 / VT.getScalarSizeInBits();
7346   RepeatedMask.resize(LaneSize, -1);
7347   int Size = Mask.size();
7348   for (int i = 0; i < Size; ++i) {
7349     if (Mask[i] < 0)
7350       continue;
7351     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7352       // This entry crosses lanes, so there is no way to model this shuffle.
7353       return false;
7354
7355     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7356     if (RepeatedMask[i % LaneSize] == -1)
7357       // This is the first non-undef entry in this slot of a 128-bit lane.
7358       RepeatedMask[i % LaneSize] =
7359           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7360     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7361       // Found a mismatch with the repeated mask.
7362       return false;
7363   }
7364   return true;
7365 }
7366
7367 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7368 // 2013 will allow us to use it as a non-type template parameter.
7369 namespace {
7370
7371 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7372 ///
7373 /// See its documentation for details.
7374 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7375   if (Mask.size() != Args.size())
7376     return false;
7377   for (int i = 0, e = Mask.size(); i < e; ++i) {
7378     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7379     if (Mask[i] != -1 && Mask[i] != *Args[i])
7380       return false;
7381   }
7382   return true;
7383 }
7384
7385 } // namespace
7386
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
///
/// The comparison itself is carried out by \c isShuffleEquivalentImpl, which
/// is supplied below as the implementation template argument.
// NOTE(review): the empty-brace initializer is presumably required because
// this is a const object of class type -- confirm against VariadicFunction1.
static const VariadicFunction1<
    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7399
7400 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7401 ///
7402 /// This helper function produces an 8-bit shuffle immediate corresponding to
7403 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7404 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7405 /// example.
7406 ///
7407 /// NB: We rely heavily on "undef" masks preserving the input lane.
7408 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7409                                           SelectionDAG &DAG) {
7410   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7411   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7412   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7413   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7414   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7415
7416   unsigned Imm = 0;
7417   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7418   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7419   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7420   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7421   return DAG.getConstant(Imm, MVT::i8);
7422 }
7423
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is in fact a blend.
///
/// A mask is a blend when every defined element stays in its own position and
/// merely chooses between \p V1 (index i) and \p V2 (index i + Size). If any
/// element would have to move, an empty SDValue is returned. Otherwise the
/// result is a node of type \p VT, built from X86ISD::BLENDI or, for the
/// byte-granular 256-bit case, from ISD::VSELECT.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const X86Subtarget *Subtarget,
                                         SelectionDAG &DAG) {

  // Verify that this is a pure blend and build the per-element immediate: bit
  // i is set when element i is taken from V2.
  unsigned BlendMask = 0;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= Size) {
      if (Mask[i] != i + Size)
        return SDValue(); // Shuffled V2 input!
      BlendMask |= 1u << i;
      continue;
    }
    if (Mask[i] >= 0 && Mask[i] != i)
      return SDValue(); // Shuffled V1 input!
  }
  // NOTE: the cases below rely on their order and on deliberate fallthrough;
  // do not reorder them.
  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    // Floating point types use the per-element immediate directly.
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    // FALLTHROUGH
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget->hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale =  VT.getScalarSizeInBits() / 32;
      BlendMask = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= Size)
          for (int j = 0; j < Scale; ++j)
            BlendMask |= 1u << (i * Scale + j);

      // Blend as a vector of 32-bit integers of the same total width, then
      // cast the result back to the requested type.
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
      return DAG.getNode(ISD::BITCAST, DL, VT,
                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                                     DAG.getConstant(BlendMask, MVT::i8)));
    }
    // FALLTHROUGH
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    // Each element of VT covers Scale consecutive 16-bit words.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = 0;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      if (Mask[i] >= Size)
        for (int j = 0; j < Scale; ++j)
          BlendMask |= 1u << (i * Scale + j);

    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
    return DAG.getNode(ISD::BITCAST, DL, VT,
                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                   DAG.getConstant(BlendMask, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      // Repeated-mask entries of 16 or more denote elements taken from V2.
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 16)
          BlendMask |= 1u << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, MVT::i8));
    }
  }
    // The two lanes blend differently, so fall back to the byte-wise select.
    // FALLTHROUGH
  case MVT::v32i8: {
    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
    // Scale the blend by the number of bytes per element.
    int Scale =  VT.getScalarSizeInBits() / 8;
    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    //
    // Concretely: each byte of an element taken from V1 gets -1, each byte of
    // an element taken from V2 gets 0, and bytes of undef elements stay undef.
    SDValue VSELECTMask[32];
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask[Scale * i + j] =
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);

    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
    return DAG.getNode(
        ISD::BITCAST, DL, VT,
        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
                    V1, V2));
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
7548
7549 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7550 /// unblended shuffles followed by an unshuffled blend.
7551 ///
7552 /// This matches the extremely common pattern for handling combined
7553 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7554 /// operations.
7555 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7556                                                           SDValue V1,
7557                                                           SDValue V2,
7558                                                           ArrayRef<int> Mask,
7559                                                           SelectionDAG &DAG) {
7560   // Shuffle the input elements into the desired positions in V1 and V2 and
7561   // blend them together.
7562   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7563   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7564   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7565   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7566     if (Mask[i] >= 0 && Mask[i] < Size) {
7567       V1Mask[i] = Mask[i];
7568       BlendMask[i] = i;
7569     } else if (Mask[i] >= Size) {
7570       V2Mask[i] = Mask[i] - Size;
7571       BlendMask[i] = i + Size;
7572     }
7573
7574   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7575   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7576   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7577 }
7578
7579 /// \brief Try to lower a vector shuffle as a byte rotation.
7580 ///
7581 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7582 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7583 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7584 /// try to generically lower a vector shuffle through such an pattern. It
7585 /// does not check for the profitability of lowering either as PALIGNR or
7586 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7587 /// This matches shuffle vectors that look like:
7588 ///
7589 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7590 ///
7591 /// Essentially it concatenates V1 and V2, shifts right by some number of
7592 /// elements, and takes the low elements as the result. Note that while this is
7593 /// specified as a *right shift* because x86 is little-endian, it is a *left
7594 /// rotate* of the vector lanes.
7595 ///
7596 /// Note that this only handles 128-bit vector widths currently.
7597 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7598                                               SDValue V2,
7599                                               ArrayRef<int> Mask,
7600                                               const X86Subtarget *Subtarget,
7601                                               SelectionDAG &DAG) {
7602   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7603
7604   // We need to detect various ways of spelling a rotation:
7605   //   [11, 12, 13, 14, 15,  0,  1,  2]
7606   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7607   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7608   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7609   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7610   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7611   int Rotation = 0;
7612   SDValue Lo, Hi;
7613   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7614     if (Mask[i] == -1)
7615       continue;
7616     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7617
7618     // Based on the mod-Size value of this mask element determine where
7619     // a rotated vector would have started.
7620     int StartIdx = i - (Mask[i] % Size);
7621     if (StartIdx == 0)
7622       // The identity rotation isn't interesting, stop.
7623       return SDValue();
7624
7625     // If we found the tail of a vector the rotation must be the missing
7626     // front. If we found the head of a vector, it must be how much of the head.
7627     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7628
7629     if (Rotation == 0)
7630       Rotation = CandidateRotation;
7631     else if (Rotation != CandidateRotation)
7632       // The rotations don't match, so we can't match this mask.
7633       return SDValue();
7634
7635     // Compute which value this mask is pointing at.
7636     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7637
7638     // Compute which of the two target values this index should be assigned to.
7639     // This reflects whether the high elements are remaining or the low elements
7640     // are remaining.
7641     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7642
7643     // Either set up this value if we've not encountered it before, or check
7644     // that it remains consistent.
7645     if (!TargetV)
7646       TargetV = MaskV;
7647     else if (TargetV != MaskV)
7648       // This may be a rotation, but it pulls from the inputs in some
7649       // unsupported interleaving.
7650       return SDValue();
7651   }
7652
7653   // Check that we successfully analyzed the mask, and normalize the results.
7654   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7655   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7656   if (!Lo)
7657     Lo = Hi;
7658   else if (!Hi)
7659     Hi = Lo;
7660
7661   assert(VT.getSizeInBits() == 128 &&
7662          "Rotate-based lowering only supports 128-bit lowering!");
7663   assert(Mask.size() <= 16 &&
7664          "Can shuffle at most 16 bytes in a 128-bit vector!");
7665
7666   // The actual rotate instruction rotates bytes, so we need to scale the
7667   // rotation based on how many bytes are in the vector.
7668   int Scale = 16 / Mask.size();
7669
7670   // SSSE3 targets can use the palignr instruction
7671   if (Subtarget->hasSSSE3()) {
7672     // Cast the inputs to v16i8 to match PALIGNR.
7673     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7674     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7675
7676     return DAG.getNode(ISD::BITCAST, DL, VT,
7677                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7678                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7679   }
7680
7681   // Default SSE2 implementation
7682   int LoByteShift = 16 - Rotation * Scale;
7683   int HiByteShift = Rotation * Scale;
7684
7685   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7686   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7687   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7688
7689   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7690                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7691   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7692                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7693   return DAG.getNode(ISD::BITCAST, DL, VT,
7694                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7695 }
7696
7697 /// \brief Compute whether each element of a shuffle is zeroable.
7698 ///
7699 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7700 /// Either it is an undef element in the shuffle mask, the element of the input
7701 /// referenced is undef, or the element of the input referenced is known to be
7702 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7703 /// as many lanes with this technique as possible to simplify the remaining
7704 /// shuffle.
7705 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7706                                                      SDValue V1, SDValue V2) {
7707   SmallBitVector Zeroable(Mask.size(), false);
7708
7709   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7710   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7711
7712   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7713     int M = Mask[i];
7714     // Handle the easy cases.
7715     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7716       Zeroable[i] = true;
7717       continue;
7718     }
7719
7720     // If this is an index into a build_vector node, dig out the input value and
7721     // use it.
7722     SDValue V = M < Size ? V1 : V2;
7723     if (V.getOpcode() != ISD::BUILD_VECTOR)
7724       continue;
7725
7726     SDValue Input = V.getOperand(M % Size);
7727     // The UNDEF opcode check really should be dead code here, but not quite
7728     // worth asserting on (it isn't invalid, just unexpected).
7729     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7730       Zeroable[i] = true;
7731   }
7732
7733   return Zeroable;
7734 }
7735
7736 /// \brief Try to emit a bitmask instruction for a shuffle.
7737 ///
7738 /// This handles cases where we can model a blend exactly as a bitmask due to
7739 /// one of the inputs being zeroable.
7740 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7741                                            SDValue V2, ArrayRef<int> Mask,
7742                                            SelectionDAG &DAG) {
7743   MVT EltVT = VT.getScalarType();
7744   int NumEltBits = EltVT.getSizeInBits();
7745   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7746   SDValue Zero = DAG.getConstant(0, IntEltVT);
7747   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7748   if (EltVT.isFloatingPoint()) {
7749     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7750     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7751   }
7752   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7753   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7754   SDValue V;
7755   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7756     if (Zeroable[i])
7757       continue;
7758     if (Mask[i] % Size != i)
7759       return SDValue(); // Not a blend.
7760     if (!V)
7761       V = Mask[i] < Size ? V1 : V2;
7762     else if (V != (Mask[i] < Size ? V1 : V2))
7763       return SDValue(); // Can only let one input through the mask.
7764
7765     VMaskOps[i] = AllOnes;
7766   }
7767   if (!V)
7768     return SDValue(); // No non-zeroable elements!
7769
7770   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7771   V = DAG.getNode(VT.isFloatingPoint() ? X86ISD::FAND : ISD::AND, DL, VT, V,
7772                   VMask);
7773   return V;
7774 }
7775
7776 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7777 ///
7778 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7779 /// byte-shift instructions. The mask must consist of a shifted sequential
7780 /// shuffle from one of the input vectors and zeroable elements for the
7781 /// remaining 'shifted in' elements.
7782 ///
7783 /// Note that this only handles 128-bit vector widths currently.
7784 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7785                                              SDValue V2, ArrayRef<int> Mask,
7786                                              SelectionDAG &DAG) {
7787   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7788
7789   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7790
7791   int Size = Mask.size();
7792   int Scale = 16 / Size;
7793
7794   for (int Shift = 1; Shift < Size; Shift++) {
7795     int ByteShift = Shift * Scale;
7796
7797     // PSRLDQ : (little-endian) right byte shift
7798     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7799     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7800     // [  1, 2, -1, -1, -1, -1, zz, zz]
7801     bool ZeroableRight = true;
7802     for (int i = Size - Shift; i < Size; i++) {
7803       ZeroableRight &= Zeroable[i];
7804     }
7805
7806     if (ZeroableRight) {
7807       bool ValidShiftRight1 =
7808           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7809       bool ValidShiftRight2 =
7810           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7811
7812       if (ValidShiftRight1 || ValidShiftRight2) {
7813         // Cast the inputs to v2i64 to match PSRLDQ.
7814         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7815         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7816         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7817                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7818         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7819       }
7820     }
7821
7822     // PSLLDQ : (little-endian) left byte shift
7823     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7824     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7825     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7826     bool ZeroableLeft = true;
7827     for (int i = 0; i < Shift; i++) {
7828       ZeroableLeft &= Zeroable[i];
7829     }
7830
7831     if (ZeroableLeft) {
7832       bool ValidShiftLeft1 =
7833           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7834       bool ValidShiftLeft2 =
7835           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7836
7837       if (ValidShiftLeft1 || ValidShiftLeft2) {
7838         // Cast the inputs to v2i64 to match PSLLDQ.
7839         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7840         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7841         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7842                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7843         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7844       }
7845     }
7846   }
7847
7848   return SDValue();
7849 }
7850
7851 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7852 ///
7853 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7854 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7855 /// elements from one of the input vectors shuffled to the left or right
7856 /// with zeroable elements 'shifted in'.
7857 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7858                                             SDValue V2, ArrayRef<int> Mask,
7859                                             SelectionDAG &DAG) {
7860   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7861
7862   int Size = Mask.size();
7863   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7864
7865   // PSRL : (little-endian) right bit shift.
7866   // [  1, zz,  3, zz]
7867   // [ -1, -1,  7, zz]
7868   // PSHL : (little-endian) left bit shift.
7869   // [ zz, 0, zz,  2 ]
7870   // [ -1, 4, zz, -1 ]
7871   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7872     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7873     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7874     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7875            "Illegal integer vector type");
7876
7877     bool MatchLeft = true, MatchRight = true;
7878     for (int i = 0; i != Size; i += Scale) {
7879       for (int j = 0; j != Shift; j++) {
7880         MatchLeft &= Zeroable[i + j];
7881       }
7882       for (int j = Scale - Shift; j != Scale; j++) {
7883         MatchRight &= Zeroable[i + j];
7884       }
7885     }
7886     if (!(MatchLeft || MatchRight))
7887       return SDValue();
7888
7889     bool MatchV1 = true, MatchV2 = true;
7890     for (int i = 0; i != Size; i += Scale) {
7891       unsigned Pos = MatchLeft ? i + Shift : i;
7892       unsigned Low = MatchLeft ? i : i + Shift;
7893       unsigned Len = Scale - Shift;
7894       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7895       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7896     }
7897     if (!(MatchV1 || MatchV2))
7898       return SDValue();
7899
7900     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7901     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7902     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7903     SDValue V = MatchV1 ? V1 : V2;
7904     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7905     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7906     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7907   };
7908
7909   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7910   // keep doubling the size of the integer elements up to that. We can
7911   // then shift the elements of the integer vector by whole multiples of
7912   // their width within the elements of the larger integer vector. Test each
7913   // multiple to see if we can find a match with the moved element indices
7914   // and that the shifted in elements are all zeroable.
7915   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7916     for (int Shift = 1; Shift != Scale; Shift++)
7917       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7918         return BitShift;
7919
7920   // no match
7921   return SDValue();
7922 }
7923
7924 /// \brief Lower a vector shuffle as a zero or any extension.
7925 ///
7926 /// Given a specific number of elements, element bit width, and extension
7927 /// stride, produce either a zero or any extension based on the available
7928 /// features of the subtarget.
7929 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7930     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7931     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7932   assert(Scale > 1 && "Need a scale to extend.");
7933   int NumElements = VT.getVectorNumElements();
7934   int EltBits = VT.getScalarSizeInBits();
7935   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7936          "Only 8, 16, and 32 bit elements can be extended.");
7937   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7938
7939   // Found a valid zext mask! Try various lowering strategies based on the
7940   // input type and available ISA extensions.
7941   if (Subtarget->hasSSE41()) {
7942     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7943                                  NumElements / Scale);
7944     return DAG.getNode(ISD::BITCAST, DL, VT,
7945                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7946   }
7947
7948   // For any extends we can cheat for larger element sizes and use shuffle
7949   // instructions that can fold with a load and/or copy.
7950   if (AnyExt && EltBits == 32) {
7951     int PSHUFDMask[4] = {0, -1, 1, -1};
7952     return DAG.getNode(
7953         ISD::BITCAST, DL, VT,
7954         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7955                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7956                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7957   }
7958   if (AnyExt && EltBits == 16 && Scale > 2) {
7959     int PSHUFDMask[4] = {0, -1, 0, -1};
7960     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7961                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7962                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7963     int PSHUFHWMask[4] = {1, -1, -1, -1};
7964     return DAG.getNode(
7965         ISD::BITCAST, DL, VT,
7966         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7967                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7968                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7969   }
7970
7971   // If this would require more than 2 unpack instructions to expand, use
7972   // pshufb when available. We can only use more than 2 unpack instructions
7973   // when zero extending i8 elements which also makes it easier to use pshufb.
7974   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7975     assert(NumElements == 16 && "Unexpected byte vector width!");
7976     SDValue PSHUFBMask[16];
7977     for (int i = 0; i < 16; ++i)
7978       PSHUFBMask[i] =
7979           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7980     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7981     return DAG.getNode(ISD::BITCAST, DL, VT,
7982                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7983                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7984                                                MVT::v16i8, PSHUFBMask)));
7985   }
7986
7987   // Otherwise emit a sequence of unpacks.
7988   do {
7989     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7990     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7991                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7992     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7993     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7994     Scale /= 2;
7995     EltBits *= 2;
7996     NumElements /= 2;
7997   } while (Scale > 1);
7998   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7999 }
8000
8001 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8002 ///
8003 /// This routine will try to do everything in its power to cleverly lower
8004 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8005 /// check for the profitability of this lowering,  it tries to aggressively
8006 /// match this pattern. It will use all of the micro-architectural details it
8007 /// can to emit an efficient lowering. It handles both blends with all-zero
8008 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8009 /// masking out later).
8010 ///
8011 /// The reason we have dedicated lowering for zext-style shuffles is that they
8012 /// are both incredibly common and often quite performance sensitive.
8013 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8014     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8015     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8016   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8017
8018   int Bits = VT.getSizeInBits();
8019   int NumElements = VT.getVectorNumElements();
8020   assert(VT.getScalarSizeInBits() <= 32 &&
8021          "Exceeds 32-bit integer zero extension limit");
8022   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8023
8024   // Define a helper function to check a particular ext-scale and lower to it if
8025   // valid.
8026   auto Lower = [&](int Scale) -> SDValue {
8027     SDValue InputV;
8028     bool AnyExt = true;
8029     for (int i = 0; i < NumElements; ++i) {
8030       if (Mask[i] == -1)
8031         continue; // Valid anywhere but doesn't tell us anything.
8032       if (i % Scale != 0) {
8033         // Each of the extended elements need to be zeroable.
8034         if (!Zeroable[i])
8035           return SDValue();
8036
8037         // We no longer are in the anyext case.
8038         AnyExt = false;
8039         continue;
8040       }
8041
8042       // Each of the base elements needs to be consecutive indices into the
8043       // same input vector.
8044       SDValue V = Mask[i] < NumElements ? V1 : V2;
8045       if (!InputV)
8046         InputV = V;
8047       else if (InputV != V)
8048         return SDValue(); // Flip-flopping inputs.
8049
8050       if (Mask[i] % NumElements != i / Scale)
8051         return SDValue(); // Non-consecutive strided elements.
8052     }
8053
8054     // If we fail to find an input, we have a zero-shuffle which should always
8055     // have already been handled.
8056     // FIXME: Maybe handle this here in case during blending we end up with one?
8057     if (!InputV)
8058       return SDValue();
8059
8060     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8061         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8062   };
8063
8064   // The widest scale possible for extending is to a 64-bit integer.
8065   assert(Bits % 64 == 0 &&
8066          "The number of bits in a vector must be divisible by 64 on x86!");
8067   int NumExtElements = Bits / 64;
8068
8069   // Each iteration, try extending the elements half as much, but into twice as
8070   // many elements.
8071   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8072     assert(NumElements % NumExtElements == 0 &&
8073            "The input vector size must be divisible by the extended size.");
8074     if (SDValue V = Lower(NumElements / NumExtElements))
8075       return V;
8076   }
8077
8078   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8079   if (Bits != 128)
8080     return SDValue();
8081
8082   // Returns one of the source operands if the shuffle can be reduced to a
8083   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8084   auto CanZExtLowHalf = [&]() {
8085     for (int i = NumElements / 2; i != NumElements; i++)
8086       if (!Zeroable[i])
8087         return SDValue();
8088     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8089       return V1;
8090     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8091       return V2;
8092     return SDValue();
8093   };
8094
8095   if (SDValue V = CanZExtLowHalf()) {
8096     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8097     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8098     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8099   }
8100
8101   // No viable ext lowering found.
8102   return SDValue();
8103 }
8104
8105 /// \brief Try to get a scalar value for a specific element of a vector.
8106 ///
8107 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8108 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8109                                               SelectionDAG &DAG) {
8110   MVT VT = V.getSimpleValueType();
8111   MVT EltVT = VT.getVectorElementType();
8112   while (V.getOpcode() == ISD::BITCAST)
8113     V = V.getOperand(0);
8114   // If the bitcasts shift the element size, we can't extract an equivalent
8115   // element from it.
8116   MVT NewVT = V.getSimpleValueType();
8117   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8118     return SDValue();
8119
8120   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8121       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8122     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8123
8124   return SDValue();
8125 }
8126
8127 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8128 ///
8129 /// This is particularly important because the set of instructions varies
8130 /// significantly based on whether the operand is a load or not.
8131 static bool isShuffleFoldableLoad(SDValue V) {
8132   while (V.getOpcode() == ISD::BITCAST)
8133     V = V.getOperand(0);
8134
8135   return ISD::isNON_EXTLoad(V.getNode());
8136 }
8137
8138 /// \brief Try to lower insertion of a single element into a zero vector.
8139 ///
8140 /// This is a common pattern that we have especially efficient patterns to lower
8141 /// across all subtarget feature sets.
8142 static SDValue lowerVectorShuffleAsElementInsertion(
8143     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8144     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8145   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8146   MVT ExtVT = VT;
8147   MVT EltVT = VT.getVectorElementType();
8148
8149   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8150                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8151                 Mask.begin();
8152   bool IsV1Zeroable = true;
8153   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8154     if (i != V2Index && !Zeroable[i]) {
8155       IsV1Zeroable = false;
8156       break;
8157     }
8158
8159   // Check for a single input from a SCALAR_TO_VECTOR node.
8160   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8161   // all the smarts here sunk into that routine. However, the current
8162   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8163   // vector shuffle lowering is dead.
8164   if (SDValue V2S = getScalarValueForVectorElement(
8165           V2, Mask[V2Index] - Mask.size(), DAG)) {
8166     // We need to zext the scalar if it is smaller than an i32.
8167     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8168     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8169       // Using zext to expand a narrow element won't work for non-zero
8170       // insertions.
8171       if (!IsV1Zeroable)
8172         return SDValue();
8173
8174       // Zero-extend directly to i32.
8175       ExtVT = MVT::v4i32;
8176       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8177     }
8178     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8179   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8180              EltVT == MVT::i16) {
8181     // Either not inserting from the low element of the input or the input
8182     // element size is too small to use VZEXT_MOVL to clear the high bits.
8183     return SDValue();
8184   }
8185
8186   if (!IsV1Zeroable) {
8187     // If V1 can't be treated as a zero vector we have fewer options to lower
8188     // this. We can't support integer vectors or non-zero targets cheaply, and
8189     // the V1 elements can't be permuted in any way.
8190     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8191     if (!VT.isFloatingPoint() || V2Index != 0)
8192       return SDValue();
8193     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8194     V1Mask[V2Index] = -1;
8195     if (!isNoopShuffleMask(V1Mask))
8196       return SDValue();
8197     // This is essentially a special case blend operation, but if we have
8198     // general purpose blend operations, they are always faster. Bail and let
8199     // the rest of the lowering handle these as blends.
8200     if (Subtarget->hasSSE41())
8201       return SDValue();
8202
8203     // Otherwise, use MOVSD or MOVSS.
8204     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8205            "Only two types of floating point element types to handle!");
8206     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8207                        ExtVT, V1, V2);
8208   }
8209
8210   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8211   if (ExtVT != VT)
8212     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8213
8214   if (V2Index != 0) {
8215     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8216     // the desired position. Otherwise it is more efficient to do a vector
8217     // shift left. We know that we can do a vector shift left because all
8218     // the inputs are zero.
8219     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8220       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8221       V2Shuffle[V2Index] = 0;
8222       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8223     } else {
8224       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8225       V2 = DAG.getNode(
8226           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8227           DAG.getConstant(
8228               V2Index * EltVT.getSizeInBits(),
8229               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8230       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8231     }
8232   }
8233   return V2;
8234 }
8235
8236 /// \brief Try to lower broadcast of a single element.
8237 ///
8238 /// For convenience, this code also bundles all of the subtarget feature set
8239 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8240 /// a convenient way to factor it out.
8241 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8242                                              ArrayRef<int> Mask,
8243                                              const X86Subtarget *Subtarget,
8244                                              SelectionDAG &DAG) {
8245   if (!Subtarget->hasAVX())
8246     return SDValue();
8247   if (VT.isInteger() && !Subtarget->hasAVX2())
8248     return SDValue();
8249
8250   // Check that the mask is a broadcast.
8251   int BroadcastIdx = -1;
8252   for (int M : Mask)
8253     if (M >= 0 && BroadcastIdx == -1)
8254       BroadcastIdx = M;
8255     else if (M >= 0 && M != BroadcastIdx)
8256       return SDValue();
8257
8258   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8259                                             "a sorted mask where the broadcast "
8260                                             "comes from V1.");
8261
8262   // Go up the chain of (vector) values to try and find a scalar load that
8263   // we can combine with the broadcast.
8264   for (;;) {
8265     switch (V.getOpcode()) {
8266     case ISD::CONCAT_VECTORS: {
8267       int OperandSize = Mask.size() / V.getNumOperands();
8268       V = V.getOperand(BroadcastIdx / OperandSize);
8269       BroadcastIdx %= OperandSize;
8270       continue;
8271     }
8272
8273     case ISD::INSERT_SUBVECTOR: {
8274       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8275       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8276       if (!ConstantIdx)
8277         break;
8278
8279       int BeginIdx = (int)ConstantIdx->getZExtValue();
8280       int EndIdx =
8281           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8282       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8283         BroadcastIdx -= BeginIdx;
8284         V = VInner;
8285       } else {
8286         V = VOuter;
8287       }
8288       continue;
8289     }
8290     }
8291     break;
8292   }
8293
8294   // Check if this is a broadcast of a scalar. We special case lowering
8295   // for scalars so that we can more effectively fold with loads.
8296   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8297       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8298     V = V.getOperand(BroadcastIdx);
8299
8300     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8301     // AVX2.
8302     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8303       return SDValue();
8304   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8305     // We can't broadcast from a vector register w/o AVX2, and we can only
8306     // broadcast from the zero-element of a vector register.
8307     return SDValue();
8308   }
8309
8310   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8311 }
8312
8313 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8314 // INSERTPS when the V1 elements are already in the correct locations
8315 // because otherwise we can just always use two SHUFPS instructions which
8316 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8317 // perform INSERTPS if a single V1 element is out of place and all V2
8318 // elements are zeroable.
8319 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8320                                             ArrayRef<int> Mask,
8321                                             SelectionDAG &DAG) {
8322   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8323   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8324   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8325   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8326
8327   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8328
8329   unsigned ZMask = 0;
8330   int V1DstIndex = -1;
8331   int V2DstIndex = -1;
8332   bool V1UsedInPlace = false;
8333
8334   for (int i = 0; i < 4; i++) {
8335     // Synthesize a zero mask from the zeroable elements (includes undefs).
8336     if (Zeroable[i]) {
8337       ZMask |= 1 << i;
8338       continue;
8339     }
8340
8341     // Flag if we use any V1 inputs in place.
8342     if (i == Mask[i]) {
8343       V1UsedInPlace = true;
8344       continue;
8345     }
8346
8347     // We can only insert a single non-zeroable element.
8348     if (V1DstIndex != -1 || V2DstIndex != -1)
8349       return SDValue();
8350
8351     if (Mask[i] < 4) {
8352       // V1 input out of place for insertion.
8353       V1DstIndex = i;
8354     } else {
8355       // V2 input for insertion.
8356       V2DstIndex = i;
8357     }
8358   }
8359
8360   // Don't bother if we have no (non-zeroable) element for insertion.
8361   if (V1DstIndex == -1 && V2DstIndex == -1)
8362     return SDValue();
8363
8364   // Determine element insertion src/dst indices. The src index is from the
8365   // start of the inserted vector, not the start of the concatenated vector.
8366   unsigned V2SrcIndex = 0;
8367   if (V1DstIndex != -1) {
8368     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8369     // and don't use the original V2 at all.
8370     V2SrcIndex = Mask[V1DstIndex];
8371     V2DstIndex = V1DstIndex;
8372     V2 = V1;
8373   } else {
8374     V2SrcIndex = Mask[V2DstIndex] - 4;
8375   }
8376
8377   // If no V1 inputs are used in place, then the result is created only from
8378   // the zero mask and the V2 insertion - so remove V1 dependency.
8379   if (!V1UsedInPlace)
8380     V1 = DAG.getUNDEF(MVT::v4f32);
8381
8382   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8383   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8384
8385   // Insert the V2 element into the desired position.
8386   SDLoc DL(Op);
8387   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8388                      DAG.getConstant(InsertPSMask, MVT::i8));
8389 }
8390
8391 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8392 ///
8393 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8394 /// support for floating point shuffles but not integer shuffles. These
8395 /// instructions will incur a domain crossing penalty on some chips though so
8396 /// it is better to avoid lowering through this for integer vectors where
8397 /// possible.
8398 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8399                                        const X86Subtarget *Subtarget,
8400                                        SelectionDAG &DAG) {
8401   SDLoc DL(Op);
8402   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8403   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8404   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8405   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8406   ArrayRef<int> Mask = SVOp->getMask();
8407   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8408
8409   if (isSingleInputShuffleMask(Mask)) {
8410     // Use low duplicate instructions for masks that match their pattern.
8411     if (Subtarget->hasSSE3())
8412       if (isShuffleEquivalent(Mask, 0, 0))
8413         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8414
8415     // Straight shuffle of a single input vector. Simulate this by using the
8416     // single input as both of the "inputs" to this instruction..
8417     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8418
8419     if (Subtarget->hasAVX()) {
8420       // If we have AVX, we can use VPERMILPS which will allow folding a load
8421       // into the shuffle.
8422       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8423                          DAG.getConstant(SHUFPDMask, MVT::i8));
8424     }
8425
8426     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8427                        DAG.getConstant(SHUFPDMask, MVT::i8));
8428   }
8429   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8430   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8431
8432   // Use dedicated unpack instructions for masks that match their pattern.
8433   if (isShuffleEquivalent(Mask, 0, 2))
8434     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8435   if (isShuffleEquivalent(Mask, 1, 3))
8436     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8437
8438   // If we have a single input, insert that into V1 if we can do so cheaply.
8439   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8440     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8441             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8442       return Insertion;
8443     // Try inverting the insertion since for v2 masks it is easy to do and we
8444     // can't reliably sort the mask one way or the other.
8445     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8446                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8447     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8448             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8449       return Insertion;
8450   }
8451
8452   // Try to use one of the special instruction patterns to handle two common
8453   // blend patterns if a zero-blend above didn't work.
8454   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8455     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8456       // We can either use a special instruction to load over the low double or
8457       // to move just the low double.
8458       return DAG.getNode(
8459           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8460           DL, MVT::v2f64, V2,
8461           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8462
8463   if (Subtarget->hasSSE41())
8464     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8465                                                   Subtarget, DAG))
8466       return Blend;
8467
8468   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8469   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8470                      DAG.getConstant(SHUFPDMask, MVT::i8));
8471 }
8472
8473 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8474 ///
8475 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8476 /// the integer unit to minimize domain crossing penalties. However, for blends
8477 /// it falls back to the floating point shuffle operation with appropriate bit
8478 /// casting.
8479 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8480                                        const X86Subtarget *Subtarget,
8481                                        SelectionDAG &DAG) {
8482   SDLoc DL(Op);
8483   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8484   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8485   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8486   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8487   ArrayRef<int> Mask = SVOp->getMask();
8488   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8489
8490   if (isSingleInputShuffleMask(Mask)) {
8491     // Check for being able to broadcast a single element.
8492     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8493                                                           Mask, Subtarget, DAG))
8494       return Broadcast;
8495
8496     // Straight shuffle of a single input vector. For everything from SSE2
8497     // onward this has a single fast instruction with no scary immediates.
8498     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8499     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8500     int WidenedMask[4] = {
8501         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8502         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8503     return DAG.getNode(
8504         ISD::BITCAST, DL, MVT::v2i64,
8505         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8506                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8507   }
8508
8509   // Try to use byte shift instructions.
8510   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8511           DL, MVT::v2i64, V1, V2, Mask, DAG))
8512     return Shift;
8513
8514   // If we have a single input from V2 insert that into V1 if we can do so
8515   // cheaply.
8516   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8517     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8518             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8519       return Insertion;
8520     // Try inverting the insertion since for v2 masks it is easy to do and we
8521     // can't reliably sort the mask one way or the other.
8522     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8523                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8524     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8525             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8526       return Insertion;
8527   }
8528
8529   // Use dedicated unpack instructions for masks that match their pattern.
8530   if (isShuffleEquivalent(Mask, 0, 2))
8531     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8532   if (isShuffleEquivalent(Mask, 1, 3))
8533     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8534
8535   if (Subtarget->hasSSE41())
8536     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8537                                                   Subtarget, DAG))
8538       return Blend;
8539
8540   // Try to use byte rotation instructions.
8541   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8542   if (Subtarget->hasSSSE3())
8543     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8544             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8545       return Rotate;
8546
8547   // We implement this with SHUFPD which is pretty lame because it will likely
8548   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8549   // However, all the alternatives are still more cycles and newer chips don't
8550   // have this problem. It would be really nice if x86 had better shuffles here.
8551   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8552   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8553   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8554                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8555 }
8556
8557 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8558 ///
8559 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8560 /// It makes no assumptions about whether this is the *best* lowering, it simply
8561 /// uses it.
8562 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8563                                             ArrayRef<int> Mask, SDValue V1,
8564                                             SDValue V2, SelectionDAG &DAG) {
8565   SDValue LowV = V1, HighV = V2;
8566   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8567
8568   int NumV2Elements =
8569       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8570
8571   if (NumV2Elements == 1) {
8572     int V2Index =
8573         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8574         Mask.begin();
8575
8576     // Compute the index adjacent to V2Index and in the same half by toggling
8577     // the low bit.
8578     int V2AdjIndex = V2Index ^ 1;
8579
8580     if (Mask[V2AdjIndex] == -1) {
8581       // Handles all the cases where we have a single V2 element and an undef.
8582       // This will only ever happen in the high lanes because we commute the
8583       // vector otherwise.
8584       if (V2Index < 2)
8585         std::swap(LowV, HighV);
8586       NewMask[V2Index] -= 4;
8587     } else {
8588       // Handle the case where the V2 element ends up adjacent to a V1 element.
8589       // To make this work, blend them together as the first step.
8590       int V1Index = V2AdjIndex;
8591       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8592       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8593                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8594
8595       // Now proceed to reconstruct the final blend as we have the necessary
8596       // high or low half formed.
8597       if (V2Index < 2) {
8598         LowV = V2;
8599         HighV = V1;
8600       } else {
8601         HighV = V2;
8602       }
8603       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8604       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8605     }
8606   } else if (NumV2Elements == 2) {
8607     if (Mask[0] < 4 && Mask[1] < 4) {
8608       // Handle the easy case where we have V1 in the low lanes and V2 in the
8609       // high lanes.
8610       NewMask[2] -= 4;
8611       NewMask[3] -= 4;
8612     } else if (Mask[2] < 4 && Mask[3] < 4) {
8613       // We also handle the reversed case because this utility may get called
8614       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8615       // arrange things in the right direction.
8616       NewMask[0] -= 4;
8617       NewMask[1] -= 4;
8618       HighV = V1;
8619       LowV = V2;
8620     } else {
8621       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8622       // trying to place elements directly, just blend them and set up the final
8623       // shuffle to place them.
8624
8625       // The first two blend mask elements are for V1, the second two are for
8626       // V2.
8627       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8628                           Mask[2] < 4 ? Mask[2] : Mask[3],
8629                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8630                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8631       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8632                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8633
8634       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8635       // a blend.
8636       LowV = HighV = V1;
8637       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8638       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8639       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8640       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8641     }
8642   }
8643   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8644                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8645 }
8646
8647 /// \brief Lower 4-lane 32-bit floating point shuffles.
8648 ///
8649 /// Uses instructions exclusively from the floating point unit to minimize
8650 /// domain crossing penalties, as these are sufficient to implement all v4f32
8651 /// shuffles.
8652 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8653                                        const X86Subtarget *Subtarget,
8654                                        SelectionDAG &DAG) {
8655   SDLoc DL(Op);
8656   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8657   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8658   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8659   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8660   ArrayRef<int> Mask = SVOp->getMask();
8661   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8662
8663   int NumV2Elements =
8664       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8665
8666   if (NumV2Elements == 0) {
8667     // Check for being able to broadcast a single element.
8668     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8669                                                           Mask, Subtarget, DAG))
8670       return Broadcast;
8671
8672     // Use even/odd duplicate instructions for masks that match their pattern.
8673     if (Subtarget->hasSSE3()) {
8674       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8675         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8676       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8677         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8678     }
8679
8680     if (Subtarget->hasAVX()) {
8681       // If we have AVX, we can use VPERMILPS which will allow folding a load
8682       // into the shuffle.
8683       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8684                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8685     }
8686
8687     // Otherwise, use a straight shuffle of a single input vector. We pass the
8688     // input vector to both operands to simulate this with a SHUFPS.
8689     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8690                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8691   }
8692
8693   // Use dedicated unpack instructions for masks that match their pattern.
8694   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8695     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8696   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8697     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8698
8699   // There are special ways we can lower some single-element blends. However, we
8700   // have custom ways we can lower more complex single-element blends below that
8701   // we defer to if both this and BLENDPS fail to match, so restrict this to
8702   // when the V2 input is targeting element 0 of the mask -- that is the fast
8703   // case here.
8704   if (NumV2Elements == 1 && Mask[0] >= 4)
8705     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8706                                                          Mask, Subtarget, DAG))
8707       return V;
8708
8709   if (Subtarget->hasSSE41()) {
8710     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8711                                                   Subtarget, DAG))
8712       return Blend;
8713
8714     // Use INSERTPS if we can complete the shuffle efficiently.
8715     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8716       return V;
8717   }
8718
8719   // Otherwise fall back to a SHUFPS lowering strategy.
8720   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8721 }
8722
8723 /// \brief Lower 4-lane i32 vector shuffles.
8724 ///
8725 /// We try to handle these with integer-domain shuffles where we can, but for
8726 /// blends we use the floating point domain blend instructions.
8727 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8728                                        const X86Subtarget *Subtarget,
8729                                        SelectionDAG &DAG) {
8730   SDLoc DL(Op);
8731   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8732   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8733   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8734   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8735   ArrayRef<int> Mask = SVOp->getMask();
8736   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8737
8738   // Whenever we can lower this as a zext, that instruction is strictly faster
8739   // than any alternative. It also allows us to fold memory operands into the
8740   // shuffle in many cases.
8741   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8742                                                          Mask, Subtarget, DAG))
8743     return ZExt;
8744
8745   int NumV2Elements =
8746       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8747
8748   if (NumV2Elements == 0) {
8749     // Check for being able to broadcast a single element.
8750     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8751                                                           Mask, Subtarget, DAG))
8752       return Broadcast;
8753
8754     // Straight shuffle of a single input vector. For everything from SSE2
8755     // onward this has a single fast instruction with no scary immediates.
8756     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8757     // but we aren't actually going to use the UNPCK instruction because doing
8758     // so prevents folding a load into this instruction or making a copy.
8759     const int UnpackLoMask[] = {0, 0, 1, 1};
8760     const int UnpackHiMask[] = {2, 2, 3, 3};
8761     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8762       Mask = UnpackLoMask;
8763     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8764       Mask = UnpackHiMask;
8765
8766     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8767                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8768   }
8769
8770   // Try to use bit shift instructions.
8771   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8772           DL, MVT::v4i32, V1, V2, Mask, DAG))
8773     return Shift;
8774
8775   // Try to use byte shift instructions.
8776   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8777           DL, MVT::v4i32, V1, V2, Mask, DAG))
8778     return Shift;
8779
8780   // There are special ways we can lower some single-element blends.
8781   if (NumV2Elements == 1)
8782     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8783                                                          Mask, Subtarget, DAG))
8784       return V;
8785
8786   if (Subtarget->hasSSE41())
8787     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8788                                                   Subtarget, DAG))
8789       return Blend;
8790
8791   if (SDValue Masked =
8792           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8793     return Masked;
8794
8795   // Use dedicated unpack instructions for masks that match their pattern.
8796   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8797     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8798   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8799     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8800
8801   // Try to use byte rotation instructions.
8802   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8803   if (Subtarget->hasSSSE3())
8804     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8805             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8806       return Rotate;
8807
8808   // We implement this with SHUFPS because it can blend from two vectors.
8809   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8810   // up the inputs, bypassing domain shift penalties that we would encur if we
8811   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8812   // relevant.
8813   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8814                      DAG.getVectorShuffle(
8815                          MVT::v4f32, DL,
8816                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8817                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8818 }
8819
8820 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8821 /// shuffle lowering, and the most complex part.
8822 ///
8823 /// The lowering strategy is to try to form pairs of input lanes which are
8824 /// targeted at the same half of the final vector, and then use a dword shuffle
8825 /// to place them onto the right half, and finally unpack the paired lanes into
8826 /// their final position.
8827 ///
8828 /// The exact breakdown of how to form these dword pairs and align them on the
8829 /// correct sides is really tricky. See the comments within the function for
8830 /// more of the details.
8831 static SDValue lowerV8I16SingleInputVectorShuffle(
8832     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8833     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8834   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8835   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8836   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8837
8838   SmallVector<int, 4> LoInputs;
8839   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8840                [](int M) { return M >= 0; });
8841   std::sort(LoInputs.begin(), LoInputs.end());
8842   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8843   SmallVector<int, 4> HiInputs;
8844   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8845                [](int M) { return M >= 0; });
8846   std::sort(HiInputs.begin(), HiInputs.end());
8847   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8848   int NumLToL =
8849       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8850   int NumHToL = LoInputs.size() - NumLToL;
8851   int NumLToH =
8852       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8853   int NumHToH = HiInputs.size() - NumLToH;
8854   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8855   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8856   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8857   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8858
8859   // Check for being able to broadcast a single element.
8860   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8861                                                         Mask, Subtarget, DAG))
8862     return Broadcast;
8863
8864   // Try to use bit shift instructions.
8865   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8866           DL, MVT::v8i16, V, V, Mask, DAG))
8867     return Shift;
8868
8869   // Try to use byte shift instructions.
8870   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8871           DL, MVT::v8i16, V, V, Mask, DAG))
8872     return Shift;
8873
8874   // Use dedicated unpack instructions for masks that match their pattern.
8875   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8876     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8877   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8878     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8879
8880   // Try to use byte rotation instructions.
8881   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8882           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8883     return Rotate;
8884
8885   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8886   // such inputs we can swap two of the dwords across the half mark and end up
8887   // with <=2 inputs to each half in each half. Once there, we can fall through
8888   // to the generic code below. For example:
8889   //
8890   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8891   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8892   //
8893   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8894   // and an existing 2-into-2 on the other half. In this case we may have to
8895   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8896   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8897   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8898   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8899   // half than the one we target for fixing) will be fixed when we re-enter this
8900   // path. We will also combine away any sequence of PSHUFD instructions that
8901   // result into a single instruction. Here is an example of the tricky case:
8902   //
8903   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8904   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8905   //
8906   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8907   //
8908   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8909   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8910   //
8911   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8912   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8913   //
8914   // The result is fine to be handled by the generic logic.
8915   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8916                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8917                           int AOffset, int BOffset) {
8918     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8919            "Must call this with A having 3 or 1 inputs from the A half.");
8920     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8921            "Must call this with B having 1 or 3 inputs from the B half.");
8922     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8923            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8924
8925     // Compute the index of dword with only one word among the three inputs in
8926     // a half by taking the sum of the half with three inputs and subtracting
8927     // the sum of the actual three inputs. The difference is the remaining
8928     // slot.
8929     int ADWord, BDWord;
8930     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8931     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8932     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8933     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8934     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8935     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8936     int TripleNonInputIdx =
8937         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8938     TripleDWord = TripleNonInputIdx / 2;
8939
8940     // We use xor with one to compute the adjacent DWord to whichever one the
8941     // OneInput is in.
8942     OneInputDWord = (OneInput / 2) ^ 1;
8943
8944     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8945     // and BToA inputs. If there is also such a problem with the BToB and AToB
8946     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8947     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8948     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8949     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8950       // Compute how many inputs will be flipped by swapping these DWords. We
8951       // need
8952       // to balance this to ensure we don't form a 3-1 shuffle in the other
8953       // half.
8954       int NumFlippedAToBInputs =
8955           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8956           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8957       int NumFlippedBToBInputs =
8958           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8959           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8960       if ((NumFlippedAToBInputs == 1 &&
8961            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8962           (NumFlippedBToBInputs == 1 &&
8963            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8964         // We choose whether to fix the A half or B half based on whether that
8965         // half has zero flipped inputs. At zero, we may not be able to fix it
8966         // with that half. We also bias towards fixing the B half because that
8967         // will more commonly be the high half, and we have to bias one way.
8968         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8969                                                        ArrayRef<int> Inputs) {
8970           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8971           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8972                                          PinnedIdx ^ 1) != Inputs.end();
8973           // Determine whether the free index is in the flipped dword or the
8974           // unflipped dword based on where the pinned index is. We use this bit
8975           // in an xor to conditionally select the adjacent dword.
8976           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8977           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8978                                              FixFreeIdx) != Inputs.end();
8979           if (IsFixIdxInput == IsFixFreeIdxInput)
8980             FixFreeIdx += 1;
8981           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8982                                         FixFreeIdx) != Inputs.end();
8983           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8984                  "We need to be changing the number of flipped inputs!");
8985           int PSHUFHalfMask[] = {0, 1, 2, 3};
8986           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8987           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8988                           MVT::v8i16, V,
8989                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8990
8991           for (int &M : Mask)
8992             if (M != -1 && M == FixIdx)
8993               M = FixFreeIdx;
8994             else if (M != -1 && M == FixFreeIdx)
8995               M = FixIdx;
8996         };
8997         if (NumFlippedBToBInputs != 0) {
8998           int BPinnedIdx =
8999               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9000           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9001         } else {
9002           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9003           int APinnedIdx =
9004               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9005           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9006         }
9007       }
9008     }
9009
9010     int PSHUFDMask[] = {0, 1, 2, 3};
9011     PSHUFDMask[ADWord] = BDWord;
9012     PSHUFDMask[BDWord] = ADWord;
9013     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9014                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9015                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9016                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9017
9018     // Adjust the mask to match the new locations of A and B.
9019     for (int &M : Mask)
9020       if (M != -1 && M/2 == ADWord)
9021         M = 2 * BDWord + M % 2;
9022       else if (M != -1 && M/2 == BDWord)
9023         M = 2 * ADWord + M % 2;
9024
9025     // Recurse back into this routine to re-compute state now that this isn't
9026     // a 3 and 1 problem.
9027     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9028                                 Mask);
9029   };
9030   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9031     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9032   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9033     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9034
9035   // At this point there are at most two inputs to the low and high halves from
9036   // each half. That means the inputs can always be grouped into dwords and
9037   // those dwords can then be moved to the correct half with a dword shuffle.
9038   // We use at most one low and one high word shuffle to collect these paired
9039   // inputs into dwords, and finally a dword shuffle to place them.
9040   int PSHUFLMask[4] = {-1, -1, -1, -1};
9041   int PSHUFHMask[4] = {-1, -1, -1, -1};
9042   int PSHUFDMask[4] = {-1, -1, -1, -1};
9043
9044   // First fix the masks for all the inputs that are staying in their
9045   // original halves. This will then dictate the targets of the cross-half
9046   // shuffles.
9047   auto fixInPlaceInputs =
9048       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9049                     MutableArrayRef<int> SourceHalfMask,
9050                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9051     if (InPlaceInputs.empty())
9052       return;
9053     if (InPlaceInputs.size() == 1) {
9054       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9055           InPlaceInputs[0] - HalfOffset;
9056       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9057       return;
9058     }
9059     if (IncomingInputs.empty()) {
9060       // Just fix all of the in place inputs.
9061       for (int Input : InPlaceInputs) {
9062         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9063         PSHUFDMask[Input / 2] = Input / 2;
9064       }
9065       return;
9066     }
9067
9068     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9069     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9070         InPlaceInputs[0] - HalfOffset;
9071     // Put the second input next to the first so that they are packed into
9072     // a dword. We find the adjacent index by toggling the low bit.
9073     int AdjIndex = InPlaceInputs[0] ^ 1;
9074     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9075     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9076     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9077   };
9078   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9079   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9080
9081   // Now gather the cross-half inputs and place them into a free dword of
9082   // their target half.
9083   // FIXME: This operation could almost certainly be simplified dramatically to
9084   // look more like the 3-1 fixing operation.
9085   auto moveInputsToRightHalf = [&PSHUFDMask](
9086       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9087       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9088       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9089       int DestOffset) {
9090     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9091       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9092     };
9093     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9094                                                int Word) {
9095       int LowWord = Word & ~1;
9096       int HighWord = Word | 1;
9097       return isWordClobbered(SourceHalfMask, LowWord) ||
9098              isWordClobbered(SourceHalfMask, HighWord);
9099     };
9100
9101     if (IncomingInputs.empty())
9102       return;
9103
9104     if (ExistingInputs.empty()) {
9105       // Map any dwords with inputs from them into the right half.
9106       for (int Input : IncomingInputs) {
9107         // If the source half mask maps over the inputs, turn those into
9108         // swaps and use the swapped lane.
9109         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9110           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9111             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9112                 Input - SourceOffset;
9113             // We have to swap the uses in our half mask in one sweep.
9114             for (int &M : HalfMask)
9115               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9116                 M = Input;
9117               else if (M == Input)
9118                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9119           } else {
9120             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9121                        Input - SourceOffset &&
9122                    "Previous placement doesn't match!");
9123           }
9124           // Note that this correctly re-maps both when we do a swap and when
9125           // we observe the other side of the swap above. We rely on that to
9126           // avoid swapping the members of the input list directly.
9127           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9128         }
9129
9130         // Map the input's dword into the correct half.
9131         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9132           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9133         else
9134           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9135                      Input / 2 &&
9136                  "Previous placement doesn't match!");
9137       }
9138
9139       // And just directly shift any other-half mask elements to be same-half
9140       // as we will have mirrored the dword containing the element into the
9141       // same position within that half.
9142       for (int &M : HalfMask)
9143         if (M >= SourceOffset && M < SourceOffset + 4) {
9144           M = M - SourceOffset + DestOffset;
9145           assert(M >= 0 && "This should never wrap below zero!");
9146         }
9147       return;
9148     }
9149
9150     // Ensure we have the input in a viable dword of its current half. This
9151     // is particularly tricky because the original position may be clobbered
9152     // by inputs being moved and *staying* in that half.
9153     if (IncomingInputs.size() == 1) {
9154       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9155         int InputFixed = std::find(std::begin(SourceHalfMask),
9156                                    std::end(SourceHalfMask), -1) -
9157                          std::begin(SourceHalfMask) + SourceOffset;
9158         SourceHalfMask[InputFixed - SourceOffset] =
9159             IncomingInputs[0] - SourceOffset;
9160         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9161                      InputFixed);
9162         IncomingInputs[0] = InputFixed;
9163       }
9164     } else if (IncomingInputs.size() == 2) {
9165       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9166           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9167         // We have two non-adjacent or clobbered inputs we need to extract from
9168         // the source half. To do this, we need to map them into some adjacent
9169         // dword slot in the source mask.
9170         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9171                               IncomingInputs[1] - SourceOffset};
9172
9173         // If there is a free slot in the source half mask adjacent to one of
9174         // the inputs, place the other input in it. We use (Index XOR 1) to
9175         // compute an adjacent index.
9176         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9177             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9178           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9179           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9180           InputsFixed[1] = InputsFixed[0] ^ 1;
9181         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9182                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9183           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9184           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9185           InputsFixed[0] = InputsFixed[1] ^ 1;
9186         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9187                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9188           // The two inputs are in the same DWord but it is clobbered and the
9189           // adjacent DWord isn't used at all. Move both inputs to the free
9190           // slot.
9191           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9192           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9193           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9194           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9195         } else {
9196           // The only way we hit this point is if there is no clobbering
9197           // (because there are no off-half inputs to this half) and there is no
9198           // free slot adjacent to one of the inputs. In this case, we have to
9199           // swap an input with a non-input.
9200           for (int i = 0; i < 4; ++i)
9201             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9202                    "We can't handle any clobbers here!");
9203           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9204                  "Cannot have adjacent inputs here!");
9205
9206           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9207           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9208
9209           // We also have to update the final source mask in this case because
9210           // it may need to undo the above swap.
9211           for (int &M : FinalSourceHalfMask)
9212             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9213               M = InputsFixed[1] + SourceOffset;
9214             else if (M == InputsFixed[1] + SourceOffset)
9215               M = (InputsFixed[0] ^ 1) + SourceOffset;
9216
9217           InputsFixed[1] = InputsFixed[0] ^ 1;
9218         }
9219
9220         // Point everything at the fixed inputs.
9221         for (int &M : HalfMask)
9222           if (M == IncomingInputs[0])
9223             M = InputsFixed[0] + SourceOffset;
9224           else if (M == IncomingInputs[1])
9225             M = InputsFixed[1] + SourceOffset;
9226
9227         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9228         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9229       }
9230     } else {
9231       llvm_unreachable("Unhandled input size!");
9232     }
9233
9234     // Now hoist the DWord down to the right half.
9235     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9236     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9237     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9238     for (int &M : HalfMask)
9239       for (int Input : IncomingInputs)
9240         if (M == Input)
9241           M = FreeDWord * 2 + Input % 2;
9242   };
9243   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9244                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9245   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9246                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9247
9248   // Now enact all the shuffles we've computed to move the inputs into their
9249   // target half.
9250   if (!isNoopShuffleMask(PSHUFLMask))
9251     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9252                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9253   if (!isNoopShuffleMask(PSHUFHMask))
9254     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9255                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9256   if (!isNoopShuffleMask(PSHUFDMask))
9257     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9258                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9259                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9260                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9261
9262   // At this point, each half should contain all its inputs, and we can then
9263   // just shuffle them into their final position.
9264   assert(std::count_if(LoMask.begin(), LoMask.end(),
9265                        [](int M) { return M >= 4; }) == 0 &&
9266          "Failed to lift all the high half inputs to the low mask!");
9267   assert(std::count_if(HiMask.begin(), HiMask.end(),
9268                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9269          "Failed to lift all the low half inputs to the high mask!");
9270
9271   // Do a half shuffle for the low mask.
9272   if (!isNoopShuffleMask(LoMask))
9273     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9274                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9275
9276   // Do a half shuffle with the high mask after shifting its values down.
9277   for (int &M : HiMask)
9278     if (M >= 0)
9279       M -= 4;
9280   if (!isNoopShuffleMask(HiMask))
9281     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9282                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9283
9284   return V;
9285 }
9286
9287 /// \brief Detect whether the mask pattern should be lowered through
9288 /// interleaving.
9289 ///
9290 /// This essentially tests whether viewing the mask as an interleaving of two
9291 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9292 /// lowering it through interleaving is a significantly better strategy.
9293 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9294   int NumEvenInputs[2] = {0, 0};
9295   int NumOddInputs[2] = {0, 0};
9296   int NumLoInputs[2] = {0, 0};
9297   int NumHiInputs[2] = {0, 0};
9298   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9299     if (Mask[i] < 0)
9300       continue;
9301
9302     int InputIdx = Mask[i] >= Size;
9303
9304     if (i < Size / 2)
9305       ++NumLoInputs[InputIdx];
9306     else
9307       ++NumHiInputs[InputIdx];
9308
9309     if ((i % 2) == 0)
9310       ++NumEvenInputs[InputIdx];
9311     else
9312       ++NumOddInputs[InputIdx];
9313   }
9314
9315   // The minimum number of cross-input results for both the interleaved and
9316   // split cases. If interleaving results in fewer cross-input results, return
9317   // true.
9318   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9319                                     NumEvenInputs[0] + NumOddInputs[1]);
9320   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9321                               NumLoInputs[0] + NumHiInputs[1]);
9322   return InterleavedCrosses < SplitCrosses;
9323 }
9324
9325 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9326 ///
9327 /// This strategy only works when the inputs from each vector fit into a single
9328 /// half of that vector, and generally there are not so many inputs as to leave
9329 /// the in-place shuffles required highly constrained (and thus expensive). It
9330 /// shifts all the inputs into a single side of both input vectors and then
9331 /// uses an unpack to interleave these inputs in a single vector. At that
9332 /// point, we will fall back on the generic single input shuffle lowering.
     ///
     /// \param DL Debug location attached to every node created here.
     /// \param V1 First v8i16 input; mask values 0-7 refer to its elements.
     /// \param V2 Second v8i16 input; mask values 8-15 refer to its elements.
     /// \param Mask The 8-element blend mask. It is rewritten in place while the
     /// inputs are repositioned. The asserts below require between one and three
     /// elements from each input and at most four elements in total.
     /// \param Subtarget Not referenced in the body.
     /// \param DAG The DAG in which the new nodes are created.
     /// \returns A single-input v8i16 shuffle of the UNPCKL or UNPCKH of the two
     /// repositioned inputs.
9333 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9334                                                  SDValue V2,
9335                                                  MutableArrayRef<int> Mask,
9336                                                  const X86Subtarget *Subtarget,
9337                                                  SelectionDAG &DAG) {
9338   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9339   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
       // Bucket the mask *positions* (not the element values) by which input and
       // which half of that input they read from. Undef entries land in no bucket.
9340   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9341   for (int i = 0; i < 8; ++i)
9342     if (Mask[i] >= 0 && Mask[i] < 4)
9343       LoV1Inputs.push_back(i);
9344     else if (Mask[i] >= 4 && Mask[i] < 8)
9345       HiV1Inputs.push_back(i);
9346     else if (Mask[i] >= 8 && Mask[i] < 12)
9347       LoV2Inputs.push_back(i);
9348     else if (Mask[i] >= 12)
9349       HiV2Inputs.push_back(i);
9350
9351   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9352   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
       // The counts are only consumed by the asserts; the casts silence
       // unused-variable warnings when asserts are compiled out.
9353   (void)NumV1Inputs;
9354   (void)NumV2Inputs;
9355   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9356   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9357   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9358
       // Gather everything into whichever half already holds at least as many
       // inputs as the other; ties go to the low half.
9359   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9360                      HiV1Inputs.size() + HiV2Inputs.size();
9361
       // Shuffle V so that every element the blend reads from it sits in the
       // target half (low if MoveToLo, high otherwise), and update Mask to refer
       // to the new positions. LoInputs/HiInputs are the mask positions reading
       // V's low/high half, and MaskOffset is the base of V's elements within the
       // blend mask (0 for V1, 8 for V2). Returns V unchanged when nothing has to
       // move.
9362   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9363                               ArrayRef<int> HiInputs, bool MoveToLo,
9364                               int MaskOffset) {
         // "Good" inputs are already in the target half; "bad" ones must move.
9365     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9366     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9367     if (BadInputs.empty())
9368       return V;
9369
         // Single-input shuffle mask applied to V; MoveOffset is the index of
         // the first element of the target half.
9370     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9371     int MoveOffset = MoveToLo ? 0 : 4;
9372
9373     if (GoodInputs.empty()) {
           // Everything is in the wrong half: move each element to the same
           // offset within the target half.
9374       for (int BadInput : BadInputs) {
9375         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9376         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9377       }
9378     } else {
9379       if (GoodInputs.size() == 2) {
9380         // With exactly two good inputs, pack them into the first dword of the
9381         // target half.
9382         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9383         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9384         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9385         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9386       } else {
9387         // Otherwise pin the good inputs.
9388         for (int GoodInput : GoodInputs)
9389           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9390       }
9391
9392       if (BadInputs.size() == 2) {
9393         // If we have two bad inputs then there may be either one or two good
9394         // inputs fixed in place. Find a fixed input, and then find the *other*
9395         // two adjacent indices by using modular arithmetic.
9396         int GoodMaskIdx =
9397             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9398                          [](int M) { return M >= 0; }) -
9399             std::begin(MoveMask);
             // Round the fixed index down to the start of its dword within the
             // half, step to the next dword, and wrap around within the
             // four-element half.
9400         int MoveMaskIdx =
9401             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9402         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9403         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9404         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9405         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9406         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9407         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9408       } else {
9409         assert(BadInputs.size() == 1 && "All sizes handled");
             // Drop the single bad input into the first still-unassigned slot
             // found at or after the start of the target half.
9410         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9411                                     std::end(MoveMask), -1) -
9412                           std::begin(MoveMask);
9413         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9414         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9415       }
9416     }
9417
         // Emit the repositioning as a single-input shuffle of V.
9418     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9419                                 MoveMask);
9420   };
9421   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9422                         /*MaskOffset*/ 0);
9423   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9424                         /*MaskOffset*/ 8);
9425
9426   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9427   // cross-half traffic in the final shuffle.
9428
9429   // Munge the mask to be a single-input mask after the unpack merges the
9430   // results.
       // Every defined entry now refers to the chosen half of its input, so
       // M % 4 is its offset within that half and M / 8 is 0 for V1 and 1 for V2.
       // The unpack alternates V1 and V2 elements, which puts offset k of V1 at
       // 2k and offset k of V2 at 2k + 1.
9431   for (int &M : Mask)
9432     if (M != -1)
9433       M = 2 * (M % 4) + (M / 8);
9434
9435   return DAG.getVectorShuffle(
9436       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9437                                   DL, MVT::v8i16, V1, V2),
9438       DAG.getUNDEF(MVT::v8i16), Mask);
9439 }
9440
9441 /// \brief Generic lowering of 8-lane i16 shuffles.
9442 ///
9443 /// This handles both single-input shuffles and combined shuffle/blends with
9444 /// two inputs. The single input shuffles are immediately delegated to
9445 /// a dedicated lowering routine.
9446 ///
9447 /// The blends are lowered in one of three fundamental ways. If there are few
9448 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9449 /// of the input is significantly cheaper when lowered as an interleaving of
9450 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9451 /// halves of the inputs separately (making them have relatively few inputs)
9452 /// and then concatenate them.
     ///
     /// Before reaching those three fallbacks, a series of cheaper special-case
     /// lowerings is attempted in a fixed order; the first one that produces
     /// a value is returned.
     ///
     /// \param Op The shuffle node being lowered; it must be
     /// a ShuffleVectorSDNode of type v8i16.
     /// \param V1 First v8i16 operand; mask values 0-7 select from it.
     /// \param V2 Second v8i16 operand; mask values 8 and above select from it.
     /// \param Subtarget Queried for SSE4.1 support and forwarded to the helper
     /// lowerings.
     /// \param DAG The DAG in which the replacement nodes are created.
     /// \returns The lowered value.
9453 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9454                                        const X86Subtarget *Subtarget,
9455                                        SelectionDAG &DAG) {
9456   SDLoc DL(Op);
9457   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9458   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9459   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9460   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9461   ArrayRef<int> OrigMask = SVOp->getMask();
       // Work on a local, mutable copy of the mask: some of the routines called
       // below take a MutableArrayRef and rewrite the mask in place.
9462   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9463                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9464   MutableArrayRef<int> Mask(MaskStorage);
9465
9466   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9467
9468   // Whenever we can lower this as a zext, that instruction is strictly faster
9469   // than any alternative.
9470   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9471           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9472     return ZExt;
9473
       // Mask values 0-7 read from V1 and values of 8 or more read from V2;
       // negative (undef) entries are counted for neither.
9474   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9475   auto isV2 = [](int M) { return M >= 8; };
9476
9477   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9478   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9479
       // Nothing is read from V2: use the dedicated single-input routine.
9480   if (NumV2Inputs == 0)
9481     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9482
9483   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9484                             "to be V1-input shuffles.");
9485
9486   // Try to use bit shift instructions.
9487   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9488           DL, MVT::v8i16, V1, V2, Mask, DAG))
9489     return Shift;
9490
9491   // Try to use byte shift instructions.
9492   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9493           DL, MVT::v8i16, V1, V2, Mask, DAG))
9494     return Shift;
9495
9496   // There are special ways we can lower some single-element blends.
9497   if (NumV2Inputs == 1)
9498     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9499                                                          Mask, Subtarget, DAG))
9500       return V;
9501
       // The blend lowering is only attempted when SSE4.1 is available.
9502   if (Subtarget->hasSSE41())
9503     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9504                                                   Subtarget, DAG))
9505       return Blend;
9506
       // Try to lower the shuffle as a bit mask.
9507   if (SDValue Masked =
9508           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9509     return Masked;
9510
9511   // Use dedicated unpack instructions for masks that match their pattern.
9512   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9513     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9514   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9515     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9516
9517   // Try to use byte rotation instructions.
9518   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9519           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9520     return Rotate;
9521
       // With at most four defined elements in total, use the basic UNPCK-based
       // blend.
9522   if (NumV1Inputs + NumV2Inputs <= 4)
9523     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9524
9525   // Check whether an interleaving lowering is likely to be more efficient.
9526   // This isn't perfect but it is a strong heuristic that tends to work well on
9527   // the kinds of shuffles that show up in practice.
9528   //
9529   // FIXME: Handle 1x, 2x, and 4x interleaving.
9530   if (shouldLowerAsInterleaving(Mask)) {
9531     // FIXME: Figure out whether we should pack these into the low or high
9532     // halves.
9533
         // EMask gathers the even result lanes and OMask the odd result lanes
         // into elements 0-3 of their respective shuffles; elements 4-7 stay
         // undef. UNPCKL then alternates the low elements of the two shuffles,
         // restoring the original lane order.
9534     int EMask[8], OMask[8];
9535     for (int i = 0; i < 4; ++i) {
9536       EMask[i] = Mask[2*i];
9537       OMask[i] = Mask[2*i + 1];
9538       EMask[i + 4] = -1;
9539       OMask[i + 4] = -1;
9540     }
9541
9542     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9543     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9544
9545     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9546   }
9547
       // Final fallback: build the low four and the high four result elements
       // as two separate blends, each placed in elements 0-3 of its own vector
       // with the upper elements left undef.
9548   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9549   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9550
9551   for (int i = 0; i < 4; ++i) {
9552     LoBlendMask[i] = Mask[i];
9553     HiBlendMask[i] = Mask[i + 4];
9554   }
9555
9556   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9557   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
       // Reinterpret both as v2i64 so that a single UNPCKL can join the low
       // 64-bit element of each into the final vector.
9558   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9559   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9560
9561   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9562                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9563 }
9564
9565 /// \brief Check whether a compaction lowering can be done by dropping even
9566 /// elements and compute how many times even elements must be dropped.
9567 ///
9568 /// This handles shuffles which take every Nth element where N is a power of
9569 /// two. Example shuffle masks:
9570 ///
9571 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9572 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9573 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9574 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9575 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9576 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9577 ///
9578 /// Any of these lanes can of course be undef.
9579 ///
9580 /// This routine only supports N <= 3.
9581 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9582 /// for larger N.
9583 ///
9584 /// \returns N above, or the number of times even elements must be dropped if
9585 /// there is such a number. Otherwise returns zero.
9586 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9587   // Figure out whether we're looping over two inputs or just one.
9588   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9589
9590   // The modulus for the shuffle vector entries is based on whether this is
9591   // a single input or not.
9592   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9593   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9594          "We should only be called with masks with a power-of-2 size!");
9595
9596   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9597
9598   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9599   // and 2^3 simultaneously. This is because we may have ambiguity with
9600   // partially undef inputs.
9601   bool ViableForN[3] = {true, true, true};
9602
9603   for (int i = 0, e = Mask.size(); i < e; ++i) {
9604     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9605     // want.
9606     if (Mask[i] == -1)
9607       continue;
9608
9609     bool IsAnyViable = false;
9610     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9611       if (ViableForN[j]) {
9612         uint64_t N = j + 1;
9613
9614         // The shuffle mask must be equal to (i * 2^N) % M.
9615         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9616           IsAnyViable = true;
9617         else
9618           ViableForN[j] = false;
9619       }
9620     // Early exit if we exhaust the possible powers of two.
9621     if (!IsAnyViable)
9622       break;
9623   }
9624
9625   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9626     if (ViableForN[j])
9627       return j + 1;
9628
9629   // Return 0 as there is no viable power of two.
9630   return 0;
9631 }
9632
9633 /// \brief Generic lowering of v16i8 shuffles.
9634 ///
9635 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9636 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9637 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9638 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9639 /// back together.
9640 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9641                                        const X86Subtarget *Subtarget,
9642                                        SelectionDAG &DAG) {
9643   SDLoc DL(Op);
9644   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9645   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9646   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9647   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9648   ArrayRef<int> OrigMask = SVOp->getMask();
9649   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9650
9651   // Try to use bit shift instructions.
9652   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9653           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9654     return Shift;
9655
9656   // Try to use byte shift instructions.
9657   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9658           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9659     return Shift;
9660
9661   // Try to use byte rotation instructions.
9662   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9663           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9664     return Rotate;
9665
9666   // Try to use a zext lowering.
9667   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9668           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9669     return ZExt;
9670
9671   int MaskStorage[16] = {
9672       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9673       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9674       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9675       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9676   MutableArrayRef<int> Mask(MaskStorage);
9677   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9678   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9679
9680   int NumV2Elements =
9681       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9682
9683   // For single-input shuffles, there are some nicer lowering tricks we can use.
9684   if (NumV2Elements == 0) {
9685     // Check for being able to broadcast a single element.
9686     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9687                                                           Mask, Subtarget, DAG))
9688       return Broadcast;
9689
9690     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9691     // Notably, this handles splat and partial-splat shuffles more efficiently.
9692     // However, it only makes sense if the pre-duplication shuffle simplifies
9693     // things significantly. Currently, this means we need to be able to
9694     // express the pre-duplication shuffle as an i16 shuffle.
9695     //
9696     // FIXME: We should check for other patterns which can be widened into an
9697     // i16 shuffle as well.
9698     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9699       for (int i = 0; i < 16; i += 2)
9700         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9701           return false;
9702
9703       return true;
9704     };
9705     auto tryToWidenViaDuplication = [&]() -> SDValue {
9706       if (!canWidenViaDuplication(Mask))
9707         return SDValue();
9708       SmallVector<int, 4> LoInputs;
9709       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9710                    [](int M) { return M >= 0 && M < 8; });
9711       std::sort(LoInputs.begin(), LoInputs.end());
9712       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9713                      LoInputs.end());
9714       SmallVector<int, 4> HiInputs;
9715       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9716                    [](int M) { return M >= 8; });
9717       std::sort(HiInputs.begin(), HiInputs.end());
9718       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9719                      HiInputs.end());
9720
9721       bool TargetLo = LoInputs.size() >= HiInputs.size();
9722       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9723       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9724
9725       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9726       SmallDenseMap<int, int, 8> LaneMap;
9727       for (int I : InPlaceInputs) {
9728         PreDupI16Shuffle[I/2] = I/2;
9729         LaneMap[I] = I;
9730       }
9731       int j = TargetLo ? 0 : 4, je = j + 4;
9732       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9733         // Check if j is already a shuffle of this input. This happens when
9734         // there are two adjacent bytes after we move the low one.
9735         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9736           // If we haven't yet mapped the input, search for a slot into which
9737           // we can map it.
9738           while (j < je && PreDupI16Shuffle[j] != -1)
9739             ++j;
9740
9741           if (j == je)
9742             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9743             return SDValue();
9744
9745           // Map this input with the i16 shuffle.
9746           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9747         }
9748
9749         // Update the lane map based on the mapping we ended up with.
9750         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9751       }
9752       V1 = DAG.getNode(
9753           ISD::BITCAST, DL, MVT::v16i8,
9754           DAG.getVectorShuffle(MVT::v8i16, DL,
9755                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9756                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9757
9758       // Unpack the bytes to form the i16s that will be shuffled into place.
9759       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9760                        MVT::v16i8, V1, V1);
9761
9762       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9763       for (int i = 0; i < 16; ++i)
9764         if (Mask[i] != -1) {
9765           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9766           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9767           if (PostDupI16Shuffle[i / 2] == -1)
9768             PostDupI16Shuffle[i / 2] = MappedMask;
9769           else
9770             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9771                    "Conflicting entrties in the original shuffle!");
9772         }
9773       return DAG.getNode(
9774           ISD::BITCAST, DL, MVT::v16i8,
9775           DAG.getVectorShuffle(MVT::v8i16, DL,
9776                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9777                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9778     };
9779     if (SDValue V = tryToWidenViaDuplication())
9780       return V;
9781   }
9782
9783   // Check whether an interleaving lowering is likely to be more efficient.
9784   // This isn't perfect but it is a strong heuristic that tends to work well on
9785   // the kinds of shuffles that show up in practice.
9786   //
9787   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9788   if (shouldLowerAsInterleaving(Mask)) {
9789     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9790       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9791     });
9792     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9793       return (M >= 8 && M < 16) || M >= 24;
9794     });
9795     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9796                      -1, -1, -1, -1, -1, -1, -1, -1};
9797     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9798                      -1, -1, -1, -1, -1, -1, -1, -1};
9799     bool UnpackLo = NumLoHalf >= NumHiHalf;
9800     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9801     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9802     for (int i = 0; i < 8; ++i) {
9803       TargetEMask[i] = Mask[2 * i];
9804       TargetOMask[i] = Mask[2 * i + 1];
9805     }
9806
9807     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9808     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9809
9810     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9811                        MVT::v16i8, Evens, Odds);
9812   }
9813
9814   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9815   // with PSHUFB. It is important to do this before we attempt to generate any
9816   // blends but after all of the single-input lowerings. If the single input
9817   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9818   // want to preserve that and we can DAG combine any longer sequences into
9819   // a PSHUFB in the end. But once we start blending from multiple inputs,
9820   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9821   // and there are *very* few patterns that would actually be faster than the
9822   // PSHUFB approach because of its ability to zero lanes.
9823   //
9824   // FIXME: The only exceptions to the above are blends which are exact
9825   // interleavings with direct instructions supporting them. We currently don't
9826   // handle those well here.
9827   if (Subtarget->hasSSSE3()) {
9828     SDValue V1Mask[16];
9829     SDValue V2Mask[16];
9830     bool V1InUse = false;
9831     bool V2InUse = false;
9832     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9833
9834     for (int i = 0; i < 16; ++i) {
9835       if (Mask[i] == -1) {
9836         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9837       } else {
9838         const int ZeroMask = 0x80;
9839         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9840         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9841         if (Zeroable[i])
9842           V1Idx = V2Idx = ZeroMask;
9843         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9844         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9845         V1InUse |= (ZeroMask != V1Idx);
9846         V2InUse |= (ZeroMask != V2Idx);
9847       }
9848     }
9849
9850     if (V1InUse)
9851       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9852                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9853     if (V2InUse)
9854       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9855                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9856
9857     // If we need shuffled inputs from both, blend the two.
9858     if (V1InUse && V2InUse)
9859       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9860     if (V1InUse)
9861       return V1; // Single inputs are easy.
9862     if (V2InUse)
9863       return V2; // Single inputs are easy.
9864     // Shuffling to a zeroable vector.
9865     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9866   }
9867
9868   // There are special ways we can lower some single-element blends.
9869   if (NumV2Elements == 1)
9870     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9871                                                          Mask, Subtarget, DAG))
9872       return V;
9873
9874   // Check whether a compaction lowering can be done. This handles shuffles
9875   // which take every Nth element for some even N. See the helper function for
9876   // details.
9877   //
9878   // We special case these as they can be particularly efficiently handled with
9879   // the PACKUSB instruction on x86 and they show up in common patterns of
9880   // rearranging bytes to truncate wide elements.
9881   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9882     // NumEvenDrops is the power of two stride of the elements. Another way of
9883     // thinking about it is that we need to drop the even elements this many
9884     // times to get the original input.
9885     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9886
9887     // First we need to zero all the dropped bytes.
9888     assert(NumEvenDrops <= 3 &&
9889            "No support for dropping even elements more than 3 times.");
9890     // We use the mask type to pick which bytes are preserved based on how many
9891     // elements are dropped.
9892     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9893     SDValue ByteClearMask =
9894         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9895                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9896     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9897     if (!IsSingleInput)
9898       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9899
9900     // Now pack things back together.
9901     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9902     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9903     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9904     for (int i = 1; i < NumEvenDrops; ++i) {
9905       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9906       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9907     }
9908
9909     return Result;
9910   }
9911
9912   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9913   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9914   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9915   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9916
9917   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9918                             MutableArrayRef<int> V1HalfBlendMask,
9919                             MutableArrayRef<int> V2HalfBlendMask) {
9920     for (int i = 0; i < 8; ++i)
9921       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9922         V1HalfBlendMask[i] = HalfMask[i];
9923         HalfMask[i] = i;
9924       } else if (HalfMask[i] >= 16) {
9925         V2HalfBlendMask[i] = HalfMask[i] - 16;
9926         HalfMask[i] = i + 8;
9927       }
9928   };
9929   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9930   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9931
9932   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9933
9934   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9935                              MutableArrayRef<int> HiBlendMask) {
9936     SDValue V1, V2;
9937     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9938     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9939     // i16s.
9940     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9941                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9942         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9943                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9944       // Use a mask to drop the high bytes.
9945       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9946       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9947                        DAG.getConstant(0x00FF, MVT::v8i16));
9948
9949       // This will be a single vector shuffle instead of a blend so nuke V2.
9950       V2 = DAG.getUNDEF(MVT::v8i16);
9951
9952       // Squash the masks to point directly into V1.
9953       for (int &M : LoBlendMask)
9954         if (M >= 0)
9955           M /= 2;
9956       for (int &M : HiBlendMask)
9957         if (M >= 0)
9958           M /= 2;
9959     } else {
9960       // Otherwise just unpack the low half of V into V1 and the high half into
9961       // V2 so that we can blend them as i16s.
9962       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9963                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9964       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9965                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9966     }
9967
9968     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9969     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9970     return std::make_pair(BlendedLo, BlendedHi);
9971   };
9972   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9973   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9974   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9975
9976   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9977   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9978
9979   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9980 }
9981
9982 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9983 ///
9984 /// This routine breaks down the specific type of 128-bit shuffle and
9985 /// dispatches to the lowering routines accordingly.
9986 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9987                                         MVT VT, const X86Subtarget *Subtarget,
9988                                         SelectionDAG &DAG) {
9989   switch (VT.SimpleTy) {
9990   case MVT::v2i64:
9991     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9992   case MVT::v2f64:
9993     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9994   case MVT::v4i32:
9995     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9996   case MVT::v4f32:
9997     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9998   case MVT::v8i16:
9999     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10000   case MVT::v16i8:
10001     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10002
10003   default:
10004     llvm_unreachable("Unimplemented!");
10005   }
10006 }
10007
10008 /// \brief Helper function to test whether a shuffle mask could be
10009 /// simplified by widening the elements being shuffled.
10010 ///
10011 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10012 /// leaves it in an unspecified state.
10013 ///
10014 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10015 /// shuffle masks. The latter have the special property of a '-2' representing
10016 /// a zero-ed lane of a vector.
10017 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10018                                     SmallVectorImpl<int> &WidenedMask) {
10019   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10020     // If both elements are undef, its trivial.
10021     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10022       WidenedMask.push_back(SM_SentinelUndef);
10023       continue;
10024     }
10025
10026     // Check for an undef mask and a mask value properly aligned to fit with
10027     // a pair of values. If we find such a case, use the non-undef mask's value.
10028     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10029       WidenedMask.push_back(Mask[i + 1] / 2);
10030       continue;
10031     }
10032     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10033       WidenedMask.push_back(Mask[i] / 2);
10034       continue;
10035     }
10036
10037     // When zeroing, we need to spread the zeroing across both lanes to widen.
10038     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10039       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10040           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10041         WidenedMask.push_back(SM_SentinelZero);
10042         continue;
10043       }
10044       return false;
10045     }
10046
10047     // Finally check if the two mask values are adjacent and aligned with
10048     // a pair.
10049     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10050       WidenedMask.push_back(Mask[i] / 2);
10051       continue;
10052     }
10053
10054     // Otherwise we can't safely widen the elements used in this shuffle.
10055     return false;
10056   }
10057   assert(WidenedMask.size() == Mask.size() / 2 &&
10058          "Incorrect size of mask after widening the elements!");
10059
10060   return true;
10061 }
10062
10063 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
10064 ///
10065 /// This routine just extracts two subvectors, shuffles them independently, and
10066 /// then concatenates them back together. This should work effectively with all
10067 /// AVX vector shuffle types.
10068 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10069                                           SDValue V2, ArrayRef<int> Mask,
10070                                           SelectionDAG &DAG) {
10071   assert(VT.getSizeInBits() >= 256 &&
10072          "Only for 256-bit or wider vector shuffles!");
10073   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10074   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10075
10076   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10077   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10078
10079   int NumElements = VT.getVectorNumElements();
10080   int SplitNumElements = NumElements / 2;
10081   MVT ScalarVT = VT.getScalarType();
10082   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10083
10084   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10085                              DAG.getIntPtrConstant(0));
10086   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10087                              DAG.getIntPtrConstant(SplitNumElements));
10088   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10089                              DAG.getIntPtrConstant(0));
10090   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10091                              DAG.getIntPtrConstant(SplitNumElements));
10092
10093   // Now create two 4-way blends of these half-width vectors.
10094   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10095     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10096     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10097     for (int i = 0; i < SplitNumElements; ++i) {
10098       int M = HalfMask[i];
10099       if (M >= NumElements) {
10100         if (M >= NumElements + SplitNumElements)
10101           UseHiV2 = true;
10102         else
10103           UseLoV2 = true;
10104         V2BlendMask.push_back(M - NumElements);
10105         V1BlendMask.push_back(-1);
10106         BlendMask.push_back(SplitNumElements + i);
10107       } else if (M >= 0) {
10108         if (M >= SplitNumElements)
10109           UseHiV1 = true;
10110         else
10111           UseLoV1 = true;
10112         V2BlendMask.push_back(-1);
10113         V1BlendMask.push_back(M);
10114         BlendMask.push_back(i);
10115       } else {
10116         V2BlendMask.push_back(-1);
10117         V1BlendMask.push_back(-1);
10118         BlendMask.push_back(-1);
10119       }
10120     }
10121
10122     // Because the lowering happens after all combining takes place, we need to
10123     // manually combine these blend masks as much as possible so that we create
10124     // a minimal number of high-level vector shuffle nodes.
10125
10126     // First try just blending the halves of V1 or V2.
10127     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10128       return DAG.getUNDEF(SplitVT);
10129     if (!UseLoV2 && !UseHiV2)
10130       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10131     if (!UseLoV1 && !UseHiV1)
10132       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10133
10134     SDValue V1Blend, V2Blend;
10135     if (UseLoV1 && UseHiV1) {
10136       V1Blend =
10137         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10138     } else {
10139       // We only use half of V1 so map the usage down into the final blend mask.
10140       V1Blend = UseLoV1 ? LoV1 : HiV1;
10141       for (int i = 0; i < SplitNumElements; ++i)
10142         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10143           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10144     }
10145     if (UseLoV2 && UseHiV2) {
10146       V2Blend =
10147         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10148     } else {
10149       // We only use half of V2 so map the usage down into the final blend mask.
10150       V2Blend = UseLoV2 ? LoV2 : HiV2;
10151       for (int i = 0; i < SplitNumElements; ++i)
10152         if (BlendMask[i] >= SplitNumElements)
10153           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10154     }
10155     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10156   };
10157   SDValue Lo = HalfBlend(LoMask);
10158   SDValue Hi = HalfBlend(HiMask);
10159   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10160 }
10161
10162 /// \brief Either split a vector in halves or decompose the shuffles and the
10163 /// blend.
10164 ///
10165 /// This is provided as a good fallback for many lowerings of non-single-input
10166 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10167 /// between splitting the shuffle into 128-bit components and stitching those
10168 /// back together vs. extracting the single-input shuffles and blending those
10169 /// results.
10170 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10171                                                 SDValue V2, ArrayRef<int> Mask,
10172                                                 SelectionDAG &DAG) {
10173   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10174                                             "lower single-input shuffles as it "
10175                                             "could then recurse on itself.");
10176   int Size = Mask.size();
10177
10178   // If this can be modeled as a broadcast of two elements followed by a blend,
10179   // prefer that lowering. This is especially important because broadcasts can
10180   // often fold with memory operands.
10181   auto DoBothBroadcast = [&] {
10182     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10183     for (int M : Mask)
10184       if (M >= Size) {
10185         if (V2BroadcastIdx == -1)
10186           V2BroadcastIdx = M - Size;
10187         else if (M - Size != V2BroadcastIdx)
10188           return false;
10189       } else if (M >= 0) {
10190         if (V1BroadcastIdx == -1)
10191           V1BroadcastIdx = M;
10192         else if (M != V1BroadcastIdx)
10193           return false;
10194       }
10195     return true;
10196   };
10197   if (DoBothBroadcast())
10198     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10199                                                       DAG);
10200
10201   // If the inputs all stem from a single 128-bit lane of each input, then we
10202   // split them rather than blending because the split will decompose to
10203   // unusually few instructions.
10204   int LaneCount = VT.getSizeInBits() / 128;
10205   int LaneSize = Size / LaneCount;
10206   SmallBitVector LaneInputs[2];
10207   LaneInputs[0].resize(LaneCount, false);
10208   LaneInputs[1].resize(LaneCount, false);
10209   for (int i = 0; i < Size; ++i)
10210     if (Mask[i] >= 0)
10211       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10212   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10213     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10214
10215   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10216   // that the decomposed single-input shuffles don't end up here.
10217   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10218 }
10219
10220 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10221 /// a permutation and blend of those lanes.
10222 ///
10223 /// This essentially blends the out-of-lane inputs to each lane into the lane
10224 /// from a permuted copy of the vector. This lowering strategy results in four
10225 /// instructions in the worst case for a single-input cross lane shuffle which
10226 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10227 /// of. Special cases for each particular shuffle pattern should be handled
10228 /// prior to trying this lowering.
10229 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10230                                                        SDValue V1, SDValue V2,
10231                                                        ArrayRef<int> Mask,
10232                                                        SelectionDAG &DAG) {
10233   // FIXME: This should probably be generalized for 512-bit vectors as well.
10234   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10235   int LaneSize = Mask.size() / 2;
10236
10237   // If there are only inputs from one 128-bit lane, splitting will in fact be
10238   // less expensive. The flags track wether the given lane contains an element
10239   // that crosses to another lane.
10240   bool LaneCrossing[2] = {false, false};
10241   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10242     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10243       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10244   if (!LaneCrossing[0] || !LaneCrossing[1])
10245     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10246
10247   if (isSingleInputShuffleMask(Mask)) {
10248     SmallVector<int, 32> FlippedBlendMask;
10249     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10250       FlippedBlendMask.push_back(
10251           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10252                                   ? Mask[i]
10253                                   : Mask[i] % LaneSize +
10254                                         (i / LaneSize) * LaneSize + Size));
10255
10256     // Flip the vector, and blend the results which should now be in-lane. The
10257     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10258     // 5 for the high source. The value 3 selects the high half of source 2 and
10259     // the value 2 selects the low half of source 2. We only use source 2 to
10260     // allow folding it into a memory operand.
10261     unsigned PERMMask = 3 | 2 << 4;
10262     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10263                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10264     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10265   }
10266
10267   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10268   // will be handled by the above logic and a blend of the results, much like
10269   // other patterns in AVX.
10270   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10271 }
10272
10273 /// \brief Handle lowering 2-lane 128-bit shuffles.
10274 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10275                                         SDValue V2, ArrayRef<int> Mask,
10276                                         const X86Subtarget *Subtarget,
10277                                         SelectionDAG &DAG) {
10278   // Blends are faster and handle all the non-lane-crossing cases.
10279   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10280                                                 Subtarget, DAG))
10281     return Blend;
10282
10283   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10284                                VT.getVectorNumElements() / 2);
10285   // Check for patterns which can be matched with a single insert of a 128-bit
10286   // subvector.
10287   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10288       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10289     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10290                               DAG.getIntPtrConstant(0));
10291     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10292                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10293     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10294   }
10295   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10296     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10297                               DAG.getIntPtrConstant(0));
10298     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10299                               DAG.getIntPtrConstant(2));
10300     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10301   }
10302
10303   // Otherwise form a 128-bit permutation.
10304   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10305   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10306   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10307                      DAG.getConstant(PermMask, MVT::i8));
10308 }
10309
10310 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10311 /// shuffling each lane.
10312 ///
10313 /// This will only succeed when the result of fixing the 128-bit lanes results
10314 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10315 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10316 /// the lane crosses early and then use simpler shuffles within each lane.
10317 ///
10318 /// FIXME: It might be worthwhile at some point to support this without
10319 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10320 /// in x86 only floating point has interesting non-repeating shuffles, and even
10321 /// those are still *marginally* more expensive.
10322 static SDValue lowerVectorShuffleByMerging128BitLanes(
10323     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10324     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10325   assert(!isSingleInputShuffleMask(Mask) &&
10326          "This is only useful with multiple inputs.");
10327
10328   int Size = Mask.size();
10329   int LaneSize = 128 / VT.getScalarSizeInBits();
10330   int NumLanes = Size / LaneSize;
10331   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10332
10333   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10334   // check whether the in-128-bit lane shuffles share a repeating pattern.
10335   SmallVector<int, 4> Lanes;
10336   Lanes.resize(NumLanes, -1);
10337   SmallVector<int, 4> InLaneMask;
10338   InLaneMask.resize(LaneSize, -1);
10339   for (int i = 0; i < Size; ++i) {
10340     if (Mask[i] < 0)
10341       continue;
10342
10343     int j = i / LaneSize;
10344
10345     if (Lanes[j] < 0) {
10346       // First entry we've seen for this lane.
10347       Lanes[j] = Mask[i] / LaneSize;
10348     } else if (Lanes[j] != Mask[i] / LaneSize) {
10349       // This doesn't match the lane selected previously!
10350       return SDValue();
10351     }
10352
10353     // Check that within each lane we have a consistent shuffle mask.
10354     int k = i % LaneSize;
10355     if (InLaneMask[k] < 0) {
10356       InLaneMask[k] = Mask[i] % LaneSize;
10357     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10358       // This doesn't fit a repeating in-lane mask.
10359       return SDValue();
10360     }
10361   }
10362
10363   // First shuffle the lanes into place.
10364   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10365                                 VT.getSizeInBits() / 64);
10366   SmallVector<int, 8> LaneMask;
10367   LaneMask.resize(NumLanes * 2, -1);
10368   for (int i = 0; i < NumLanes; ++i)
10369     if (Lanes[i] >= 0) {
10370       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10371       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10372     }
10373
10374   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10375   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10376   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10377
10378   // Cast it back to the type we actually want.
10379   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10380
10381   // Now do a simple shuffle that isn't lane crossing.
10382   SmallVector<int, 8> NewMask;
10383   NewMask.resize(Size, -1);
10384   for (int i = 0; i < Size; ++i)
10385     if (Mask[i] >= 0)
10386       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10387   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10388          "Must not introduce lane crosses at this point!");
10389
10390   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10391 }
10392
10393 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10394 /// given mask.
10395 ///
10396 /// This returns true if the elements from a particular input are already in the
10397 /// slot required by the given mask and require no permutation.
10398 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10399   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10400   int Size = Mask.size();
10401   for (int i = 0; i < Size; ++i)
10402     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10403       return false;
10404
10405   return true;
10406 }
10407
10408 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10409 ///
10410 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10411 /// isn't available.
10412 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10413                                        const X86Subtarget *Subtarget,
10414                                        SelectionDAG &DAG) {
10415   SDLoc DL(Op);
10416   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10417   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10418   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10419   ArrayRef<int> Mask = SVOp->getMask();
10420   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10421
10422   SmallVector<int, 4> WidenedMask;
10423   if (canWidenShuffleElements(Mask, WidenedMask))
10424     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10425                                     DAG);
10426
10427   if (isSingleInputShuffleMask(Mask)) {
10428     // Check for being able to broadcast a single element.
10429     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10430                                                           Mask, Subtarget, DAG))
10431       return Broadcast;
10432
10433     // Use low duplicate instructions for masks that match their pattern.
10434     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10435       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10436
10437     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10438       // Non-half-crossing single input shuffles can be lowerid with an
10439       // interleaved permutation.
10440       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10441                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10442       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10443                          DAG.getConstant(VPERMILPMask, MVT::i8));
10444     }
10445
10446     // With AVX2 we have direct support for this permutation.
10447     if (Subtarget->hasAVX2())
10448       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10449                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10450
10451     // Otherwise, fall back.
10452     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10453                                                    DAG);
10454   }
10455
10456   // X86 has dedicated unpack instructions that can handle specific blend
10457   // operations: UNPCKH and UNPCKL.
10458   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10459     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10460   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10461     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10462
10463   // If we have a single input to the zero element, insert that into V1 if we
10464   // can do so cheaply.
10465   int NumV2Elements =
10466       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10467   if (NumV2Elements == 1 && Mask[0] >= 4)
10468     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10469             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10470       return Insertion;
10471
10472   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10473                                                 Subtarget, DAG))
10474     return Blend;
10475
10476   // Check if the blend happens to exactly fit that of SHUFPD.
10477   if ((Mask[0] == -1 || Mask[0] < 2) &&
10478       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10479       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10480       (Mask[3] == -1 || Mask[3] >= 6)) {
10481     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10482                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10483     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10484                        DAG.getConstant(SHUFPDMask, MVT::i8));
10485   }
10486   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10487       (Mask[1] == -1 || Mask[1] < 2) &&
10488       (Mask[2] == -1 || Mask[2] >= 6) &&
10489       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10490     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10491                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10492     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10493                        DAG.getConstant(SHUFPDMask, MVT::i8));
10494   }
10495
10496   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10497   // shuffle. However, if we have AVX2 and either input is already in place,
10498   // we will be able to shuffle even across lanes the other input in a single
10499   // instruction so skip this pattern.
10500   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10501                                  isShuffleMaskInputInPlace(1, Mask))))
10502     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10503             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10504       return Result;
10505
10506   // If we have AVX2 then we always want to lower with a blend because at v4 we
10507   // can fully permute the elements.
10508   if (Subtarget->hasAVX2())
10509     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10510                                                       Mask, DAG);
10511
10512   // Otherwise fall back on generic lowering.
10513   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10514 }
10515
10516 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10517 ///
10518 /// This routine is only called when we have AVX2 and thus a reasonable
10519 /// instruction set for v4i64 shuffling.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 4-element mask.
      /// \param V1 first shuffle input, asserted to be v4i64.
      /// \param V2 second shuffle input, asserted to be v4i64.
      /// \param Subtarget asserted to have AVX2; also forwarded to helper lowerings.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the first strategy below that applies; the final
      ///          fallback is a decomposed shuffle-and-blend.
10520 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10521                                        const X86Subtarget *Subtarget,
10522                                        SelectionDAG &DAG) {
10523   SDLoc DL(Op);
10524   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10525   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10526   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10527   ArrayRef<int> Mask = SVOp->getMask();
10528   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10529   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10530
        // If the mask elements can be widened, hand off to the 2 x 128-bit
        // lowering. Only the success of the widening is used here: the original
        // Mask, not WidenedMask, is what gets forwarded.
10531   SmallVector<int, 4> WidenedMask;
10532   if (canWidenShuffleElements(Mask, WidenedMask))
10533     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10534                                     DAG);
10535
        // Try the blend lowering before anything more expensive.
10536   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10537                                                 Subtarget, DAG))
10538     return Blend;
10539
10540   // Check for being able to broadcast a single element.
10541   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10542                                                         Mask, Subtarget, DAG))
10543     return Broadcast;
10544
10545   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10546   // use lower latency instructions that will operate on both 128-bit lanes.
10547   SmallVector<int, 2> RepeatedMask;
10548   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10549     if (isSingleInputShuffleMask(Mask)) {
            // Single input: perform the per-lane shuffle with PSHUFD on a v8i32
            // view of V1. Each 64-bit index in RepeatedMask becomes the pair of
            // 32-bit indices (2*i, 2*i+1); undef (negative) entries stay -1.
10550       int PSHUFDMask[] = {-1, -1, -1, -1};
10551       for (int i = 0; i < 2; ++i)
10552         if (RepeatedMask[i] >= 0) {
10553           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10554           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10555         }
            // Bitcast to v8i32 for the PSHUFD and back to v4i64 for the result.
10556       return DAG.getNode(
10557           ISD::BITCAST, DL, MVT::v4i64,
10558           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10559                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10560                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10561     }
10562
10563     // Use dedicated unpack instructions for masks that match their pattern.
10564     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10565       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10566     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10567       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10568   }
10569
10570   // AVX2 provides a direct instruction for permuting a single input across
10571   // lanes.
10572   if (isSingleInputShuffleMask(Mask))
10573     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10574                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10575
10576   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10577   // shuffle. However, if we have AVX2 and either input is already in place,
10578   // we will be able to shuffle even across lanes the other input in a single
10579   // instruction so skip this pattern.
        // (hasAVX2() is asserted on entry, so in practice only the two in-place
        // checks decide whether the merge is attempted.)
10580   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10581                                  isShuffleMaskInputInPlace(1, Mask))))
10582     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10583             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10584       return Result;
10585
10586   // Otherwise fall back on generic blend lowering.
10587   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10588                                                     Mask, DAG);
10589 }
10590
10591 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10592 ///
10593 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10594 /// isn't available.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 8-element mask.
      /// \param V1 first shuffle input, asserted to be v8f32.
      /// \param V2 second shuffle input, asserted to be v8f32.
      /// \param Subtarget queried for AVX2 and forwarded to helper lowerings.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the first strategy below that applies; the final
      ///          fallback is decomposed shuffle-and-blend with AVX2, otherwise
      ///          split-or-blend.
10595 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10596                                        const X86Subtarget *Subtarget,
10597                                        SelectionDAG &DAG) {
10598   SDLoc DL(Op);
10599   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10600   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10601   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10602   ArrayRef<int> Mask = SVOp->getMask();
10603   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10604
        // Try the blend lowering first; the SHUFPS fallback further down relies on
        // direct blends having been handled already.
10605   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10606                                                 Subtarget, DAG))
10607     return Blend;
10608
10609   // Check for being able to broadcast a single element.
10610   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10611                                                         Mask, Subtarget, DAG))
10612     return Broadcast;
10613
10614   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10615   // options to efficiently lower the shuffle.
10616   SmallVector<int, 4> RepeatedMask;
10617   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10618     assert(RepeatedMask.size() == 4 &&
10619            "Repeated masks must be half the mask width!");
10620
10621     // Use even/odd duplicate instructions for masks that match their pattern.
10622     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10623       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10624     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10625       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10626
          // A single-input repeated mask maps directly onto the immediate form of
          // VPERMILPS, using the 4-element repeated mask as the immediate.
10627     if (isSingleInputShuffleMask(Mask))
10628       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10629                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10630
10631     // Use dedicated unpack instructions for masks that match their pattern.
10632     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10633       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10634     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10635       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10636
10637     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10638     // have already handled any direct blends. We also need to squash the
10639     // repeated mask into a simulated v4f32 mask.
          // Entries of 8 and above are shifted down by 4 so that they land in the
          // 4..7 range used for the second input of a 4-element mask.
10640     for (int i = 0; i < 4; ++i)
10641       if (RepeatedMask[i] >= 8)
10642         RepeatedMask[i] -= 4;
10643     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10644   }
10645
10646   // If we have a single input shuffle with different shuffle patterns in the
10647   // two 128-bit lanes use the variable mask to VPERMILPS.
10648   if (isSingleInputShuffleMask(Mask)) {
          // Materialize the mask as i32 constants; negative (undef) entries become
          // undef operands.
10649     SDValue VPermMask[8];
10650     for (int i = 0; i < 8; ++i)
10651       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10652                                  : DAG.getConstant(Mask[i], MVT::i32);
          // No element crosses a 128-bit lane: the variable-mask in-lane permute
          // is sufficient.
10653     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10654       return DAG.getNode(
10655           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10656           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10657
          // Lane-crossing with AVX2: use VPERMV. Note the operand order (mask
          // first, then V1) and that the v8i32 mask vector is bitcast to v8f32.
10658     if (Subtarget->hasAVX2())
10659       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10660                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10661                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10662                                                  MVT::v8i32, VPermMask)),
10663                          V1);
10664
10665     // Otherwise, fall back.
10666     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10667                                                    DAG);
10668   }
10669
10670   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10671   // shuffle.
10672   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10673           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10674     return Result;
10675
10676   // If we have AVX2 then we always want to lower with a blend because at v8 we
10677   // can fully permute the elements.
10678   if (Subtarget->hasAVX2())
10679     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10680                                                       Mask, DAG);
10681
10682   // Otherwise fall back on generic lowering.
10683   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10684 }
10685
10686 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10687 ///
10688 /// This routine is only called when we have AVX2 and thus a reasonable
10689 /// instruction set for v8i32 shuffling.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 8-element mask.
      /// \param V1 first shuffle input, asserted to be v8i32.
      /// \param V2 second shuffle input, asserted to be v8i32.
      /// \param Subtarget asserted to have AVX2; also forwarded to helper lowerings.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the first strategy below that applies; the final
      ///          fallback is a decomposed shuffle-and-blend.
10690 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10691                                        const X86Subtarget *Subtarget,
10692                                        SelectionDAG &DAG) {
10693   SDLoc DL(Op);
10694   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10695   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10696   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10697   ArrayRef<int> Mask = SVOp->getMask();
10698   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10699   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10700
10701   // Whenever we can lower this as a zext, that instruction is strictly faster
10702   // than any alternative. It also allows us to fold memory operands into the
10703   // shuffle in many cases.
10704   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10705                                                          Mask, Subtarget, DAG))
10706     return ZExt;
10707
        // Next preference: the blend lowering.
10708   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10709                                                 Subtarget, DAG))
10710     return Blend;
10711
10712   // Check for being able to broadcast a single element.
10713   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10714                                                         Mask, Subtarget, DAG))
10715     return Broadcast;
10716
10717   // If the shuffle mask is repeated in each 128-bit lane we can use more
10718   // efficient instructions that mirror the shuffles across the two 128-bit
10719   // lanes.
10720   SmallVector<int, 4> RepeatedMask;
10721   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10722     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
          // Single input: PSHUFD with the 4-element repeated mask as immediate.
10723     if (isSingleInputShuffleMask(Mask))
10724       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10725                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10726
10727     // Use dedicated unpack instructions for masks that match their pattern.
10728     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10729       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10730     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10731       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
          // A repeated two-input mask that matches neither unpack falls through
          // to the strategies below.
10732   }
10733
10734   // If the shuffle patterns aren't repeated but it is a single input, directly
10735   // generate a cross-lane VPERMD instruction.
10736   if (isSingleInputShuffleMask(Mask)) {
          // Materialize the mask as i32 constants; negative (undef) entries become
          // undef operands. The mask vector is the first VPERMV operand, V1 the
          // second.
10737     SDValue VPermMask[8];
10738     for (int i = 0; i < 8; ++i)
10739       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10740                                  : DAG.getConstant(Mask[i], MVT::i32);
10741     return DAG.getNode(
10742         X86ISD::VPERMV, DL, MVT::v8i32,
10743         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10744   }
10745
10746   // Try to use bit shift instructions.
10747   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10748           DL, MVT::v8i32, V1, V2, Mask, DAG))
10749     return Shift;
10750
10751   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10752   // shuffle.
10753   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10754           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10755     return Result;
10756
10757   // Otherwise fall back on generic blend lowering.
10758   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10759                                                     Mask, DAG);
10760 }
10761
10762 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10763 ///
10764 /// This routine is only called when we have AVX2 and thus a reasonable
10765 /// instruction set for v16i16 shuffling.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 16-element mask.
      /// \param V1 first shuffle input, asserted to be v16i16.
      /// \param V2 second shuffle input, asserted to be v16i16.
      /// \param Subtarget asserted to have AVX2; also forwarded to helper lowerings.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the first strategy below that applies; the final
      ///          fallback is split-or-blend.
10766 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10767                                         const X86Subtarget *Subtarget,
10768                                         SelectionDAG &DAG) {
10769   SDLoc DL(Op);
10770   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10771   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10772   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10773   ArrayRef<int> Mask = SVOp->getMask();
10774   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10775   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10776
10777   // Whenever we can lower this as a zext, that instruction is strictly faster
10778   // than any alternative. It also allows us to fold memory operands into the
10779   // shuffle in many cases.
10780   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10781                                                          Mask, Subtarget, DAG))
10782     return ZExt;
10783
10784   // Check for being able to broadcast a single element.
10785   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10786                                                         Mask, Subtarget, DAG))
10787     return Broadcast;
10788
        // Then try the blend lowering.
10789   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10790                                                 Subtarget, DAG))
10791     return Blend;
10792
10793   // Use dedicated unpack instructions for masks that match their pattern.
10794   if (isShuffleEquivalent(Mask,
10795                           // First 128-bit lane:
10796                           0, 16, 1, 17, 2, 18, 3, 19,
10797                           // Second 128-bit lane:
10798                           8, 24, 9, 25, 10, 26, 11, 27))
10799     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10800   if (isShuffleEquivalent(Mask,
10801                           // First 128-bit lane:
10802                           4, 20, 5, 21, 6, 22, 7, 23,
10803                           // Second 128-bit lane:
10804                           12, 28, 13, 29, 14, 30, 15, 31))
10805     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10806
10807   if (isSingleInputShuffleMask(Mask)) {
10808     // There are no generalized cross-lane shuffle operations available on i16
10809     // element types.
10810     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10811       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10812                                                      Mask, DAG);
10813
          // Build a per-byte PSHUFB control vector: every i16 mask element expands
          // to two consecutive byte selectors, and an undef element yields two
          // undef bytes.
10814     SDValue PSHUFBMask[32];
10815     for (int i = 0; i < 16; ++i) {
10816       if (Mask[i] == -1) {
10817         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10818         continue;
10819       }
10820
            // Make the index relative to its 128-bit lane: result elements 8..15
            // sit in the second lane, so their source index is reduced by 8.
            // NOTE(review): presumably PSHUFB selects bytes within each 128-bit
            // lane, which is why lane-relative indices are needed -- confirm.
10821       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10822       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10823       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10824       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10825     }
          // Shuffle on a v32i8 view of V1 and bitcast the result back to v16i16.
10826     return DAG.getNode(
10827         ISD::BITCAST, DL, MVT::v16i16,
10828         DAG.getNode(
10829             X86ISD::PSHUFB, DL, MVT::v32i8,
10830             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10831             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10832   }
10833
10834   // Try to use bit shift instructions.
10835   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10836           DL, MVT::v16i16, V1, V2, Mask, DAG))
10837     return Shift;
10838
10839   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10840   // shuffle.
10841   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10842           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10843     return Result;
10844
10845   // Otherwise fall back on generic lowering.
10846   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10847 }
10848
10849 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10850 ///
10851 /// This routine is only called when we have AVX2 and thus a reasonable
10852 /// instruction set for v32i8 shuffling.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 32-element mask.
      /// \param V1 first shuffle input, asserted to be v32i8.
      /// \param V2 second shuffle input, asserted to be v32i8.
      /// \param Subtarget asserted to have AVX2; also forwarded to helper lowerings.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the first strategy below that applies; the final
      ///          fallback is split-or-blend.
10853 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10854                                        const X86Subtarget *Subtarget,
10855                                        SelectionDAG &DAG) {
10856   SDLoc DL(Op);
10857   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10858   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10859   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10860   ArrayRef<int> Mask = SVOp->getMask();
10861   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10862   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10863
10864   // Whenever we can lower this as a zext, that instruction is strictly faster
10865   // than any alternative. It also allows us to fold memory operands into the
10866   // shuffle in many cases.
10867   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10868                                                          Mask, Subtarget, DAG))
10869     return ZExt;
10870
10871   // Check for being able to broadcast a single element.
10872   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10873                                                         Mask, Subtarget, DAG))
10874     return Broadcast;
10875
        // Then try the blend lowering.
10876   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10877                                                 Subtarget, DAG))
10878     return Blend;
10879
10880   // Use dedicated unpack instructions for masks that match their pattern.
10881   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10882   // 256-bit lanes.
10883   if (isShuffleEquivalent(
10884           Mask,
10885           // First 128-bit lane:
10886           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10887           // Second 128-bit lane:
10888           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10889     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10890   if (isShuffleEquivalent(
10891           Mask,
10892           // First 128-bit lane:
10893           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10894           // Second 128-bit lane:
10895           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10896     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10897
10898   if (isSingleInputShuffleMask(Mask)) {
10899     // There are no generalized cross-lane shuffle operations available on i8
10900     // element types.
10901     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10902       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10903                                                      Mask, DAG);
10904
          // Build the PSHUFB control vector, one byte selector per mask element.
          // Undef (negative) entries become undef; other indices are reduced to
          // the 0..15 range by subtracting 16 from values of 16 and above.
          // Lane-crossing masks were already dispatched above, so only in-lane
          // masks reach this point.
10905     SDValue PSHUFBMask[32];
10906     for (int i = 0; i < 32; ++i)
10907       PSHUFBMask[i] =
10908           Mask[i] < 0
10909               ? DAG.getUNDEF(MVT::i8)
10910               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10911
10912     return DAG.getNode(
10913         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10914         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10915   }
10916
10917   // Try to use bit shift instructions.
10918   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10919           DL, MVT::v32i8, V1, V2, Mask, DAG))
10920     return Shift;
10921
10922   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10923   // shuffle.
10924   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10925           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10926     return Result;
10927
10928   // Otherwise fall back on generic lowering.
10929   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10930 }
10931
10932 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10933 ///
10934 /// This routine either breaks down the specific type of a 256-bit x86 vector
10935 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10936 /// together based on the available instructions.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the mask.
      /// \param V1 first shuffle input.
      /// \param V2 second shuffle input.
      /// \param VT the vector type of the shuffle; must be one of the six types
      ///        handled by the switch below, anything else is unreachable.
      /// \param Subtarget queried for AVX2 and forwarded to the per-type routines.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the lowered value produced by the selected strategy.
10937 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10938                                         MVT VT, const X86Subtarget *Subtarget,
10939                                         SelectionDAG &DAG) {
10940   SDLoc DL(Op);
10941   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10942   ArrayRef<int> Mask = SVOp->getMask();
10943
10944   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10945   // check for those subtargets here and avoid much of the subtarget querying in
10946   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10947   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10948   // floating point types there eventually, just immediately cast everything to
10949   // a float and operate entirely in that domain.
10950   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10951     int ElementBits = VT.getScalarSizeInBits();
10952     if (ElementBits < 32)
10953       // No floating point type available, decompose into 128-bit vectors.
10954       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10955
          // Re-express the shuffle on the floating point vector type with the same
          // element width and count, using the same mask, then bitcast the result
          // back to the integer type. This emits a new generic vector shuffle node
          // rather than calling a per-type routine directly.
10956     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10957                                 VT.getVectorNumElements());
10958     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10959     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10960     return DAG.getNode(ISD::BITCAST, DL, VT,
10961                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10962   }
10963
        // Dispatch on the exact vector type. Integer types only get here when AVX2
        // is available, which the integer routines assert.
10964   switch (VT.SimpleTy) {
10965   case MVT::v4f64:
10966     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10967   case MVT::v4i64:
10968     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10969   case MVT::v8f32:
10970     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10971   case MVT::v8i32:
10972     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10973   case MVT::v16i16:
10974     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10975   case MVT::v32i8:
10976     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10977
10978   default:
10979     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10980   }
10981 }
10982
10983 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
      ///
      /// Only the two unpack patterns are lowered directly; every other mask is
      /// split and lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 8-element mask.
      /// \param V1 first shuffle input, asserted to be v8f64.
      /// \param V2 second shuffle input, asserted to be v8f64.
      /// \param Subtarget not referenced in the body of this routine.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns an UNPCKL/UNPCKH node or the result of the split lowering.
10984 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10985                                        const X86Subtarget *Subtarget,
10986                                        SelectionDAG &DAG) {
10987   SDLoc DL(Op);
10988   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10989   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10990   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10991   ArrayRef<int> Mask = SVOp->getMask();
10992   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10993
10994   // X86 has dedicated unpack instructions that can handle specific blend
10995   // operations: UNPCKH and UNPCKL.
10996   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10997     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10998   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10999     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11000
11001   // FIXME: Implement direct support for this type!
11002   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11003 }
11004
11005 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
      ///
      /// Only the two unpack patterns are lowered directly; every other mask is
      /// split and lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 16-element mask.
      /// \param V1 first shuffle input, asserted to be v16f32.
      /// \param V2 second shuffle input, asserted to be v16f32.
      /// \param Subtarget not referenced in the body of this routine.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns an UNPCKL/UNPCKH node or the result of the split lowering.
11006 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11007                                        const X86Subtarget *Subtarget,
11008                                        SelectionDAG &DAG) {
11009   SDLoc DL(Op);
11010   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11011   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11012   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11013   ArrayRef<int> Mask = SVOp->getMask();
11014   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11015
11016   // Use dedicated unpack instructions for masks that match their pattern.
        // Each row of indices below covers two groups of four elements (128 bits).
11017   if (isShuffleEquivalent(Mask,
11018                           0, 16, 1, 17, 4, 20, 5, 21,
11019                           8, 24, 9, 25, 12, 28, 13, 29))
11020     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11021   if (isShuffleEquivalent(Mask,
11022                           2, 18, 3, 19, 6, 22, 7, 23,
11023                           10, 26, 11, 27, 14, 30, 15, 31))
11024     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11025
11026   // FIXME: Implement direct support for this type!
11027   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11028 }
11029
11030 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
      ///
      /// Only the two unpack patterns are lowered directly; every other mask is
      /// split and lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 8-element mask.
      /// \param V1 first shuffle input, asserted to be v8i64.
      /// \param V2 second shuffle input, asserted to be v8i64.
      /// \param Subtarget not referenced in the body of this routine.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns an UNPCKL/UNPCKH node or the result of the split lowering.
11031 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11032                                        const X86Subtarget *Subtarget,
11033                                        SelectionDAG &DAG) {
11034   SDLoc DL(Op);
11035   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11036   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11037   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11038   ArrayRef<int> Mask = SVOp->getMask();
11039   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11040
11041   // X86 has dedicated unpack instructions that can handle specific blend
11042   // operations: UNPCKH and UNPCKL.
11043   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11044     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11045   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11046     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11047
11048   // FIXME: Implement direct support for this type!
11049   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11050 }
11051
11052 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
      ///
      /// Only the two unpack patterns are lowered directly; every other mask is
      /// split and lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 16-element mask.
      /// \param V1 first shuffle input, asserted to be v16i32.
      /// \param V2 second shuffle input, asserted to be v16i32.
      /// \param Subtarget not referenced in the body of this routine.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns an UNPCKL/UNPCKH node or the result of the split lowering.
11053 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11054                                        const X86Subtarget *Subtarget,
11055                                        SelectionDAG &DAG) {
11056   SDLoc DL(Op);
11057   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11058   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11059   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11060   ArrayRef<int> Mask = SVOp->getMask();
11061   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11062
11063   // Use dedicated unpack instructions for masks that match their pattern.
        // Each row of indices below covers two groups of four elements (128 bits).
11064   if (isShuffleEquivalent(Mask,
11065                           0, 16, 1, 17, 4, 20, 5, 21,
11066                           8, 24, 9, 25, 12, 28, 13, 29))
11067     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11068   if (isShuffleEquivalent(Mask,
11069                           2, 18, 3, 19, 6, 22, 7, 23,
11070                           10, 26, 11, 27, 14, 30, 15, 31))
11071     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11072
11073   // FIXME: Implement direct support for this type!
11074   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11075 }
11076
11077 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
      ///
      /// No pattern is lowered directly yet: the shuffle is always split and
      /// lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 32-element mask.
      /// \param V1 first shuffle input, asserted to be v32i16.
      /// \param V2 second shuffle input, asserted to be v32i16.
      /// \param Subtarget asserted to have BWI; not otherwise used here.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the split lowering.
11078 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11079                                         const X86Subtarget *Subtarget,
11080                                         SelectionDAG &DAG) {
11081   SDLoc DL(Op);
11082   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11083   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11084   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11085   ArrayRef<int> Mask = SVOp->getMask();
11086   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11087   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11088
11089   // FIXME: Implement direct support for this type!
11090   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11091 }
11092
11093 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
      ///
      /// No pattern is lowered directly yet: the shuffle is always split and
      /// lowered on smaller vectors.
      ///
      /// \param Op the shuffle being lowered; it is cast to ShuffleVectorSDNode to
      ///        obtain the 64-element mask.
      /// \param V1 first shuffle input, asserted to be v64i8.
      /// \param V2 second shuffle input, asserted to be v64i8.
      /// \param Subtarget asserted to have BWI; not otherwise used here.
      /// \param DAG the DAG in which the replacement nodes are built.
      /// \returns the result of the split lowering.
11094 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11095                                        const X86Subtarget *Subtarget,
11096                                        SelectionDAG &DAG) {
11097   SDLoc DL(Op);
11098   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11099   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11100   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11101   ArrayRef<int> Mask = SVOp->getMask();
11102   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11103   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11104
11105   // FIXME: Implement direct support for this type!
11106   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11107 }
11108
11109 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11110 ///
11111 /// This routine either breaks down the specific type of a 512-bit x86 vector
11112 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11113 /// together based on the available instructions.
11114 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11115                                         MVT VT, const X86Subtarget *Subtarget,
11116                                         SelectionDAG &DAG) {
11117   SDLoc DL(Op);
11118   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11119   ArrayRef<int> Mask = SVOp->getMask();
11120   assert(Subtarget->hasAVX512() &&
11121          "Cannot lower 512-bit vectors w/ basic ISA!");
11122
11123   // Check for being able to broadcast a single element.
11124   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11125                                                         Mask, Subtarget, DAG))
11126     return Broadcast;
11127
11128   // Dispatch to each element type for lowering. If we don't have supprot for
11129   // specific element type shuffles at 512 bits, immediately split them and
11130   // lower them. Each lowering routine of a given type is allowed to assume that
11131   // the requisite ISA extensions for that element type are available.
11132   switch (VT.SimpleTy) {
11133   case MVT::v8f64:
11134     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11135   case MVT::v16f32:
11136     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11137   case MVT::v8i64:
11138     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11139   case MVT::v16i32:
11140     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11141   case MVT::v32i16:
11142     if (Subtarget->hasBWI())
11143       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11144     break;
11145   case MVT::v64i8:
11146     if (Subtarget->hasBWI())
11147       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11148     break;
11149
11150   default:
11151     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11152   }
11153
11154   // Otherwise fall back on splitting.
11155   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11156 }
11157
11158 /// \brief Top-level lowering for x86 vector shuffles.
11159 ///
11160 /// This handles decomposition, canonicalization, and lowering of all x86
11161 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11162 /// above in helper routines. The canonicalization attempts to widen shuffles
11163 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11164 /// s.t. only one of the two inputs needs to be tested, etc.
11165 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11166                                   SelectionDAG &DAG) {
11167   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11168   ArrayRef<int> Mask = SVOp->getMask();
11169   SDValue V1 = Op.getOperand(0);
11170   SDValue V2 = Op.getOperand(1);
11171   MVT VT = Op.getSimpleValueType();
11172   int NumElements = VT.getVectorNumElements();
11173   SDLoc dl(Op);
11174
11175   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11176
11177   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11178   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11179   if (V1IsUndef && V2IsUndef)
11180     return DAG.getUNDEF(VT);
11181
11182   // When we create a shuffle node we put the UNDEF node to second operand,
11183   // but in some cases the first operand may be transformed to UNDEF.
11184   // In this case we should just commute the node.
11185   if (V1IsUndef)
11186     return DAG.getCommutedVectorShuffle(*SVOp);
11187
11188   // Check for non-undef masks pointing at an undef vector and make the masks
11189   // undef as well. This makes it easier to match the shuffle based solely on
11190   // the mask.
11191   if (V2IsUndef)
11192     for (int M : Mask)
11193       if (M >= NumElements) {
11194         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11195         for (int &M : NewMask)
11196           if (M >= NumElements)
11197             M = -1;
11198         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11199       }
11200
11201   // Try to collapse shuffles into using a vector type with fewer elements but
11202   // wider element types. We cap this to not form integers or floating point
11203   // elements wider than 64 bits, but it might be interesting to form i128
11204   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11205   SmallVector<int, 16> WidenedMask;
11206   if (VT.getScalarSizeInBits() < 64 &&
11207       canWidenShuffleElements(Mask, WidenedMask)) {
11208     MVT NewEltVT = VT.isFloatingPoint()
11209                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11210                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11211     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11212     // Make sure that the new vector type is legal. For example, v2f64 isn't
11213     // legal on SSE1.
11214     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11215       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11216       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11217       return DAG.getNode(ISD::BITCAST, dl, VT,
11218                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11219     }
11220   }
11221
11222   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11223   for (int M : SVOp->getMask())
11224     if (M < 0)
11225       ++NumUndefElements;
11226     else if (M < NumElements)
11227       ++NumV1Elements;
11228     else
11229       ++NumV2Elements;
11230
11231   // Commute the shuffle as needed such that more elements come from V1 than
11232   // V2. This allows us to match the shuffle pattern strictly on how many
11233   // elements come from V1 without handling the symmetric cases.
11234   if (NumV2Elements > NumV1Elements)
11235     return DAG.getCommutedVectorShuffle(*SVOp);
11236
11237   // When the number of V1 and V2 elements are the same, try to minimize the
11238   // number of uses of V2 in the low half of the vector. When that is tied,
11239   // ensure that the sum of indices for V1 is equal to or lower than the sum
11240   // indices for V2. When those are equal, try to ensure that the number of odd
11241   // indices for V1 is lower than the number of odd indices for V2.
11242   if (NumV1Elements == NumV2Elements) {
11243     int LowV1Elements = 0, LowV2Elements = 0;
11244     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11245       if (M >= NumElements)
11246         ++LowV2Elements;
11247       else if (M >= 0)
11248         ++LowV1Elements;
11249     if (LowV2Elements > LowV1Elements) {
11250       return DAG.getCommutedVectorShuffle(*SVOp);
11251     } else if (LowV2Elements == LowV1Elements) {
11252       int SumV1Indices = 0, SumV2Indices = 0;
11253       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11254         if (SVOp->getMask()[i] >= NumElements)
11255           SumV2Indices += i;
11256         else if (SVOp->getMask()[i] >= 0)
11257           SumV1Indices += i;
11258       if (SumV2Indices < SumV1Indices) {
11259         return DAG.getCommutedVectorShuffle(*SVOp);
11260       } else if (SumV2Indices == SumV1Indices) {
11261         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11262         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11263           if (SVOp->getMask()[i] >= NumElements)
11264             NumV2OddIndices += i % 2;
11265           else if (SVOp->getMask()[i] >= 0)
11266             NumV1OddIndices += i % 2;
11267         if (NumV2OddIndices < NumV1OddIndices)
11268           return DAG.getCommutedVectorShuffle(*SVOp);
11269       }
11270     }
11271   }
11272
11273   // For each vector width, delegate to a specialized lowering routine.
11274   if (VT.getSizeInBits() == 128)
11275     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11276
11277   if (VT.getSizeInBits() == 256)
11278     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11279
11280   // Force AVX-512 vectors to be scalarized for now.
11281   // FIXME: Implement AVX-512 support!
11282   if (VT.getSizeInBits() == 512)
11283     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11284
11285   llvm_unreachable("Unimplemented!");
11286 }
11287
11288
11289 //===----------------------------------------------------------------------===//
11290 // Legacy vector shuffle lowering
11291 //
11292 // This code is the legacy code handling vector shuffles until the above
11293 // replaces its functionality and performance.
11294 //===----------------------------------------------------------------------===//
11295
11296 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11297                         bool hasInt256, unsigned *MaskOut = nullptr) {
11298   MVT EltVT = VT.getVectorElementType();
11299
11300   // There is no blend with immediate in AVX-512.
11301   if (VT.is512BitVector())
11302     return false;
11303
11304   if (!hasSSE41 || EltVT == MVT::i8)
11305     return false;
11306   if (!hasInt256 && VT == MVT::v16i16)
11307     return false;
11308
11309   unsigned MaskValue = 0;
11310   unsigned NumElems = VT.getVectorNumElements();
11311   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11312   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11313   unsigned NumElemsInLane = NumElems / NumLanes;
11314
11315   // Blend for v16i16 should be symetric for the both lanes.
11316   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11317
11318     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11319     int EltIdx = MaskVals[i];
11320
11321     if ((EltIdx < 0 || EltIdx == (int)i) &&
11322         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11323       continue;
11324
11325     if (((unsigned)EltIdx == (i + NumElems)) &&
11326         (SndLaneEltIdx < 0 ||
11327          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11328       MaskValue |= (1 << i);
11329     else
11330       return false;
11331   }
11332
11333   if (MaskOut)
11334     *MaskOut = MaskValue;
11335   return true;
11336 }
11337
11338 // Try to lower a shuffle node into a simple blend instruction.
11339 // This function assumes isBlendMask returns true for this
11340 // SuffleVectorSDNode
11341 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11342                                           unsigned MaskValue,
11343                                           const X86Subtarget *Subtarget,
11344                                           SelectionDAG &DAG) {
11345   MVT VT = SVOp->getSimpleValueType(0);
11346   MVT EltVT = VT.getVectorElementType();
11347   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11348                      Subtarget->hasInt256() && "Trying to lower a "
11349                                                "VECTOR_SHUFFLE to a Blend but "
11350                                                "with the wrong mask"));
11351   SDValue V1 = SVOp->getOperand(0);
11352   SDValue V2 = SVOp->getOperand(1);
11353   SDLoc dl(SVOp);
11354   unsigned NumElems = VT.getVectorNumElements();
11355
11356   // Convert i32 vectors to floating point if it is not AVX2.
11357   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11358   MVT BlendVT = VT;
11359   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11360     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11361                                NumElems);
11362     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11363     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11364   }
11365
11366   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11367                             DAG.getConstant(MaskValue, MVT::i32));
11368   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11369 }
11370
11371 /// In vector type \p VT, return true if the element at index \p InputIdx
11372 /// falls on a different 128-bit lane than \p OutputIdx.
11373 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11374                                      unsigned OutputIdx) {
11375   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11376   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11377 }
11378
11379 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11380 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11381 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11382 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11383 /// zero.
11384 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11385                          SelectionDAG &DAG) {
11386   MVT VT = V1.getSimpleValueType();
11387   assert(VT.is128BitVector() || VT.is256BitVector());
11388
11389   MVT EltVT = VT.getVectorElementType();
11390   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11391   unsigned NumElts = VT.getVectorNumElements();
11392
11393   SmallVector<SDValue, 32> PshufbMask;
11394   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11395     int InputIdx = MaskVals[OutputIdx];
11396     unsigned InputByteIdx;
11397
11398     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11399       InputByteIdx = 0x80;
11400     else {
11401       // Cross lane is not allowed.
11402       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11403         return SDValue();
11404       InputByteIdx = InputIdx * EltSizeInBytes;
11405       // Index is an byte offset within the 128-bit lane.
11406       InputByteIdx &= 0xf;
11407     }
11408
11409     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11410       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11411       if (InputByteIdx != 0x80)
11412         ++InputByteIdx;
11413     }
11414   }
11415
11416   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11417   if (ShufVT != VT)
11418     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11419   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11420                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11421 }
11422
11423 // v8i16 shuffles - Prefer shuffles in the following order:
11424 // 1. [all]   pshuflw, pshufhw, optional move
11425 // 2. [ssse3] 1 x pshufb
11426 // 3. [ssse3] 2 x pshufb + 1 x por
11427 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11428 static SDValue
11429 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11430                          SelectionDAG &DAG) {
11431   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11432   SDValue V1 = SVOp->getOperand(0);
11433   SDValue V2 = SVOp->getOperand(1);
11434   SDLoc dl(SVOp);
11435   SmallVector<int, 8> MaskVals;
11436
11437   // Determine if more than 1 of the words in each of the low and high quadwords
11438   // of the result come from the same quadword of one of the two inputs.  Undef
11439   // mask values count as coming from any quadword, for better codegen.
11440   //
11441   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11442   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11443   unsigned LoQuad[] = { 0, 0, 0, 0 };
11444   unsigned HiQuad[] = { 0, 0, 0, 0 };
11445   // Indices of quads used.
11446   std::bitset<4> InputQuads;
11447   for (unsigned i = 0; i < 8; ++i) {
11448     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11449     int EltIdx = SVOp->getMaskElt(i);
11450     MaskVals.push_back(EltIdx);
11451     if (EltIdx < 0) {
11452       ++Quad[0];
11453       ++Quad[1];
11454       ++Quad[2];
11455       ++Quad[3];
11456       continue;
11457     }
11458     ++Quad[EltIdx / 4];
11459     InputQuads.set(EltIdx / 4);
11460   }
11461
11462   int BestLoQuad = -1;
11463   unsigned MaxQuad = 1;
11464   for (unsigned i = 0; i < 4; ++i) {
11465     if (LoQuad[i] > MaxQuad) {
11466       BestLoQuad = i;
11467       MaxQuad = LoQuad[i];
11468     }
11469   }
11470
11471   int BestHiQuad = -1;
11472   MaxQuad = 1;
11473   for (unsigned i = 0; i < 4; ++i) {
11474     if (HiQuad[i] > MaxQuad) {
11475       BestHiQuad = i;
11476       MaxQuad = HiQuad[i];
11477     }
11478   }
11479
11480   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11481   // of the two input vectors, shuffle them into one input vector so only a
11482   // single pshufb instruction is necessary. If there are more than 2 input
11483   // quads, disable the next transformation since it does not help SSSE3.
11484   bool V1Used = InputQuads[0] || InputQuads[1];
11485   bool V2Used = InputQuads[2] || InputQuads[3];
11486   if (Subtarget->hasSSSE3()) {
11487     if (InputQuads.count() == 2 && V1Used && V2Used) {
11488       BestLoQuad = InputQuads[0] ? 0 : 1;
11489       BestHiQuad = InputQuads[2] ? 2 : 3;
11490     }
11491     if (InputQuads.count() > 2) {
11492       BestLoQuad = -1;
11493       BestHiQuad = -1;
11494     }
11495   }
11496
11497   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11498   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11499   // words from all 4 input quadwords.
11500   SDValue NewV;
11501   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11502     int MaskV[] = {
11503       BestLoQuad < 0 ? 0 : BestLoQuad,
11504       BestHiQuad < 0 ? 1 : BestHiQuad
11505     };
11506     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11507                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11508                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11509     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11510
11511     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11512     // source words for the shuffle, to aid later transformations.
11513     bool AllWordsInNewV = true;
11514     bool InOrder[2] = { true, true };
11515     for (unsigned i = 0; i != 8; ++i) {
11516       int idx = MaskVals[i];
11517       if (idx != (int)i)
11518         InOrder[i/4] = false;
11519       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11520         continue;
11521       AllWordsInNewV = false;
11522       break;
11523     }
11524
11525     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11526     if (AllWordsInNewV) {
11527       for (int i = 0; i != 8; ++i) {
11528         int idx = MaskVals[i];
11529         if (idx < 0)
11530           continue;
11531         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11532         if ((idx != i) && idx < 4)
11533           pshufhw = false;
11534         if ((idx != i) && idx > 3)
11535           pshuflw = false;
11536       }
11537       V1 = NewV;
11538       V2Used = false;
11539       BestLoQuad = 0;
11540       BestHiQuad = 1;
11541     }
11542
11543     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11544     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11545     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11546       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11547       unsigned TargetMask = 0;
11548       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11549                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11550       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11551       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11552                              getShufflePSHUFLWImmediate(SVOp);
11553       V1 = NewV.getOperand(0);
11554       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11555     }
11556   }
11557
11558   // Promote splats to a larger type which usually leads to more efficient code.
11559   // FIXME: Is this true if pshufb is available?
11560   if (SVOp->isSplat())
11561     return PromoteSplat(SVOp, DAG);
11562
11563   // If we have SSSE3, and all words of the result are from 1 input vector,
11564   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11565   // is present, fall back to case 4.
11566   if (Subtarget->hasSSSE3()) {
11567     SmallVector<SDValue,16> pshufbMask;
11568
11569     // If we have elements from both input vectors, set the high bit of the
11570     // shuffle mask element to zero out elements that come from V2 in the V1
11571     // mask, and elements that come from V1 in the V2 mask, so that the two
11572     // results can be OR'd together.
11573     bool TwoInputs = V1Used && V2Used;
11574     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11575     if (!TwoInputs)
11576       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11577
11578     // Calculate the shuffle mask for the second input, shuffle it, and
11579     // OR it with the first shuffled input.
11580     CommuteVectorShuffleMask(MaskVals, 8);
11581     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11582     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11583     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11584   }
11585
11586   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11587   // and update MaskVals with new element order.
11588   std::bitset<8> InOrder;
11589   if (BestLoQuad >= 0) {
11590     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11591     for (int i = 0; i != 4; ++i) {
11592       int idx = MaskVals[i];
11593       if (idx < 0) {
11594         InOrder.set(i);
11595       } else if ((idx / 4) == BestLoQuad) {
11596         MaskV[i] = idx & 3;
11597         InOrder.set(i);
11598       }
11599     }
11600     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11601                                 &MaskV[0]);
11602
11603     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11604       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11605       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11606                                   NewV.getOperand(0),
11607                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11608     }
11609   }
11610
11611   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11612   // and update MaskVals with the new element order.
11613   if (BestHiQuad >= 0) {
11614     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11615     for (unsigned i = 4; i != 8; ++i) {
11616       int idx = MaskVals[i];
11617       if (idx < 0) {
11618         InOrder.set(i);
11619       } else if ((idx / 4) == BestHiQuad) {
11620         MaskV[i] = (idx & 3) + 4;
11621         InOrder.set(i);
11622       }
11623     }
11624     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11625                                 &MaskV[0]);
11626
11627     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11628       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11629       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11630                                   NewV.getOperand(0),
11631                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11632     }
11633   }
11634
11635   // In case BestHi & BestLo were both -1, which means each quadword has a word
11636   // from each of the four input quadwords, calculate the InOrder bitvector now
11637   // before falling through to the insert/extract cleanup.
11638   if (BestLoQuad == -1 && BestHiQuad == -1) {
11639     NewV = V1;
11640     for (int i = 0; i != 8; ++i)
11641       if (MaskVals[i] < 0 || MaskVals[i] == i)
11642         InOrder.set(i);
11643   }
11644
11645   // The other elements are put in the right place using pextrw and pinsrw.
11646   for (unsigned i = 0; i != 8; ++i) {
11647     if (InOrder[i])
11648       continue;
11649     int EltIdx = MaskVals[i];
11650     if (EltIdx < 0)
11651       continue;
11652     SDValue ExtOp = (EltIdx < 8) ?
11653       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11654                   DAG.getIntPtrConstant(EltIdx)) :
11655       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11656                   DAG.getIntPtrConstant(EltIdx - 8));
11657     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11658                        DAG.getIntPtrConstant(i));
11659   }
11660   return NewV;
11661 }
11662
11663 /// \brief v16i16 shuffles
11664 ///
11665 /// FIXME: We only support generation of a single pshufb currently.  We can
11666 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11667 /// well (e.g 2 x pshufb + 1 x por).
11668 static SDValue
11669 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11670   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11671   SDValue V1 = SVOp->getOperand(0);
11672   SDValue V2 = SVOp->getOperand(1);
11673   SDLoc dl(SVOp);
11674
11675   if (V2.getOpcode() != ISD::UNDEF)
11676     return SDValue();
11677
11678   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11679   return getPSHUFB(MaskVals, V1, dl, DAG);
11680 }
11681
11682 // v16i8 shuffles - Prefer shuffles in the following order:
11683 // 1. [ssse3] 1 x pshufb
11684 // 2. [ssse3] 2 x pshufb + 1 x por
11685 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11686 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11687                                         const X86Subtarget* Subtarget,
11688                                         SelectionDAG &DAG) {
11689   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11690   SDValue V1 = SVOp->getOperand(0);
11691   SDValue V2 = SVOp->getOperand(1);
11692   SDLoc dl(SVOp);
11693   ArrayRef<int> MaskVals = SVOp->getMask();
11694
11695   // Promote splats to a larger type which usually leads to more efficient code.
11696   // FIXME: Is this true if pshufb is available?
11697   if (SVOp->isSplat())
11698     return PromoteSplat(SVOp, DAG);
11699
11700   // If we have SSSE3, case 1 is generated when all result bytes come from
11701   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11702   // present, fall back to case 3.
11703
11704   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11705   if (Subtarget->hasSSSE3()) {
11706     SmallVector<SDValue,16> pshufbMask;
11707
11708     // If all result elements are from one input vector, then only translate
11709     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11710     //
11711     // Otherwise, we have elements from both input vectors, and must zero out
11712     // elements that come from V2 in the first mask, and V1 in the second mask
11713     // so that we can OR them together.
11714     for (unsigned i = 0; i != 16; ++i) {
11715       int EltIdx = MaskVals[i];
11716       if (EltIdx < 0 || EltIdx >= 16)
11717         EltIdx = 0x80;
11718       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11719     }
11720     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11721                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11722                                  MVT::v16i8, pshufbMask));
11723
11724     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11725     // the 2nd operand if it's undefined or zero.
11726     if (V2.getOpcode() == ISD::UNDEF ||
11727         ISD::isBuildVectorAllZeros(V2.getNode()))
11728       return V1;
11729
11730     // Calculate the shuffle mask for the second input, shuffle it, and
11731     // OR it with the first shuffled input.
11732     pshufbMask.clear();
11733     for (unsigned i = 0; i != 16; ++i) {
11734       int EltIdx = MaskVals[i];
11735       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11736       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11737     }
11738     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11739                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11740                                  MVT::v16i8, pshufbMask));
11741     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11742   }
11743
11744   // No SSSE3 - Calculate in place words and then fix all out of place words
11745   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11746   // the 16 different words that comprise the two doublequadword input vectors.
11747   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11748   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11749   SDValue NewV = V1;
11750   for (int i = 0; i != 8; ++i) {
11751     int Elt0 = MaskVals[i*2];
11752     int Elt1 = MaskVals[i*2+1];
11753
11754     // This word of the result is all undef, skip it.
11755     if (Elt0 < 0 && Elt1 < 0)
11756       continue;
11757
11758     // This word of the result is already in the correct place, skip it.
11759     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11760       continue;
11761
11762     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11763     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11764     SDValue InsElt;
11765
11766     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11767     // using a single extract together, load it and store it.
11768     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11769       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11770                            DAG.getIntPtrConstant(Elt1 / 2));
11771       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11772                         DAG.getIntPtrConstant(i));
11773       continue;
11774     }
11775
11776     // If Elt1 is defined, extract it from the appropriate source.  If the
11777     // source byte is not also odd, shift the extracted word left 8 bits
11778     // otherwise clear the bottom 8 bits if we need to do an or.
11779     if (Elt1 >= 0) {
11780       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11781                            DAG.getIntPtrConstant(Elt1 / 2));
11782       if ((Elt1 & 1) == 0)
11783         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11784                              DAG.getConstant(8,
11785                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11786       else if (Elt0 >= 0)
11787         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11788                              DAG.getConstant(0xFF00, MVT::i16));
11789     }
11790     // If Elt0 is defined, extract it from the appropriate source.  If the
11791     // source byte is not also even, shift the extracted word right 8 bits. If
11792     // Elt1 was also defined, OR the extracted values together before
11793     // inserting them in the result.
11794     if (Elt0 >= 0) {
11795       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11796                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11797       if ((Elt0 & 1) != 0)
11798         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11799                               DAG.getConstant(8,
11800                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11801       else if (Elt1 >= 0)
11802         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11803                              DAG.getConstant(0x00FF, MVT::i16));
11804       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11805                          : InsElt0;
11806     }
11807     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11808                        DAG.getIntPtrConstant(i));
11809   }
11810   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11811 }
11812
11813 // v32i8 shuffles - Translate to VPSHUFB if possible.
11814 static
11815 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11816                                  const X86Subtarget *Subtarget,
11817                                  SelectionDAG &DAG) {
11818   MVT VT = SVOp->getSimpleValueType(0);
11819   SDValue V1 = SVOp->getOperand(0);
11820   SDValue V2 = SVOp->getOperand(1);
11821   SDLoc dl(SVOp);
11822   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11823
11824   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11825   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11826   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11827
11828   // VPSHUFB may be generated if
11829   // (1) one of input vector is undefined or zeroinitializer.
11830   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11831   // And (2) the mask indexes don't cross the 128-bit lane.
11832   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11833       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11834     return SDValue();
11835
11836   if (V1IsAllZero && !V2IsAllZero) {
11837     CommuteVectorShuffleMask(MaskVals, 32);
11838     V1 = V2;
11839   }
11840   return getPSHUFB(MaskVals, V1, dl, DAG);
11841 }
11842
11843 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11844 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11845 /// done when every pair / quad of shuffle mask elements point to elements in
11846 /// the right sequence. e.g.
11847 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11848 static
11849 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11850                                  SelectionDAG &DAG) {
11851   MVT VT = SVOp->getSimpleValueType(0);
11852   SDLoc dl(SVOp);
11853   unsigned NumElems = VT.getVectorNumElements();
11854   MVT NewVT;
11855   unsigned Scale;
11856   switch (VT.SimpleTy) {
11857   default: llvm_unreachable("Unexpected!");
11858   case MVT::v2i64:
11859   case MVT::v2f64:
11860            return SDValue(SVOp, 0);
11861   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11862   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11863   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11864   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11865   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11866   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11867   }
11868
11869   SmallVector<int, 8> MaskVec;
11870   for (unsigned i = 0; i != NumElems; i += Scale) {
11871     int StartIdx = -1;
11872     for (unsigned j = 0; j != Scale; ++j) {
11873       int EltIdx = SVOp->getMaskElt(i+j);
11874       if (EltIdx < 0)
11875         continue;
11876       if (StartIdx < 0)
11877         StartIdx = (EltIdx / Scale);
11878       if (EltIdx != (int)(StartIdx*Scale + j))
11879         return SDValue();
11880     }
11881     MaskVec.push_back(StartIdx);
11882   }
11883
11884   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11885   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11886   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11887 }
11888
11889 /// getVZextMovL - Return a zero-extending vector move low node.
11890 ///
11891 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11892                             SDValue SrcOp, SelectionDAG &DAG,
11893                             const X86Subtarget *Subtarget, SDLoc dl) {
11894   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11895     LoadSDNode *LD = nullptr;
11896     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11897       LD = dyn_cast<LoadSDNode>(SrcOp);
11898     if (!LD) {
11899       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11900       // instead.
11901       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11902       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11903           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11904           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11905           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11906         // PR2108
11907         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11908         return DAG.getNode(ISD::BITCAST, dl, VT,
11909                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11910                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11911                                                    OpVT,
11912                                                    SrcOp.getOperand(0)
11913                                                           .getOperand(0))));
11914       }
11915     }
11916   }
11917
11918   return DAG.getNode(ISD::BITCAST, dl, VT,
11919                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11920                                  DAG.getNode(ISD::BITCAST, dl,
11921                                              OpVT, SrcOp)));
11922 }
11923
11924 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11925 /// which could not be matched by any known target speficic shuffle
11926 static SDValue
11927 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11928
11929   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11930   if (NewOp.getNode())
11931     return NewOp;
11932
11933   MVT VT = SVOp->getSimpleValueType(0);
11934
11935   unsigned NumElems = VT.getVectorNumElements();
11936   unsigned NumLaneElems = NumElems / 2;
11937
11938   SDLoc dl(SVOp);
11939   MVT EltVT = VT.getVectorElementType();
11940   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11941   SDValue Output[2];
11942
11943   SmallVector<int, 16> Mask;
11944   for (unsigned l = 0; l < 2; ++l) {
11945     // Build a shuffle mask for the output, discovering on the fly which
11946     // input vectors to use as shuffle operands (recorded in InputUsed).
11947     // If building a suitable shuffle vector proves too hard, then bail
11948     // out with UseBuildVector set.
11949     bool UseBuildVector = false;
11950     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11951     unsigned LaneStart = l * NumLaneElems;
11952     for (unsigned i = 0; i != NumLaneElems; ++i) {
11953       // The mask element.  This indexes into the input.
11954       int Idx = SVOp->getMaskElt(i+LaneStart);
11955       if (Idx < 0) {
11956         // the mask element does not index into any input vector.
11957         Mask.push_back(-1);
11958         continue;
11959       }
11960
11961       // The input vector this mask element indexes into.
11962       int Input = Idx / NumLaneElems;
11963
11964       // Turn the index into an offset from the start of the input vector.
11965       Idx -= Input * NumLaneElems;
11966
11967       // Find or create a shuffle vector operand to hold this input.
11968       unsigned OpNo;
11969       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11970         if (InputUsed[OpNo] == Input)
11971           // This input vector is already an operand.
11972           break;
11973         if (InputUsed[OpNo] < 0) {
11974           // Create a new operand for this input vector.
11975           InputUsed[OpNo] = Input;
11976           break;
11977         }
11978       }
11979
11980       if (OpNo >= array_lengthof(InputUsed)) {
11981         // More than two input vectors used!  Give up on trying to create a
11982         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11983         UseBuildVector = true;
11984         break;
11985       }
11986
11987       // Add the mask index for the new shuffle vector.
11988       Mask.push_back(Idx + OpNo * NumLaneElems);
11989     }
11990
11991     if (UseBuildVector) {
11992       SmallVector<SDValue, 16> SVOps;
11993       for (unsigned i = 0; i != NumLaneElems; ++i) {
11994         // The mask element.  This indexes into the input.
11995         int Idx = SVOp->getMaskElt(i+LaneStart);
11996         if (Idx < 0) {
11997           SVOps.push_back(DAG.getUNDEF(EltVT));
11998           continue;
11999         }
12000
12001         // The input vector this mask element indexes into.
12002         int Input = Idx / NumElems;
12003
12004         // Turn the index into an offset from the start of the input vector.
12005         Idx -= Input * NumElems;
12006
12007         // Extract the vector element by hand.
12008         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12009                                     SVOp->getOperand(Input),
12010                                     DAG.getIntPtrConstant(Idx)));
12011       }
12012
12013       // Construct the output using a BUILD_VECTOR.
12014       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12015     } else if (InputUsed[0] < 0) {
12016       // No input vectors were used! The result is undefined.
12017       Output[l] = DAG.getUNDEF(NVT);
12018     } else {
12019       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12020                                         (InputUsed[0] % 2) * NumLaneElems,
12021                                         DAG, dl);
12022       // If only one input was used, use an undefined vector for the other.
12023       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12024         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12025                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12026       // At least one input vector was used. Create a new shuffle vector.
12027       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12028     }
12029
12030     Mask.clear();
12031   }
12032
12033   // Concatenate the result back
12034   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12035 }
12036
12037 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12038 /// 4 elements, and match them with several different shuffle types.
12039 static SDValue
12040 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12041   SDValue V1 = SVOp->getOperand(0);
12042   SDValue V2 = SVOp->getOperand(1);
12043   SDLoc dl(SVOp);
12044   MVT VT = SVOp->getSimpleValueType(0);
12045
12046   assert(VT.is128BitVector() && "Unsupported vector size");
12047
12048   std::pair<int, int> Locs[4];
12049   int Mask1[] = { -1, -1, -1, -1 };
12050   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12051
12052   unsigned NumHi = 0;
12053   unsigned NumLo = 0;
12054   for (unsigned i = 0; i != 4; ++i) {
12055     int Idx = PermMask[i];
12056     if (Idx < 0) {
12057       Locs[i] = std::make_pair(-1, -1);
12058     } else {
12059       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12060       if (Idx < 4) {
12061         Locs[i] = std::make_pair(0, NumLo);
12062         Mask1[NumLo] = Idx;
12063         NumLo++;
12064       } else {
12065         Locs[i] = std::make_pair(1, NumHi);
12066         if (2+NumHi < 4)
12067           Mask1[2+NumHi] = Idx;
12068         NumHi++;
12069       }
12070     }
12071   }
12072
12073   if (NumLo <= 2 && NumHi <= 2) {
12074     // If no more than two elements come from either vector. This can be
12075     // implemented with two shuffles. First shuffle gather the elements.
12076     // The second shuffle, which takes the first shuffle as both of its
12077     // vector operands, put the elements into the right order.
12078     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12079
12080     int Mask2[] = { -1, -1, -1, -1 };
12081
12082     for (unsigned i = 0; i != 4; ++i)
12083       if (Locs[i].first != -1) {
12084         unsigned Idx = (i < 2) ? 0 : 4;
12085         Idx += Locs[i].first * 2 + Locs[i].second;
12086         Mask2[i] = Idx;
12087       }
12088
12089     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12090   }
12091
12092   if (NumLo == 3 || NumHi == 3) {
12093     // Otherwise, we must have three elements from one vector, call it X, and
12094     // one element from the other, call it Y.  First, use a shufps to build an
12095     // intermediate vector with the one element from Y and the element from X
12096     // that will be in the same half in the final destination (the indexes don't
12097     // matter). Then, use a shufps to build the final vector, taking the half
12098     // containing the element from Y from the intermediate, and the other half
12099     // from X.
12100     if (NumHi == 3) {
12101       // Normalize it so the 3 elements come from V1.
12102       CommuteVectorShuffleMask(PermMask, 4);
12103       std::swap(V1, V2);
12104     }
12105
12106     // Find the element from V2.
12107     unsigned HiIndex;
12108     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12109       int Val = PermMask[HiIndex];
12110       if (Val < 0)
12111         continue;
12112       if (Val >= 4)
12113         break;
12114     }
12115
12116     Mask1[0] = PermMask[HiIndex];
12117     Mask1[1] = -1;
12118     Mask1[2] = PermMask[HiIndex^1];
12119     Mask1[3] = -1;
12120     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12121
12122     if (HiIndex >= 2) {
12123       Mask1[0] = PermMask[0];
12124       Mask1[1] = PermMask[1];
12125       Mask1[2] = HiIndex & 1 ? 6 : 4;
12126       Mask1[3] = HiIndex & 1 ? 4 : 6;
12127       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12128     }
12129
12130     Mask1[0] = HiIndex & 1 ? 2 : 0;
12131     Mask1[1] = HiIndex & 1 ? 0 : 2;
12132     Mask1[2] = PermMask[2];
12133     Mask1[3] = PermMask[3];
12134     if (Mask1[2] >= 0)
12135       Mask1[2] += 4;
12136     if (Mask1[3] >= 0)
12137       Mask1[3] += 4;
12138     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12139   }
12140
12141   // Break it into (shuffle shuffle_hi, shuffle_lo).
12142   int LoMask[] = { -1, -1, -1, -1 };
12143   int HiMask[] = { -1, -1, -1, -1 };
12144
12145   int *MaskPtr = LoMask;
12146   unsigned MaskIdx = 0;
12147   unsigned LoIdx = 0;
12148   unsigned HiIdx = 2;
12149   for (unsigned i = 0; i != 4; ++i) {
12150     if (i == 2) {
12151       MaskPtr = HiMask;
12152       MaskIdx = 1;
12153       LoIdx = 0;
12154       HiIdx = 2;
12155     }
12156     int Idx = PermMask[i];
12157     if (Idx < 0) {
12158       Locs[i] = std::make_pair(-1, -1);
12159     } else if (Idx < 4) {
12160       Locs[i] = std::make_pair(MaskIdx, LoIdx);
12161       MaskPtr[LoIdx] = Idx;
12162       LoIdx++;
12163     } else {
12164       Locs[i] = std::make_pair(MaskIdx, HiIdx);
12165       MaskPtr[HiIdx] = Idx;
12166       HiIdx++;
12167     }
12168   }
12169
12170   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12171   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
12172   int MaskOps[] = { -1, -1, -1, -1 };
12173   for (unsigned i = 0; i != 4; ++i)
12174     if (Locs[i].first != -1)
12175       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12176   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12177 }
12178
12179 static bool MayFoldVectorLoad(SDValue V) {
12180   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12181     V = V.getOperand(0);
12182
12183   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12184     V = V.getOperand(0);
12185   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12186       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12187     // BUILD_VECTOR (load), undef
12188     V = V.getOperand(0);
12189
12190   return MayFoldLoad(V);
12191 }
12192
12193 static
12194 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12195   MVT VT = Op.getSimpleValueType();
12196
12197   // Canonizalize to v2f64.
12198   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12199   return DAG.getNode(ISD::BITCAST, dl, VT,
12200                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12201                                           V1, DAG));
12202 }
12203
12204 static
12205 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12206                         bool HasSSE2) {
12207   SDValue V1 = Op.getOperand(0);
12208   SDValue V2 = Op.getOperand(1);
12209   MVT VT = Op.getSimpleValueType();
12210
12211   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12212
12213   if (HasSSE2 && VT == MVT::v2f64)
12214     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12215
12216   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
12217   return DAG.getNode(ISD::BITCAST, dl, VT,
12218                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12219                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12220                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12221 }
12222
12223 static
12224 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12225   SDValue V1 = Op.getOperand(0);
12226   SDValue V2 = Op.getOperand(1);
12227   MVT VT = Op.getSimpleValueType();
12228
12229   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12230          "unsupported shuffle type");
12231
12232   if (V2.getOpcode() == ISD::UNDEF)
12233     V2 = V1;
12234
12235   // v4i32 or v4f32
12236   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12237 }
12238
12239 static
12240 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12241   SDValue V1 = Op.getOperand(0);
12242   SDValue V2 = Op.getOperand(1);
12243   MVT VT = Op.getSimpleValueType();
12244   unsigned NumElems = VT.getVectorNumElements();
12245
12246   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12247   // operand of these instructions is only memory, so check if there's a
12248   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12249   // same masks.
12250   bool CanFoldLoad = false;
12251
12252   // Trivial case, when V2 comes from a load.
12253   if (MayFoldVectorLoad(V2))
12254     CanFoldLoad = true;
12255
12256   // When V1 is a load, it can be folded later into a store in isel, example:
12257   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12258   //    turns into:
12259   //  (MOVLPSmr addr:$src1, VR128:$src2)
12260   // So, recognize this potential and also use MOVLPS or MOVLPD
12261   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12262     CanFoldLoad = true;
12263
12264   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12265   if (CanFoldLoad) {
12266     if (HasSSE2 && NumElems == 2)
12267       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12268
12269     if (NumElems == 4)
12270       // If we don't care about the second element, proceed to use movss.
12271       if (SVOp->getMaskElt(1) != -1)
12272         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12273   }
12274
12275   // movl and movlp will both match v2i64, but v2i64 is never matched by
12276   // movl earlier because we make it strict to avoid messing with the movlp load
12277   // folding logic (see the code above getMOVLP call). Match it here then,
12278   // this is horrible, but will stay like this until we move all shuffle
12279   // matching to x86 specific nodes. Note that for the 1st condition all
12280   // types are matched with movsd.
12281   if (HasSSE2) {
12282     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12283     // as to remove this logic from here, as much as possible
12284     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12285       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12286     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12287   }
12288
12289   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12290
12291   // Invert the operand order and use SHUFPS to match it.
12292   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12293                               getShuffleSHUFImmediate(SVOp), DAG);
12294 }
12295
12296 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12297                                          SelectionDAG &DAG) {
12298   SDLoc dl(Load);
12299   MVT VT = Load->getSimpleValueType(0);
12300   MVT EVT = VT.getVectorElementType();
12301   SDValue Addr = Load->getOperand(1);
12302   SDValue NewAddr = DAG.getNode(
12303       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12304       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12305
12306   SDValue NewLoad =
12307       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12308                   DAG.getMachineFunction().getMachineMemOperand(
12309                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12310   return NewLoad;
12311 }
12312
12313 // It is only safe to call this function if isINSERTPSMask is true for
12314 // this shufflevector mask.
12315 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12316                            SelectionDAG &DAG) {
12317   // Generate an insertps instruction when inserting an f32 from memory onto a
12318   // v4f32 or when copying a member from one v4f32 to another.
12319   // We also use it for transferring i32 from one register to another,
12320   // since it simply copies the same bits.
12321   // If we're transferring an i32 from memory to a specific element in a
12322   // register, we output a generic DAG that will match the PINSRD
12323   // instruction.
12324   MVT VT = SVOp->getSimpleValueType(0);
12325   MVT EVT = VT.getVectorElementType();
12326   SDValue V1 = SVOp->getOperand(0);
12327   SDValue V2 = SVOp->getOperand(1);
12328   auto Mask = SVOp->getMask();
12329   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12330          "unsupported vector type for insertps/pinsrd");
12331
12332   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12333   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12334   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12335
12336   SDValue From;
12337   SDValue To;
12338   unsigned DestIndex;
12339   if (FromV1 == 1) {
12340     From = V1;
12341     To = V2;
12342     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12343                 Mask.begin();
12344
12345     // If we have 1 element from each vector, we have to check if we're
12346     // changing V1's element's place. If so, we're done. Otherwise, we
12347     // should assume we're changing V2's element's place and behave
12348     // accordingly.
12349     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12350     assert(DestIndex <= INT32_MAX && "truncated destination index");
12351     if (FromV1 == FromV2 &&
12352         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12353       From = V2;
12354       To = V1;
12355       DestIndex =
12356           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12357     }
12358   } else {
12359     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12360            "More than one element from V1 and from V2, or no elements from one "
12361            "of the vectors. This case should not have returned true from "
12362            "isINSERTPSMask");
12363     From = V2;
12364     To = V1;
12365     DestIndex =
12366         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12367   }
12368
12369   // Get an index into the source vector in the range [0,4) (the mask is
12370   // in the range [0,8) because it can address V1 and V2)
12371   unsigned SrcIndex = Mask[DestIndex] % 4;
12372   if (MayFoldLoad(From)) {
12373     // Trivial case, when From comes from a load and is only used by the
12374     // shuffle. Make it use insertps from the vector that we need from that
12375     // load.
12376     SDValue NewLoad =
12377         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12378     if (!NewLoad.getNode())
12379       return SDValue();
12380
12381     if (EVT == MVT::f32) {
12382       // Create this as a scalar to vector to match the instruction pattern.
12383       SDValue LoadScalarToVector =
12384           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12385       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12386       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12387                          InsertpsMask);
12388     } else { // EVT == MVT::i32
12389       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12390       // instruction, to match the PINSRD instruction, which loads an i32 to a
12391       // certain vector element.
12392       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12393                          DAG.getConstant(DestIndex, MVT::i32));
12394     }
12395   }
12396
12397   // Vector-element-to-vector
12398   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12399   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12400 }
12401
12402 // Reduce a vector shuffle to zext.
12403 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12404                                     SelectionDAG &DAG) {
12405   // PMOVZX is only available from SSE41.
12406   if (!Subtarget->hasSSE41())
12407     return SDValue();
12408
12409   MVT VT = Op.getSimpleValueType();
12410
12411   // Only AVX2 support 256-bit vector integer extending.
12412   if (!Subtarget->hasInt256() && VT.is256BitVector())
12413     return SDValue();
12414
12415   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12416   SDLoc DL(Op);
12417   SDValue V1 = Op.getOperand(0);
12418   SDValue V2 = Op.getOperand(1);
12419   unsigned NumElems = VT.getVectorNumElements();
12420
12421   // Extending is an unary operation and the element type of the source vector
12422   // won't be equal to or larger than i64.
12423   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12424       VT.getVectorElementType() == MVT::i64)
12425     return SDValue();
12426
12427   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12428   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12429   while ((1U << Shift) < NumElems) {
12430     if (SVOp->getMaskElt(1U << Shift) == 1)
12431       break;
12432     Shift += 1;
12433     // The maximal ratio is 8, i.e. from i8 to i64.
12434     if (Shift > 3)
12435       return SDValue();
12436   }
12437
12438   // Check the shuffle mask.
12439   unsigned Mask = (1U << Shift) - 1;
12440   for (unsigned i = 0; i != NumElems; ++i) {
12441     int EltIdx = SVOp->getMaskElt(i);
12442     if ((i & Mask) != 0 && EltIdx != -1)
12443       return SDValue();
12444     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12445       return SDValue();
12446   }
12447
12448   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12449   MVT NeVT = MVT::getIntegerVT(NBits);
12450   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12451
12452   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12453     return SDValue();
12454
12455   return DAG.getNode(ISD::BITCAST, DL, VT,
12456                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12457 }
12458
12459 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12460                                       SelectionDAG &DAG) {
12461   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12462   MVT VT = Op.getSimpleValueType();
12463   SDLoc dl(Op);
12464   SDValue V1 = Op.getOperand(0);
12465   SDValue V2 = Op.getOperand(1);
12466
12467   if (isZeroShuffle(SVOp))
12468     return getZeroVector(VT, Subtarget, DAG, dl);
12469
12470   // Handle splat operations
12471   if (SVOp->isSplat()) {
12472     // Use vbroadcast whenever the splat comes from a foldable load
12473     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12474     if (Broadcast.getNode())
12475       return Broadcast;
12476   }
12477
12478   // Check integer expanding shuffles.
12479   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12480   if (NewOp.getNode())
12481     return NewOp;
12482
12483   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12484   // do it!
12485   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12486       VT == MVT::v32i8) {
12487     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12488     if (NewOp.getNode())
12489       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12490   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12491     // FIXME: Figure out a cleaner way to do this.
12492     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12493       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12494       if (NewOp.getNode()) {
12495         MVT NewVT = NewOp.getSimpleValueType();
12496         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12497                                NewVT, true, false))
12498           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12499                               dl);
12500       }
12501     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12502       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12503       if (NewOp.getNode()) {
12504         MVT NewVT = NewOp.getSimpleValueType();
12505         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12506           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12507                               dl);
12508       }
12509     }
12510   }
12511   return SDValue();
12512 }
12513
/// Custom lowering entry point for a vector shuffle node.
///
/// When ExperimentalVectorShuffleLowering is set the whole job is delegated to
/// lowerVectorShuffle. Otherwise the shuffle mask is tested against a fixed
/// sequence of mask predicates and the first one that matches decides which
/// node is returned. The order of the tests is significant because several
/// predicates accept overlapping masks.
///
/// \param Op  the shuffle node; it is cast to ShuffleVectorSDNode.
/// \param DAG the DAG in which replacement nodes are created.
/// \returns the replacement value, or an empty SDValue when none of the
///          patterns below applies.
12514 SDValue
12515 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12516   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12517   SDValue V1 = Op.getOperand(0);
12518   SDValue V2 = Op.getOperand(1);
12519   MVT VT = Op.getSimpleValueType();
12520   SDLoc dl(Op);
12521   unsigned NumElems = VT.getVectorNumElements();
12522   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12523   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12524   bool V1IsSplat = false;
12525   bool V2IsSplat = false;
12526   bool HasSSE2 = Subtarget->hasSSE2();
12527   bool HasFp256    = Subtarget->hasFp256();
12528   bool HasInt256   = Subtarget->hasInt256();
12529   MachineFunction &MF = DAG.getMachineFunction();
12530   bool OptForSize = MF.getFunction()->getAttributes().
12531     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12532
12533   // Check if we should use the experimental vector shuffle lowering. If so,
12534   // delegate completely to that code path.
12535   if (ExperimentalVectorShuffleLowering)
12536     return lowerVectorShuffle(Op, Subtarget, DAG);
12537
12538   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12539
12540   if (V1IsUndef && V2IsUndef)
12541     return DAG.getUNDEF(VT);
12542
12543   // When we create a shuffle node we put the UNDEF node to second operand,
12544   // but in some cases the first operand may be transformed to UNDEF.
12545   // In this case we should just commute the node.
12546   if (V1IsUndef)
12547     return DAG.getCommutedVectorShuffle(*SVOp);
12548
12549   // Vector shuffle lowering takes 3 steps:
12550   //
12551   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12552   //    narrowing and commutation of operands should be handled.
12553   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12554   //    shuffle nodes.
12555   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12556   //    so the shuffle can be broken into other shuffles and the legalizer can
12557   //    try the lowering again.
12558   //
12559   // The general idea is that no vector_shuffle operation should be left to
12560   // be matched during isel, all of them must be converted to a target specific
12561   // node here.
12562
12563   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12564   // narrowing and commutation of operands should be handled. The actual code
12565   // doesn't include all of those, work in progress...
12566   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12567   if (NewOp.getNode())
12568     return NewOp;
12569
12570   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12571
12572   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12573   // unpckh_undef). Only use pshufd if speed is more important than size.
12574   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12575     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12576   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12577     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12578
12579   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12580       V2IsUndef && MayFoldVectorLoad(V1))
12581     return getMOVDDup(Op, dl, V1, DAG);
12582
12583   if (isMOVHLPS_v_undef_Mask(M, VT))
12584     return getMOVHighToLow(Op, dl, DAG);
12585
12586   // Used to match splats.
12587   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12588       (VT == MVT::v2f64 || VT == MVT::v2i64))
12589     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12590
12591   if (isPSHUFDMask(M, VT)) {
12592     // The actual implementation will match the mask in the if above and then
12593     // during isel it can match several different instructions, not only pshufd
12594     // as its name says, sad but true, emulate the behavior for now...
12595     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12596       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12597
12598     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12599
12600     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12601       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12602
12603     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12604       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12605                                   DAG);
12606
12607     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12608                                 TargetMask, DAG);
12609   }
12610
12611   if (isPALIGNRMask(M, VT, Subtarget))
12612     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12613                                 getShufflePALIGNRImmediate(SVOp),
12614                                 DAG);
12615
12616   if (isVALIGNMask(M, VT, Subtarget))
12617     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12618                                 getShuffleVALIGNImmediate(SVOp),
12619                                 DAG);
12620
12621   // Check if this can be converted into a logical shift.
12622   bool isLeft = false;
12623   unsigned ShAmt = 0;
12624   SDValue ShVal;
12625   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12626   if (isShift && ShVal.hasOneUse()) {
12627     // If the shifted value has multiple uses, it may be cheaper to use
12628     // v_set0 + movlhps or movhlps, etc.
12629     MVT EltVT = VT.getVectorElementType();
12630     ShAmt *= EltVT.getSizeInBits();
12631     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12632   }
12633
12634   if (isMOVLMask(M, VT)) {
12635     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12636       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12637     if (!isMOVLPMask(M, VT)) {
12638       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12639         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12640
12641       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12642         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12643     }
12644   }
12645
12646   // FIXME: fold these into legal mask.
12647   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12648     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12649
12650   if (isMOVHLPSMask(M, VT))
12651     return getMOVHighToLow(Op, dl, DAG);
12652
12653   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12654     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12655
12656   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12657     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12658
12659   if (isMOVLPMask(M, VT))
12660     return getMOVLP(Op, dl, DAG, HasSSE2);
12661
12662   if (ShouldXformToMOVHLPS(M, VT) ||
12663       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12664     return DAG.getCommutedVectorShuffle(*SVOp);
12665
12666   if (isShift) {
12667     // No better options. Use a vshldq / vsrldq.
12668     MVT EltVT = VT.getVectorElementType();
12669     ShAmt *= EltVT.getSizeInBits();
12670     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12671   }
12672
12673   bool Commuted = false;
12674   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12675   // 1,1,1,1 -> v8i16 though.
12676   BitVector UndefElements;
12677   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12678     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12679       V1IsSplat = true;
12680   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12681     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12682       V2IsSplat = true;
12683
12684   // Canonicalize the splat or undef, if present, to be on the RHS.
12685   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12686     CommuteVectorShuffleMask(M, NumElems);
12687     std::swap(V1, V2);
12688     std::swap(V1IsSplat, V2IsSplat);
12689     Commuted = true;
12690   }
12691
12692   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12693     // Shuffling low element of v1 into undef, just return v1.
12694     if (V2IsUndef)
12695       return V1;
12696     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12697     // the instruction selector will not match, so get a canonical MOVL with
12698     // swapped operands to undo the commute.
12699     return getMOVL(DAG, dl, VT, V2, V1);
12700   }
12701
12702   if (isUNPCKLMask(M, VT, HasInt256))
12703     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12704
12705   if (isUNPCKHMask(M, VT, HasInt256))
12706     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12707
12708   if (V2IsSplat) {
12709     // Normalize a copy of the mask so all entries that point to V2 point to
12710     // its first element, then try to match unpck{h|l} again. If it matches,
12711     // emit the corresponding unpck node on the original operands.
12712     SmallVector<int, 8> NewMask(M.begin(), M.end());
12713     NormalizeMask(NewMask, NumElems);
12714     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12715       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12716     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12717       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12718   }
12719
12720   if (Commuted) {
12721     // Commute it back and try unpck* again.
12722     // FIXME: this seems wrong.
12723     CommuteVectorShuffleMask(M, NumElems);
12724     std::swap(V1, V2);
12725     std::swap(V1IsSplat, V2IsSplat);
12726
12727     if (isUNPCKLMask(M, VT, HasInt256))
12728       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12729
12730     if (isUNPCKHMask(M, VT, HasInt256))
12731       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12732   }
12733
12734   // Normalize the node to match x86 shuffle ops if needed
12735   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12736     return DAG.getCommutedVectorShuffle(*SVOp);
12737
12738   // The checks below are all present in isShuffleMaskLegal, but they are
12739   // inlined here right now to enable us to directly emit target specific
12740   // nodes, and remove one by one until they don't return Op anymore.
12741
12742   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12743       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12744     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12745       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12746   }
12747
12748   if (isPSHUFHWMask(M, VT, HasInt256))
12749     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12750                                 getShufflePSHUFHWImmediate(SVOp),
12751                                 DAG);
12752
12753   if (isPSHUFLWMask(M, VT, HasInt256))
12754     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12755                                 getShufflePSHUFLWImmediate(SVOp),
12756                                 DAG);
12757
12758   unsigned MaskValue;
12759   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12760                   &MaskValue))
12761     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12762
12763   if (isSHUFPMask(M, VT))
12764     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12765                                 getShuffleSHUFImmediate(SVOp), DAG);
12766
12767   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12768     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12769   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12770     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12771
12772   //===--------------------------------------------------------------------===//
12773   // Generate target specific nodes for 128 or 256-bit shuffles only
12774   // supported in the AVX instruction set.
12775   //
12776
12777   // Handle VMOVDDUPY permutations
12778   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12779     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12780
12781   // Handle VPERMILPS/D* permutations
12782   if (isVPERMILPMask(M, VT)) {
12783     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12784       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12785                                   getShuffleSHUFImmediate(SVOp), DAG);
12786     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12787                                 getShuffleSHUFImmediate(SVOp), DAG);
12788   }
12789
12790   unsigned Idx;
12791   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12792     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12793                               Idx*(NumElems/2), DAG, dl);
12794
12795   // Handle VPERM2F128/VPERM2I128 permutations
12796   if (isVPERM2X128Mask(M, VT, HasFp256))
12797     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12798                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12799
12800   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12801     return getINSERTPS(SVOp, dl, DAG);
12802
12803   unsigned Imm8;
12804   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12805     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12806
12807   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12808       VT.is512BitVector()) {
12809     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12810     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12811     SmallVector<SDValue, 16> permclMask;
12812     for (unsigned i = 0; i != NumElems; ++i) {
12813       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12814     }
12815
12816     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12817     if (V2IsUndef)
12818       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12819       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12820                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12821     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12822                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12823   }
12824
12825   //===--------------------------------------------------------------------===//
12826   // Since no target specific shuffle was selected for this generic one,
12827   // lower it into other known shuffles. FIXME: this isn't true yet, but
12828   // this is the plan.
12829   //
12830
12831   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12832   if (VT == MVT::v8i16) {
12833     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12834     if (NewOp.getNode())
12835       return NewOp;
12836   }
12837
12838   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12839     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12840     if (NewOp.getNode())
12841       return NewOp;
12842   }
12843
12844   if (VT == MVT::v16i8) {
12845     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12846     if (NewOp.getNode())
12847       return NewOp;
12848   }
12849
12850   if (VT == MVT::v32i8) {
12851     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12852     if (NewOp.getNode())
12853       return NewOp;
12854   }
12855
12856   // Handle all 128-bit wide vectors with 4 elements, and match them with
12857   // several different shuffle types.
12858   if (NumElems == 4 && VT.is128BitVector())
12859     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12860
12861   // Handle general 256-bit shuffles
12862   if (VT.is256BitVector())
12863     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12864
12865   return SDValue();
12866 }
12867
12868 // This function assumes its argument is a BUILD_VECTOR of constants or
12869 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12870 // true.
12871 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12872                                     unsigned &MaskValue) {
12873   MaskValue = 0;
12874   unsigned NumElems = BuildVector->getNumOperands();
12875   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12876   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12877   unsigned NumElemsInLane = NumElems / NumLanes;
12878
12879   // Blend for v16i16 should be symetric for the both lanes.
12880   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12881     SDValue EltCond = BuildVector->getOperand(i);
12882     SDValue SndLaneEltCond =
12883         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12884
12885     int Lane1Cond = -1, Lane2Cond = -1;
12886     if (isa<ConstantSDNode>(EltCond))
12887       Lane1Cond = !isZero(EltCond);
12888     if (isa<ConstantSDNode>(SndLaneEltCond))
12889       Lane2Cond = !isZero(SndLaneEltCond);
12890
12891     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12892       // Lane1Cond != 0, means we want the first argument.
12893       // Lane1Cond == 0, means we want the second argument.
12894       // The encoding of this argument is 0 for the first argument, 1
12895       // for the second. Therefore, invert the condition.
12896       MaskValue |= !Lane1Cond << i;
12897     else if (Lane1Cond < 0)
12898       MaskValue |= !Lane2Cond << i;
12899     else
12900       return false;
12901   }
12902   return true;
12903 }
12904
12905 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12906 /// instruction.
12907 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12908                                     SelectionDAG &DAG) {
12909   SDValue Cond = Op.getOperand(0);
12910   SDValue LHS = Op.getOperand(1);
12911   SDValue RHS = Op.getOperand(2);
12912   SDLoc dl(Op);
12913   MVT VT = Op.getSimpleValueType();
12914   MVT EltVT = VT.getVectorElementType();
12915   unsigned NumElems = VT.getVectorNumElements();
12916
12917   // There is no blend with immediate in AVX-512.
12918   if (VT.is512BitVector())
12919     return SDValue();
12920
12921   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12922     return SDValue();
12923   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12924     return SDValue();
12925
12926   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12927     return SDValue();
12928
12929   // Check the mask for BLEND and build the value.
12930   unsigned MaskValue = 0;
12931   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12932     return SDValue();
12933
12934   // Convert i32 vectors to floating point if it is not AVX2.
12935   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12936   MVT BlendVT = VT;
12937   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12938     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12939                                NumElems);
12940     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12941     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12942   }
12943
12944   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12945                             DAG.getConstant(MaskValue, MVT::i32));
12946   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12947 }
12948
12949 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12950   // A vselect where all conditions and data are constants can be optimized into
12951   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12952   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12953       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12954       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12955     return SDValue();
12956
12957   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12958   if (BlendOp.getNode())
12959     return BlendOp;
12960
12961   // Some types for vselect were previously set to Expand, not Legal or
12962   // Custom. Return an empty SDValue so we fall-through to Expand, after
12963   // the Custom lowering phase.
12964   MVT VT = Op.getSimpleValueType();
12965   switch (VT.SimpleTy) {
12966   default:
12967     break;
12968   case MVT::v8i16:
12969   case MVT::v16i16:
12970     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12971       break;
12972     return SDValue();
12973   }
12974
12975   // We couldn't create a "Blend with immediate" node.
12976   // This node should still be legal, but we'll have to emit a blendv*
12977   // instruction.
12978   return Op;
12979 }
12980
12981 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12982   MVT VT = Op.getSimpleValueType();
12983   SDLoc dl(Op);
12984
12985   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12986     return SDValue();
12987
12988   if (VT.getSizeInBits() == 8) {
12989     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12990                                   Op.getOperand(0), Op.getOperand(1));
12991     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12992                                   DAG.getValueType(VT));
12993     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12994   }
12995
12996   if (VT.getSizeInBits() == 16) {
12997     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12998     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12999     if (Idx == 0)
13000       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13001                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13002                                      DAG.getNode(ISD::BITCAST, dl,
13003                                                  MVT::v4i32,
13004                                                  Op.getOperand(0)),
13005                                      Op.getOperand(1)));
13006     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13007                                   Op.getOperand(0), Op.getOperand(1));
13008     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13009                                   DAG.getValueType(VT));
13010     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13011   }
13012
13013   if (VT == MVT::f32) {
13014     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13015     // the result back to FR32 register. It's only worth matching if the
13016     // result has a single use which is a store or a bitcast to i32.  And in
13017     // the case of a store, it's not worth it if the index is a constant 0,
13018     // because a MOVSSmr can be used instead, which is smaller and faster.
13019     if (!Op.hasOneUse())
13020       return SDValue();
13021     SDNode *User = *Op.getNode()->use_begin();
13022     if ((User->getOpcode() != ISD::STORE ||
13023          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13024           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13025         (User->getOpcode() != ISD::BITCAST ||
13026          User->getValueType(0) != MVT::i32))
13027       return SDValue();
13028     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13029                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13030                                               Op.getOperand(0)),
13031                                               Op.getOperand(1));
13032     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13033   }
13034
13035   if (VT == MVT::i32 || VT == MVT::i64) {
13036     // ExtractPS/pextrq works with constant index.
13037     if (isa<ConstantSDNode>(Op.getOperand(1)))
13038       return Op;
13039   }
13040   return SDValue();
13041 }
13042
13043 /// Extract one bit from mask vector, like v16i1 or v8i1.
13044 /// AVX-512 feature.
13045 SDValue
13046 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13047   SDValue Vec = Op.getOperand(0);
13048   SDLoc dl(Vec);
13049   MVT VecVT = Vec.getSimpleValueType();
13050   SDValue Idx = Op.getOperand(1);
13051   MVT EltVT = Op.getSimpleValueType();
13052
13053   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13054   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13055          "Unexpected vector type in ExtractBitFromMaskVector");
13056
13057   // variable index can't be handled in mask registers,
13058   // extend vector to VR512
13059   if (!isa<ConstantSDNode>(Idx)) {
13060     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13061     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13062     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13063                               ExtVT.getVectorElementType(), Ext, Idx);
13064     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13065   }
13066
13067   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13068   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13069   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13070     rc = getRegClassFor(MVT::v16i1);
13071   unsigned MaxSift = rc->getSize()*8 - 1;
13072   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13073                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13074   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13075                     DAG.getConstant(MaxSift, MVT::i8));
13076   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13077                        DAG.getIntPtrConstant(0));
13078 }
13079
13080 SDValue
13081 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13082                                            SelectionDAG &DAG) const {
13083   SDLoc dl(Op);
13084   SDValue Vec = Op.getOperand(0);
13085   MVT VecVT = Vec.getSimpleValueType();
13086   SDValue Idx = Op.getOperand(1);
13087
13088   if (Op.getSimpleValueType() == MVT::i1)
13089     return ExtractBitFromMaskVector(Op, DAG);
13090
13091   if (!isa<ConstantSDNode>(Idx)) {
13092     if (VecVT.is512BitVector() ||
13093         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13094          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13095
13096       MVT MaskEltVT =
13097         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13098       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13099                                     MaskEltVT.getSizeInBits());
13100
13101       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13102       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13103                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13104                                 Idx, DAG.getConstant(0, getPointerTy()));
13105       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13106       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13107                         Perm, DAG.getConstant(0, getPointerTy()));
13108     }
13109     return SDValue();
13110   }
13111
13112   // If this is a 256-bit vector result, first extract the 128-bit vector and
13113   // then extract the element from the 128-bit vector.
13114   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13115
13116     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13117     // Get the 128-bit vector.
13118     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13119     MVT EltVT = VecVT.getVectorElementType();
13120
13121     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13122
13123     //if (IdxVal >= NumElems/2)
13124     //  IdxVal -= NumElems/2;
13125     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13126     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13127                        DAG.getConstant(IdxVal, MVT::i32));
13128   }
13129
13130   assert(VecVT.is128BitVector() && "Unexpected vector length");
13131
13132   if (Subtarget->hasSSE41()) {
13133     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13134     if (Res.getNode())
13135       return Res;
13136   }
13137
13138   MVT VT = Op.getSimpleValueType();
13139   // TODO: handle v16i8.
13140   if (VT.getSizeInBits() == 16) {
13141     SDValue Vec = Op.getOperand(0);
13142     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13143     if (Idx == 0)
13144       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13145                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13146                                      DAG.getNode(ISD::BITCAST, dl,
13147                                                  MVT::v4i32, Vec),
13148                                      Op.getOperand(1)));
13149     // Transform it so it match pextrw which produces a 32-bit result.
13150     MVT EltVT = MVT::i32;
13151     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13152                                   Op.getOperand(0), Op.getOperand(1));
13153     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13154                                   DAG.getValueType(VT));
13155     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13156   }
13157
13158   if (VT.getSizeInBits() == 32) {
13159     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13160     if (Idx == 0)
13161       return Op;
13162
13163     // SHUFPS the element to the lowest double word, then movss.
13164     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13165     MVT VVT = Op.getOperand(0).getSimpleValueType();
13166     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13167                                        DAG.getUNDEF(VVT), Mask);
13168     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13169                        DAG.getIntPtrConstant(0));
13170   }
13171
13172   if (VT.getSizeInBits() == 64) {
13173     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13174     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13175     //        to match extract_elt for f64.
13176     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13177     if (Idx == 0)
13178       return Op;
13179
13180     // UNPCKHPD the element to the lowest double word, then movsd.
13181     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13182     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13183     int Mask[2] = { 1, -1 };
13184     MVT VVT = Op.getOperand(0).getSimpleValueType();
13185     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13186                                        DAG.getUNDEF(VVT), Mask);
13187     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13188                        DAG.getIntPtrConstant(0));
13189   }
13190
13191   return SDValue();
13192 }
13193
13194 /// Insert one bit to mask vector, like v16i1 or v8i1.
13195 /// AVX-512 feature.
13196 SDValue
13197 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13198   SDLoc dl(Op);
13199   SDValue Vec = Op.getOperand(0);
13200   SDValue Elt = Op.getOperand(1);
13201   SDValue Idx = Op.getOperand(2);
13202   MVT VecVT = Vec.getSimpleValueType();
13203
13204   if (!isa<ConstantSDNode>(Idx)) {
13205     // Non constant index. Extend source and destination,
13206     // insert element and then truncate the result.
13207     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13208     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13209     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13210       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13211       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13212     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13213   }
13214
13215   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13216   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13217   if (Vec.getOpcode() == ISD::UNDEF)
13218     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13219                        DAG.getConstant(IdxVal, MVT::i8));
13220   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13221   unsigned MaxSift = rc->getSize()*8 - 1;
13222   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13223                     DAG.getConstant(MaxSift, MVT::i8));
13224   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13225                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13226   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13227 }
13228
13229 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13230                                                   SelectionDAG &DAG) const {
13231   MVT VT = Op.getSimpleValueType();
13232   MVT EltVT = VT.getVectorElementType();
13233
13234   if (EltVT == MVT::i1)
13235     return InsertBitToMaskVector(Op, DAG);
13236
13237   SDLoc dl(Op);
13238   SDValue N0 = Op.getOperand(0);
13239   SDValue N1 = Op.getOperand(1);
13240   SDValue N2 = Op.getOperand(2);
13241   if (!isa<ConstantSDNode>(N2))
13242     return SDValue();
13243   auto *N2C = cast<ConstantSDNode>(N2);
13244   unsigned IdxVal = N2C->getZExtValue();
13245
13246   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13247   // into that, and then insert the subvector back into the result.
13248   if (VT.is256BitVector() || VT.is512BitVector()) {
13249     // Get the desired 128-bit vector half.
13250     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13251
13252     // Insert the element into the desired half.
13253     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13254     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13255
13256     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13257                     DAG.getConstant(IdxIn128, MVT::i32));
13258
13259     // Insert the changed part back to the 256-bit vector
13260     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13261   }
13262   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13263
13264   if (Subtarget->hasSSE41()) {
13265     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13266       unsigned Opc;
13267       if (VT == MVT::v8i16) {
13268         Opc = X86ISD::PINSRW;
13269       } else {
13270         assert(VT == MVT::v16i8);
13271         Opc = X86ISD::PINSRB;
13272       }
13273
13274       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13275       // argument.
13276       if (N1.getValueType() != MVT::i32)
13277         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13278       if (N2.getValueType() != MVT::i32)
13279         N2 = DAG.getIntPtrConstant(IdxVal);
13280       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13281     }
13282
13283     if (EltVT == MVT::f32) {
13284       // Bits [7:6] of the constant are the source select.  This will always be
13285       //  zero here.  The DAG Combiner may combine an extract_elt index into
13286       //  these
13287       //  bits.  For example (insert (extract, 3), 2) could be matched by
13288       //  putting
13289       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13290       // Bits [5:4] of the constant are the destination select.  This is the
13291       //  value of the incoming immediate.
13292       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13293       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13294       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13295       // Create this as a scalar to vector..
13296       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13297       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13298     }
13299
13300     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13301       // PINSR* works with constant index.
13302       return Op;
13303     }
13304   }
13305
13306   if (EltVT == MVT::i8)
13307     return SDValue();
13308
13309   if (EltVT.getSizeInBits() == 16) {
13310     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13311     // as its second argument.
13312     if (N1.getValueType() != MVT::i32)
13313       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13314     if (N2.getValueType() != MVT::i32)
13315       N2 = DAG.getIntPtrConstant(IdxVal);
13316     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13317   }
13318   return SDValue();
13319 }
13320
13321 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13322   SDLoc dl(Op);
13323   MVT OpVT = Op.getSimpleValueType();
13324
13325   // If this is a 256-bit vector result, first insert into a 128-bit
13326   // vector and then insert into the 256-bit vector.
13327   if (!OpVT.is128BitVector()) {
13328     // Insert into a 128-bit vector.
13329     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13330     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13331                                  OpVT.getVectorNumElements() / SizeFactor);
13332
13333     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13334
13335     // Insert the 128-bit vector.
13336     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13337   }
13338
13339   if (OpVT == MVT::v1i64 &&
13340       Op.getOperand(0).getValueType() == MVT::i64)
13341     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13342
13343   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13344   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13345   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13346                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13347 }
13348
13349 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13350 // a simple subregister reference or explicit instructions to grab
13351 // upper bits of a vector.
13352 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13353                                       SelectionDAG &DAG) {
13354   SDLoc dl(Op);
13355   SDValue In =  Op.getOperand(0);
13356   SDValue Idx = Op.getOperand(1);
13357   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13358   MVT ResVT   = Op.getSimpleValueType();
13359   MVT InVT    = In.getSimpleValueType();
13360
13361   if (Subtarget->hasFp256()) {
13362     if (ResVT.is128BitVector() &&
13363         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13364         isa<ConstantSDNode>(Idx)) {
13365       return Extract128BitVector(In, IdxVal, DAG, dl);
13366     }
13367     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13368         isa<ConstantSDNode>(Idx)) {
13369       return Extract256BitVector(In, IdxVal, DAG, dl);
13370     }
13371   }
13372   return SDValue();
13373 }
13374
13375 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13376 // simple superregister reference or explicit instructions to insert
13377 // the upper bits of a vector.
13378 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13379                                      SelectionDAG &DAG) {
13380   if (!Subtarget->hasAVX())
13381     return SDValue();
13382
13383   SDLoc dl(Op);
13384   SDValue Vec = Op.getOperand(0);
13385   SDValue SubVec = Op.getOperand(1);
13386   SDValue Idx = Op.getOperand(2);
13387
13388   if (!isa<ConstantSDNode>(Idx))
13389     return SDValue();
13390
13391   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13392   MVT OpVT = Op.getSimpleValueType();
13393   MVT SubVecVT = SubVec.getSimpleValueType();
13394
13395   // Fold two 16-byte subvector loads into one 32-byte load:
13396   // (insert_subvector (insert_subvector undef, (load addr), 0),
13397   //                   (load addr + 16), Elts/2)
13398   // --> load32 addr
13399   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13400       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13401       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13402       !Subtarget->isUnalignedMem32Slow()) {
13403     SDValue SubVec2 = Vec.getOperand(1);
13404     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13405       if (Idx2->getZExtValue() == 0) {
13406         SDValue Ops[] = { SubVec2, SubVec };
13407         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13408         if (LD.getNode())
13409           return LD;
13410       }
13411     }
13412   }
13413
13414   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13415       SubVecVT.is128BitVector())
13416     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13417
13418   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13419     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13420
13421   return SDValue();
13422 }
13423
13424 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13425 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13426 // one of the above mentioned nodes. It has to be wrapped because otherwise
13427 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13428 // be used to form addressing mode. These wrapped nodes will be selected
13429 // into MOV32ri.
13430 SDValue
13431 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13432   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13433
13434   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13435   // global base reg.
13436   unsigned char OpFlag = 0;
13437   unsigned WrapperKind = X86ISD::Wrapper;
13438   CodeModel::Model M = DAG.getTarget().getCodeModel();
13439
13440   if (Subtarget->isPICStyleRIPRel() &&
13441       (M == CodeModel::Small || M == CodeModel::Kernel))
13442     WrapperKind = X86ISD::WrapperRIP;
13443   else if (Subtarget->isPICStyleGOT())
13444     OpFlag = X86II::MO_GOTOFF;
13445   else if (Subtarget->isPICStyleStubPIC())
13446     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13447
13448   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13449                                              CP->getAlignment(),
13450                                              CP->getOffset(), OpFlag);
13451   SDLoc DL(CP);
13452   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13453   // With PIC, the address is actually $g + Offset.
13454   if (OpFlag) {
13455     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13456                          DAG.getNode(X86ISD::GlobalBaseReg,
13457                                      SDLoc(), getPointerTy()),
13458                          Result);
13459   }
13460
13461   return Result;
13462 }
13463
13464 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13465   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13466
13467   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13468   // global base reg.
13469   unsigned char OpFlag = 0;
13470   unsigned WrapperKind = X86ISD::Wrapper;
13471   CodeModel::Model M = DAG.getTarget().getCodeModel();
13472
13473   if (Subtarget->isPICStyleRIPRel() &&
13474       (M == CodeModel::Small || M == CodeModel::Kernel))
13475     WrapperKind = X86ISD::WrapperRIP;
13476   else if (Subtarget->isPICStyleGOT())
13477     OpFlag = X86II::MO_GOTOFF;
13478   else if (Subtarget->isPICStyleStubPIC())
13479     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13480
13481   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13482                                           OpFlag);
13483   SDLoc DL(JT);
13484   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13485
13486   // With PIC, the address is actually $g + Offset.
13487   if (OpFlag)
13488     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13489                          DAG.getNode(X86ISD::GlobalBaseReg,
13490                                      SDLoc(), getPointerTy()),
13491                          Result);
13492
13493   return Result;
13494 }
13495
13496 SDValue
13497 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13498   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13499
13500   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13501   // global base reg.
13502   unsigned char OpFlag = 0;
13503   unsigned WrapperKind = X86ISD::Wrapper;
13504   CodeModel::Model M = DAG.getTarget().getCodeModel();
13505
13506   if (Subtarget->isPICStyleRIPRel() &&
13507       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13508     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13509       OpFlag = X86II::MO_GOTPCREL;
13510     WrapperKind = X86ISD::WrapperRIP;
13511   } else if (Subtarget->isPICStyleGOT()) {
13512     OpFlag = X86II::MO_GOT;
13513   } else if (Subtarget->isPICStyleStubPIC()) {
13514     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13515   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13516     OpFlag = X86II::MO_DARWIN_NONLAZY;
13517   }
13518
13519   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13520
13521   SDLoc DL(Op);
13522   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13523
13524   // With PIC, the address is actually $g + Offset.
13525   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13526       !Subtarget->is64Bit()) {
13527     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13528                          DAG.getNode(X86ISD::GlobalBaseReg,
13529                                      SDLoc(), getPointerTy()),
13530                          Result);
13531   }
13532
13533   // For symbols that require a load from a stub to get the address, emit the
13534   // load.
13535   if (isGlobalStubReference(OpFlag))
13536     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13537                          MachinePointerInfo::getGOT(), false, false, false, 0);
13538
13539   return Result;
13540 }
13541
13542 SDValue
13543 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13544   // Create the TargetBlockAddressAddress node.
13545   unsigned char OpFlags =
13546     Subtarget->ClassifyBlockAddressReference();
13547   CodeModel::Model M = DAG.getTarget().getCodeModel();
13548   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13549   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13550   SDLoc dl(Op);
13551   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13552                                              OpFlags);
13553
13554   if (Subtarget->isPICStyleRIPRel() &&
13555       (M == CodeModel::Small || M == CodeModel::Kernel))
13556     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13557   else
13558     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13559
13560   // With PIC, the address is actually $g + Offset.
13561   if (isGlobalRelativeToPICBase(OpFlags)) {
13562     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13563                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13564                          Result);
13565   }
13566
13567   return Result;
13568 }
13569
13570 SDValue
13571 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13572                                       int64_t Offset, SelectionDAG &DAG) const {
13573   // Create the TargetGlobalAddress node, folding in the constant
13574   // offset if it is legal.
13575   unsigned char OpFlags =
13576       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13577   CodeModel::Model M = DAG.getTarget().getCodeModel();
13578   SDValue Result;
13579   if (OpFlags == X86II::MO_NO_FLAG &&
13580       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13581     // A direct static reference to a global.
13582     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13583     Offset = 0;
13584   } else {
13585     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13586   }
13587
13588   if (Subtarget->isPICStyleRIPRel() &&
13589       (M == CodeModel::Small || M == CodeModel::Kernel))
13590     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13591   else
13592     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13593
13594   // With PIC, the address is actually $g + Offset.
13595   if (isGlobalRelativeToPICBase(OpFlags)) {
13596     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13597                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13598                          Result);
13599   }
13600
13601   // For globals that require a load from a stub to get the address, emit the
13602   // load.
13603   if (isGlobalStubReference(OpFlags))
13604     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13605                          MachinePointerInfo::getGOT(), false, false, false, 0);
13606
13607   // If there was a non-zero offset that we didn't fold, create an explicit
13608   // addition for it.
13609   if (Offset != 0)
13610     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13611                          DAG.getConstant(Offset, getPointerTy()));
13612
13613   return Result;
13614 }
13615
13616 SDValue
13617 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13618   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13619   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13620   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13621 }
13622
13623 static SDValue
13624 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13625            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13626            unsigned char OperandFlags, bool LocalDynamic = false) {
13627   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13628   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13629   SDLoc dl(GA);
13630   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13631                                            GA->getValueType(0),
13632                                            GA->getOffset(),
13633                                            OperandFlags);
13634
13635   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13636                                            : X86ISD::TLSADDR;
13637
13638   if (InFlag) {
13639     SDValue Ops[] = { Chain,  TGA, *InFlag };
13640     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13641   } else {
13642     SDValue Ops[]  = { Chain, TGA };
13643     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13644   }
13645
13646   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13647   MFI->setAdjustsStack(true);
13648   MFI->setHasCalls(true);
13649
13650   SDValue Flag = Chain.getValue(1);
13651   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13652 }
13653
13654 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13655 static SDValue
13656 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13657                                 const EVT PtrVT) {
13658   SDValue InFlag;
13659   SDLoc dl(GA);  // ? function entry point might be better
13660   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13661                                    DAG.getNode(X86ISD::GlobalBaseReg,
13662                                                SDLoc(), PtrVT), InFlag);
13663   InFlag = Chain.getValue(1);
13664
13665   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13666 }
13667
13668 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13669 static SDValue
13670 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13671                                 const EVT PtrVT) {
13672   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13673                     X86::RAX, X86II::MO_TLSGD);
13674 }
13675
13676 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13677                                            SelectionDAG &DAG,
13678                                            const EVT PtrVT,
13679                                            bool is64Bit) {
13680   SDLoc dl(GA);
13681
13682   // Get the start address of the TLS block for this module.
13683   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13684       .getInfo<X86MachineFunctionInfo>();
13685   MFI->incNumLocalDynamicTLSAccesses();
13686
13687   SDValue Base;
13688   if (is64Bit) {
13689     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13690                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13691   } else {
13692     SDValue InFlag;
13693     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13694         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13695     InFlag = Chain.getValue(1);
13696     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13697                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13698   }
13699
13700   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13701   // of Base.
13702
13703   // Build x@dtpoff.
13704   unsigned char OperandFlags = X86II::MO_DTPOFF;
13705   unsigned WrapperKind = X86ISD::Wrapper;
13706   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13707                                            GA->getValueType(0),
13708                                            GA->getOffset(), OperandFlags);
13709   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13710
13711   // Add x@dtpoff with the base.
13712   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13713 }
13714
13715 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13716 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13717                                    const EVT PtrVT, TLSModel::Model model,
13718                                    bool is64Bit, bool isPIC) {
13719   SDLoc dl(GA);
13720
13721   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13722   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13723                                                          is64Bit ? 257 : 256));
13724
13725   SDValue ThreadPointer =
13726       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13727                   MachinePointerInfo(Ptr), false, false, false, 0);
13728
13729   unsigned char OperandFlags = 0;
13730   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13731   // initialexec.
13732   unsigned WrapperKind = X86ISD::Wrapper;
13733   if (model == TLSModel::LocalExec) {
13734     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13735   } else if (model == TLSModel::InitialExec) {
13736     if (is64Bit) {
13737       OperandFlags = X86II::MO_GOTTPOFF;
13738       WrapperKind = X86ISD::WrapperRIP;
13739     } else {
13740       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13741     }
13742   } else {
13743     llvm_unreachable("Unexpected model");
13744   }
13745
13746   // emit "addl x@ntpoff,%eax" (local exec)
13747   // or "addl x@indntpoff,%eax" (initial exec)
13748   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13749   SDValue TGA =
13750       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13751                                  GA->getOffset(), OperandFlags);
13752   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13753
13754   if (model == TLSModel::InitialExec) {
13755     if (isPIC && !is64Bit) {
13756       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13757                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13758                            Offset);
13759     }
13760
13761     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13762                          MachinePointerInfo::getGOT(), false, false, false, 0);
13763   }
13764
13765   // The address of the thread local variable is the add of the thread
13766   // pointer with the offset of the variable.
13767   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13768 }
13769
13770 SDValue
13771 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13772
13773   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13774   const GlobalValue *GV = GA->getGlobal();
13775
13776   if (Subtarget->isTargetELF()) {
13777     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13778
13779     switch (model) {
13780       case TLSModel::GeneralDynamic:
13781         if (Subtarget->is64Bit())
13782           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13783         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13784       case TLSModel::LocalDynamic:
13785         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13786                                            Subtarget->is64Bit());
13787       case TLSModel::InitialExec:
13788       case TLSModel::LocalExec:
13789         return LowerToTLSExecModel(
13790             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13791             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13792     }
13793     llvm_unreachable("Unknown TLS model.");
13794   }
13795
13796   if (Subtarget->isTargetDarwin()) {
13797     // Darwin only has one model of TLS.  Lower to that.
13798     unsigned char OpFlag = 0;
13799     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13800                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13801
13802     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13803     // global base reg.
13804     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13805                  !Subtarget->is64Bit();
13806     if (PIC32)
13807       OpFlag = X86II::MO_TLVP_PIC_BASE;
13808     else
13809       OpFlag = X86II::MO_TLVP;
13810     SDLoc DL(Op);
13811     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13812                                                 GA->getValueType(0),
13813                                                 GA->getOffset(), OpFlag);
13814     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13815
13816     // With PIC32, the address is actually $g + Offset.
13817     if (PIC32)
13818       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13819                            DAG.getNode(X86ISD::GlobalBaseReg,
13820                                        SDLoc(), getPointerTy()),
13821                            Offset);
13822
13823     // Lowering the machine isd will make sure everything is in the right
13824     // location.
13825     SDValue Chain = DAG.getEntryNode();
13826     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13827     SDValue Args[] = { Chain, Offset };
13828     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13829
13830     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13831     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13832     MFI->setAdjustsStack(true);
13833
13834     // And our return value (tls address) is in the standard call return value
13835     // location.
13836     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13837     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13838                               Chain.getValue(1));
13839   }
13840
13841   if (Subtarget->isTargetKnownWindowsMSVC() ||
13842       Subtarget->isTargetWindowsGNU()) {
13843     // Just use the implicit TLS architecture
13844     // Need to generate someting similar to:
13845     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13846     //                                  ; from TEB
13847     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13848     //   mov     rcx, qword [rdx+rcx*8]
13849     //   mov     eax, .tls$:tlsvar
13850     //   [rax+rcx] contains the address
13851     // Windows 64bit: gs:0x58
13852     // Windows 32bit: fs:__tls_array
13853
13854     SDLoc dl(GA);
13855     SDValue Chain = DAG.getEntryNode();
13856
13857     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13858     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13859     // use its literal value of 0x2C.
13860     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13861                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13862                                                              256)
13863                                         : Type::getInt32PtrTy(*DAG.getContext(),
13864                                                               257));
13865
13866     SDValue TlsArray =
13867         Subtarget->is64Bit()
13868             ? DAG.getIntPtrConstant(0x58)
13869             : (Subtarget->isTargetWindowsGNU()
13870                    ? DAG.getIntPtrConstant(0x2C)
13871                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13872
13873     SDValue ThreadPointer =
13874         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13875                     MachinePointerInfo(Ptr), false, false, false, 0);
13876
13877     // Load the _tls_index variable
13878     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13879     if (Subtarget->is64Bit())
13880       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13881                            IDX, MachinePointerInfo(), MVT::i32,
13882                            false, false, false, 0);
13883     else
13884       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13885                         false, false, false, 0);
13886
13887     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13888                                     getPointerTy());
13889     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13890
13891     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13892     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13893                       false, false, false, 0);
13894
13895     // Get the offset of start of .tls section
13896     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13897                                              GA->getValueType(0),
13898                                              GA->getOffset(), X86II::MO_SECREL);
13899     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13900
13901     // The address of the thread local variable is the add of the thread
13902     // pointer with the offset of the variable.
13903     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13904   }
13905
13906   llvm_unreachable("TLS not implemented for this target.");
13907 }
13908
13909 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13910 /// and take a 2 x i32 value to shift plus a shift amount.
13911 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13912   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13913   MVT VT = Op.getSimpleValueType();
13914   unsigned VTBits = VT.getSizeInBits();
13915   SDLoc dl(Op);
13916   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13917   SDValue ShOpLo = Op.getOperand(0);
13918   SDValue ShOpHi = Op.getOperand(1);
13919   SDValue ShAmt  = Op.getOperand(2);
13920   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13921   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13922   // during isel.
13923   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13924                                   DAG.getConstant(VTBits - 1, MVT::i8));
13925   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13926                                      DAG.getConstant(VTBits - 1, MVT::i8))
13927                        : DAG.getConstant(0, VT);
13928
13929   SDValue Tmp2, Tmp3;
13930   if (Op.getOpcode() == ISD::SHL_PARTS) {
13931     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13932     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13933   } else {
13934     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13935     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13936   }
13937
13938   // If the shift amount is larger or equal than the width of a part we can't
13939   // rely on the results of shld/shrd. Insert a test and select the appropriate
13940   // values for large shift amounts.
13941   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13942                                 DAG.getConstant(VTBits, MVT::i8));
13943   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13944                              AndNode, DAG.getConstant(0, MVT::i8));
13945
13946   SDValue Hi, Lo;
13947   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13948   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13949   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13950
13951   if (Op.getOpcode() == ISD::SHL_PARTS) {
13952     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13953     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13954   } else {
13955     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13956     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13957   }
13958
13959   SDValue Ops[2] = { Lo, Hi };
13960   return DAG.getMergeValues(Ops, dl);
13961 }
13962
13963 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13964                                            SelectionDAG &DAG) const {
13965   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13966   SDLoc dl(Op);
13967
13968   if (SrcVT.isVector()) {
13969     if (SrcVT.getVectorElementType() == MVT::i1) {
13970       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13971       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13972                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13973                                      Op.getOperand(0)));
13974     }
13975     return SDValue();
13976   }
13977
13978   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13979          "Unknown SINT_TO_FP to lower!");
13980
13981   // These are really Legal; return the operand so the caller accepts it as
13982   // Legal.
13983   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13984     return Op;
13985   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13986       Subtarget->is64Bit()) {
13987     return Op;
13988   }
13989
13990   unsigned Size = SrcVT.getSizeInBits()/8;
13991   MachineFunction &MF = DAG.getMachineFunction();
13992   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13993   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13994   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13995                                StackSlot,
13996                                MachinePointerInfo::getFixedStack(SSFI),
13997                                false, false, 0);
13998   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13999 }
14000
14001 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14002                                      SDValue StackSlot,
14003                                      SelectionDAG &DAG) const {
14004   // Build the FILD
14005   SDLoc DL(Op);
14006   SDVTList Tys;
14007   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14008   if (useSSE)
14009     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14010   else
14011     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14012
14013   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14014
14015   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14016   MachineMemOperand *MMO;
14017   if (FI) {
14018     int SSFI = FI->getIndex();
14019     MMO =
14020       DAG.getMachineFunction()
14021       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14022                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14023   } else {
14024     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14025     StackSlot = StackSlot.getOperand(1);
14026   }
14027   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14028   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14029                                            X86ISD::FILD, DL,
14030                                            Tys, Ops, SrcVT, MMO);
14031
14032   if (useSSE) {
14033     Chain = Result.getValue(1);
14034     SDValue InFlag = Result.getValue(2);
14035
14036     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14037     // shouldn't be necessary except that RFP cannot be live across
14038     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14039     MachineFunction &MF = DAG.getMachineFunction();
14040     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14041     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14042     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14043     Tys = DAG.getVTList(MVT::Other);
14044     SDValue Ops[] = {
14045       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14046     };
14047     MachineMemOperand *MMO =
14048       DAG.getMachineFunction()
14049       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14050                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14051
14052     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14053                                     Ops, Op.getValueType(), MMO);
14054     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14055                          MachinePointerInfo::getFixedStack(SSFI),
14056                          false, false, false, 0);
14057   }
14058
14059   return Result;
14060 }
14061
14062 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14063 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14064                                                SelectionDAG &DAG) const {
14065   // This algorithm is not obvious. Here it is what we're trying to output:
14066   /*
14067      movq       %rax,  %xmm0
14068      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14069      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14070      #ifdef __SSE3__
14071        haddpd   %xmm0, %xmm0
14072      #else
14073        pshufd   $0x4e, %xmm0, %xmm1
14074        addpd    %xmm1, %xmm0
14075      #endif
14076   */
14077
14078   SDLoc dl(Op);
14079   LLVMContext *Context = DAG.getContext();
14080
14081   // Build some magic constants.
14082   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14083   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14084   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14085
14086   SmallVector<Constant*,2> CV1;
14087   CV1.push_back(
14088     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14089                                       APInt(64, 0x4330000000000000ULL))));
14090   CV1.push_back(
14091     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14092                                       APInt(64, 0x4530000000000000ULL))));
14093   Constant *C1 = ConstantVector::get(CV1);
14094   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14095
14096   // Load the 64-bit value into an XMM register.
14097   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14098                             Op.getOperand(0));
14099   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14100                               MachinePointerInfo::getConstantPool(),
14101                               false, false, false, 16);
14102   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14103                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14104                               CLod0);
14105
14106   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14107                               MachinePointerInfo::getConstantPool(),
14108                               false, false, false, 16);
14109   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14110   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14111   SDValue Result;
14112
14113   if (Subtarget->hasSSE3()) {
14114     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14115     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14116   } else {
14117     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14118     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14119                                            S2F, 0x4E, DAG);
14120     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14121                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14122                          Sub);
14123   }
14124
14125   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14126                      DAG.getIntPtrConstant(0));
14127 }
14128
14129 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14130 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14131                                                SelectionDAG &DAG) const {
14132   SDLoc dl(Op);
14133   // FP constant to bias correct the final result.
14134   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14135                                    MVT::f64);
14136
14137   // Load the 32-bit value into an XMM register.
14138   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14139                              Op.getOperand(0));
14140
14141   // Zero out the upper parts of the register.
14142   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14143
14144   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14145                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14146                      DAG.getIntPtrConstant(0));
14147
14148   // Or the load with the bias.
14149   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14150                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14151                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14152                                                    MVT::v2f64, Load)),
14153                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14154                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14155                                                    MVT::v2f64, Bias)));
14156   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14157                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14158                    DAG.getIntPtrConstant(0));
14159
14160   // Subtract the bias.
14161   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14162
14163   // Handle final rounding.
14164   EVT DestVT = Op.getValueType();
14165
14166   if (DestVT.bitsLT(MVT::f64))
14167     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14168                        DAG.getIntPtrConstant(0));
14169   if (DestVT.bitsGT(MVT::f64))
14170     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14171
14172   // Handle final rounding.
14173   return Sub;
14174 }
14175
14176 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14177                                      const X86Subtarget &Subtarget) {
14178   // The algorithm is the following:
14179   // #ifdef __SSE4_1__
14180   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14181   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14182   //                                 (uint4) 0x53000000, 0xaa);
14183   // #else
14184   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14185   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14186   // #endif
14187   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14188   //     return (float4) lo + fhi;
14189
14190   SDLoc DL(Op);
14191   SDValue V = Op->getOperand(0);
14192   EVT VecIntVT = V.getValueType();
14193   bool Is128 = VecIntVT == MVT::v4i32;
14194   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14195   // If we convert to something else than the supported type, e.g., to v4f64,
14196   // abort early.
14197   if (VecFloatVT != Op->getValueType(0))
14198     return SDValue();
14199
14200   unsigned NumElts = VecIntVT.getVectorNumElements();
14201   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14202          "Unsupported custom type");
14203   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14204
14205   // In the #idef/#else code, we have in common:
14206   // - The vector of constants:
14207   // -- 0x4b000000
14208   // -- 0x53000000
14209   // - A shift:
14210   // -- v >> 16
14211
14212   // Create the splat vector for 0x4b000000.
14213   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14214   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14215                            CstLow, CstLow, CstLow, CstLow};
14216   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14217                                   makeArrayRef(&CstLowArray[0], NumElts));
14218   // Create the splat vector for 0x53000000.
14219   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14220   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14221                             CstHigh, CstHigh, CstHigh, CstHigh};
14222   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14223                                    makeArrayRef(&CstHighArray[0], NumElts));
14224
14225   // Create the right shift.
14226   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14227   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14228                              CstShift, CstShift, CstShift, CstShift};
14229   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14230                                     makeArrayRef(&CstShiftArray[0], NumElts));
14231   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14232
14233   SDValue Low, High;
14234   if (Subtarget.hasSSE41()) {
14235     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14236     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14237     SDValue VecCstLowBitcast =
14238         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14239     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14240     // Low will be bitcasted right away, so do not bother bitcasting back to its
14241     // original type.
14242     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14243                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14244     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14245     //                                 (uint4) 0x53000000, 0xaa);
14246     SDValue VecCstHighBitcast =
14247         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14248     SDValue VecShiftBitcast =
14249         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14250     // High will be bitcasted right away, so do not bother bitcasting back to
14251     // its original type.
14252     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14253                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14254   } else {
14255     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14256     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14257                                      CstMask, CstMask, CstMask);
14258     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14259     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14260     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14261
14262     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14263     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14264   }
14265
14266   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14267   SDValue CstFAdd = DAG.getConstantFP(
14268       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14269   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14270                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14271   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14272                                    makeArrayRef(&CstFAddArray[0], NumElts));
14273
14274   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14275   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14276   SDValue FHigh =
14277       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14278   //     return (float4) lo + fhi;
14279   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14280   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14281 }
14282
14283 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14284                                                SelectionDAG &DAG) const {
14285   SDValue N0 = Op.getOperand(0);
14286   MVT SVT = N0.getSimpleValueType();
14287   SDLoc dl(Op);
14288
14289   switch (SVT.SimpleTy) {
14290   default:
14291     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14292   case MVT::v4i8:
14293   case MVT::v4i16:
14294   case MVT::v8i8:
14295   case MVT::v8i16: {
14296     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14297     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14298                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14299   }
14300   case MVT::v4i32:
14301   case MVT::v8i32:
14302     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14303   }
14304   llvm_unreachable(nullptr);
14305 }
14306
14307 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14308                                            SelectionDAG &DAG) const {
14309   SDValue N0 = Op.getOperand(0);
14310   SDLoc dl(Op);
14311
14312   if (Op.getValueType().isVector())
14313     return lowerUINT_TO_FP_vec(Op, DAG);
14314
14315   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14316   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14317   // the optimization here.
14318   if (DAG.SignBitIsZero(N0))
14319     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14320
14321   MVT SrcVT = N0.getSimpleValueType();
14322   MVT DstVT = Op.getSimpleValueType();
14323   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14324     return LowerUINT_TO_FP_i64(Op, DAG);
14325   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14326     return LowerUINT_TO_FP_i32(Op, DAG);
14327   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14328     return SDValue();
14329
14330   // Make a 64-bit buffer, and use it to build an FILD.
14331   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14332   if (SrcVT == MVT::i32) {
14333     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14334     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14335                                      getPointerTy(), StackSlot, WordOff);
14336     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14337                                   StackSlot, MachinePointerInfo(),
14338                                   false, false, 0);
14339     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14340                                   OffsetSlot, MachinePointerInfo(),
14341                                   false, false, 0);
14342     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14343     return Fild;
14344   }
14345
14346   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14347   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14348                                StackSlot, MachinePointerInfo(),
14349                                false, false, 0);
14350   // For i64 source, we need to add the appropriate power of 2 if the input
14351   // was negative.  This is the same as the optimization in
14352   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14353   // we must be careful to do the computation in x87 extended precision, not
14354   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14355   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14356   MachineMemOperand *MMO =
14357     DAG.getMachineFunction()
14358     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14359                           MachineMemOperand::MOLoad, 8, 8);
14360
14361   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14362   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14363   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14364                                          MVT::i64, MMO);
14365
14366   APInt FF(32, 0x5F800000ULL);
14367
14368   // Check whether the sign bit is set.
14369   SDValue SignSet = DAG.getSetCC(dl,
14370                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14371                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14372                                  ISD::SETLT);
14373
14374   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14375   SDValue FudgePtr = DAG.getConstantPool(
14376                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14377                                          getPointerTy());
14378
14379   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14380   SDValue Zero = DAG.getIntPtrConstant(0);
14381   SDValue Four = DAG.getIntPtrConstant(4);
14382   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14383                                Zero, Four);
14384   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14385
14386   // Load the value out, extending it from f32 to f80.
14387   // FIXME: Avoid the extend by constructing the right constant pool?
14388   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14389                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14390                                  MVT::f32, false, false, false, 4);
14391   // Extend everything to 80 bits to force it to be done on x87.
14392   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14393   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14394 }
14395
14396 std::pair<SDValue,SDValue>
14397 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14398                                     bool IsSigned, bool IsReplace) const {
14399   SDLoc DL(Op);
14400
14401   EVT DstTy = Op.getValueType();
14402
14403   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14404     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14405     DstTy = MVT::i64;
14406   }
14407
14408   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14409          DstTy.getSimpleVT() >= MVT::i16 &&
14410          "Unknown FP_TO_INT to lower!");
14411
14412   // These are really Legal.
14413   if (DstTy == MVT::i32 &&
14414       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14415     return std::make_pair(SDValue(), SDValue());
14416   if (Subtarget->is64Bit() &&
14417       DstTy == MVT::i64 &&
14418       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14419     return std::make_pair(SDValue(), SDValue());
14420
14421   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14422   // stack slot, or into the FTOL runtime function.
14423   MachineFunction &MF = DAG.getMachineFunction();
14424   unsigned MemSize = DstTy.getSizeInBits()/8;
14425   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14426   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14427
14428   unsigned Opc;
14429   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14430     Opc = X86ISD::WIN_FTOL;
14431   else
14432     switch (DstTy.getSimpleVT().SimpleTy) {
14433     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14434     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14435     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14436     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14437     }
14438
14439   SDValue Chain = DAG.getEntryNode();
14440   SDValue Value = Op.getOperand(0);
14441   EVT TheVT = Op.getOperand(0).getValueType();
14442   // FIXME This causes a redundant load/store if the SSE-class value is already
14443   // in memory, such as if it is on the callstack.
14444   if (isScalarFPTypeInSSEReg(TheVT)) {
14445     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14446     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14447                          MachinePointerInfo::getFixedStack(SSFI),
14448                          false, false, 0);
14449     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14450     SDValue Ops[] = {
14451       Chain, StackSlot, DAG.getValueType(TheVT)
14452     };
14453
14454     MachineMemOperand *MMO =
14455       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14456                               MachineMemOperand::MOLoad, MemSize, MemSize);
14457     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14458     Chain = Value.getValue(1);
14459     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14460     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14461   }
14462
14463   MachineMemOperand *MMO =
14464     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14465                             MachineMemOperand::MOStore, MemSize, MemSize);
14466
14467   if (Opc != X86ISD::WIN_FTOL) {
14468     // Build the FP_TO_INT*_IN_MEM
14469     SDValue Ops[] = { Chain, Value, StackSlot };
14470     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14471                                            Ops, DstTy, MMO);
14472     return std::make_pair(FIST, StackSlot);
14473   } else {
14474     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14475       DAG.getVTList(MVT::Other, MVT::Glue),
14476       Chain, Value);
14477     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14478       MVT::i32, ftol.getValue(1));
14479     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14480       MVT::i32, eax.getValue(2));
14481     SDValue Ops[] = { eax, edx };
14482     SDValue pair = IsReplace
14483       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14484       : DAG.getMergeValues(Ops, DL);
14485     return std::make_pair(pair, SDValue());
14486   }
14487 }
14488
14489 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14490                               const X86Subtarget *Subtarget) {
14491   MVT VT = Op->getSimpleValueType(0);
14492   SDValue In = Op->getOperand(0);
14493   MVT InVT = In.getSimpleValueType();
14494   SDLoc dl(Op);
14495
14496   // Optimize vectors in AVX mode:
14497   //
14498   //   v8i16 -> v8i32
14499   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14500   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14501   //   Concat upper and lower parts.
14502   //
14503   //   v4i32 -> v4i64
14504   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14505   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14506   //   Concat upper and lower parts.
14507   //
14508
14509   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14510       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14511       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14512     return SDValue();
14513
14514   if (Subtarget->hasInt256())
14515     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14516
14517   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14518   SDValue Undef = DAG.getUNDEF(InVT);
14519   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14520   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14521   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14522
14523   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14524                              VT.getVectorNumElements()/2);
14525
14526   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14527   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14528
14529   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14530 }
14531
14532 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14533                                         SelectionDAG &DAG) {
14534   MVT VT = Op->getSimpleValueType(0);
14535   SDValue In = Op->getOperand(0);
14536   MVT InVT = In.getSimpleValueType();
14537   SDLoc DL(Op);
14538   unsigned int NumElts = VT.getVectorNumElements();
14539   if (NumElts != 8 && NumElts != 16)
14540     return SDValue();
14541
14542   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14543     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14544
14545   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14546   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14547   // Now we have only mask extension
14548   assert(InVT.getVectorElementType() == MVT::i1);
14549   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14550   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14551   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14552   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14553   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14554                            MachinePointerInfo::getConstantPool(),
14555                            false, false, false, Alignment);
14556
14557   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14558   if (VT.is512BitVector())
14559     return Brcst;
14560   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14561 }
14562
14563 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14564                                SelectionDAG &DAG) {
14565   if (Subtarget->hasFp256()) {
14566     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14567     if (Res.getNode())
14568       return Res;
14569   }
14570
14571   return SDValue();
14572 }
14573
14574 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14575                                 SelectionDAG &DAG) {
14576   SDLoc DL(Op);
14577   MVT VT = Op.getSimpleValueType();
14578   SDValue In = Op.getOperand(0);
14579   MVT SVT = In.getSimpleValueType();
14580
14581   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14582     return LowerZERO_EXTEND_AVX512(Op, DAG);
14583
14584   if (Subtarget->hasFp256()) {
14585     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14586     if (Res.getNode())
14587       return Res;
14588   }
14589
14590   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14591          VT.getVectorNumElements() != SVT.getVectorNumElements());
14592   return SDValue();
14593 }
14594
14595 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14596   SDLoc DL(Op);
14597   MVT VT = Op.getSimpleValueType();
14598   SDValue In = Op.getOperand(0);
14599   MVT InVT = In.getSimpleValueType();
14600
14601   if (VT == MVT::i1) {
14602     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14603            "Invalid scalar TRUNCATE operation");
14604     if (InVT.getSizeInBits() >= 32)
14605       return SDValue();
14606     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14607     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14608   }
14609   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14610          "Invalid TRUNCATE operation");
14611
14612   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14613     if (VT.getVectorElementType().getSizeInBits() >=8)
14614       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14615
14616     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14617     unsigned NumElts = InVT.getVectorNumElements();
14618     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14619     if (InVT.getSizeInBits() < 512) {
14620       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14621       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14622       InVT = ExtVT;
14623     }
14624
14625     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14626     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14627     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14628     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14629     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14630                            MachinePointerInfo::getConstantPool(),
14631                            false, false, false, Alignment);
14632     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14633     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14634     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14635   }
14636
14637   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14638     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14639     if (Subtarget->hasInt256()) {
14640       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14641       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14642       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14643                                 ShufMask);
14644       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14645                          DAG.getIntPtrConstant(0));
14646     }
14647
14648     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14649                                DAG.getIntPtrConstant(0));
14650     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14651                                DAG.getIntPtrConstant(2));
14652     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14653     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14654     static const int ShufMask[] = {0, 2, 4, 6};
14655     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14656   }
14657
14658   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14659     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14660     if (Subtarget->hasInt256()) {
14661       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14662
14663       SmallVector<SDValue,32> pshufbMask;
14664       for (unsigned i = 0; i < 2; ++i) {
14665         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14666         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14667         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14668         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14669         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14673         for (unsigned j = 0; j < 8; ++j)
14674           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14675       }
14676       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14677       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14678       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14679
14680       static const int ShufMask[] = {0,  2,  -1,  -1};
14681       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14682                                 &ShufMask[0]);
14683       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14684                        DAG.getIntPtrConstant(0));
14685       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14686     }
14687
14688     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14689                                DAG.getIntPtrConstant(0));
14690
14691     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14692                                DAG.getIntPtrConstant(4));
14693
14694     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14695     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14696
14697     // The PSHUFB mask:
14698     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14699                                    -1, -1, -1, -1, -1, -1, -1, -1};
14700
14701     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14702     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14703     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14704
14705     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14706     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14707
14708     // The MOVLHPS Mask:
14709     static const int ShufMask2[] = {0, 1, 4, 5};
14710     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14711     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14712   }
14713
14714   // Handle truncation of V256 to V128 using shuffles.
14715   if (!VT.is128BitVector() || !InVT.is256BitVector())
14716     return SDValue();
14717
14718   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14719
14720   unsigned NumElems = VT.getVectorNumElements();
14721   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14722
14723   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14724   // Prepare truncation shuffle mask
14725   for (unsigned i = 0; i != NumElems; ++i)
14726     MaskVec[i] = i * 2;
14727   SDValue V = DAG.getVectorShuffle(NVT, DL,
14728                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14729                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14730   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14731                      DAG.getIntPtrConstant(0));
14732 }
14733
14734 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14735                                            SelectionDAG &DAG) const {
14736   assert(!Op.getSimpleValueType().isVector());
14737
14738   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14739     /*IsSigned=*/ true, /*IsReplace=*/ false);
14740   SDValue FIST = Vals.first, StackSlot = Vals.second;
14741   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14742   if (!FIST.getNode()) return Op;
14743
14744   if (StackSlot.getNode())
14745     // Load the result.
14746     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14747                        FIST, StackSlot, MachinePointerInfo(),
14748                        false, false, false, 0);
14749
14750   // The node is the result.
14751   return FIST;
14752 }
14753
14754 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14755                                            SelectionDAG &DAG) const {
14756   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14757     /*IsSigned=*/ false, /*IsReplace=*/ false);
14758   SDValue FIST = Vals.first, StackSlot = Vals.second;
14759   assert(FIST.getNode() && "Unexpected failure");
14760
14761   if (StackSlot.getNode())
14762     // Load the result.
14763     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14764                        FIST, StackSlot, MachinePointerInfo(),
14765                        false, false, false, 0);
14766
14767   // The node is the result.
14768   return FIST;
14769 }
14770
14771 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14772   SDLoc DL(Op);
14773   MVT VT = Op.getSimpleValueType();
14774   SDValue In = Op.getOperand(0);
14775   MVT SVT = In.getSimpleValueType();
14776
14777   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14778
14779   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14780                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14781                                  In, DAG.getUNDEF(SVT)));
14782 }
14783
14784 /// The only differences between FABS and FNEG are the mask and the logic op.
14785 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14786 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14787   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14788          "Wrong opcode for lowering FABS or FNEG.");
14789
14790   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14791
14792   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14793   // into an FNABS. We'll lower the FABS after that if it is still in use.
14794   if (IsFABS)
14795     for (SDNode *User : Op->uses())
14796       if (User->getOpcode() == ISD::FNEG)
14797         return Op;
14798
14799   SDValue Op0 = Op.getOperand(0);
14800   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14801
14802   SDLoc dl(Op);
14803   MVT VT = Op.getSimpleValueType();
14804   // Assume scalar op for initialization; update for vector if needed.
14805   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14806   // generate a 16-byte vector constant and logic op even for the scalar case.
14807   // Using a 16-byte mask allows folding the load of the mask with
14808   // the logic op, so it can save (~4 bytes) on code size.
14809   MVT EltVT = VT;
14810   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14811   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14812   // decide if we should generate a 16-byte constant mask when we only need 4 or
14813   // 8 bytes for the scalar case.
14814   if (VT.isVector()) {
14815     EltVT = VT.getVectorElementType();
14816     NumElts = VT.getVectorNumElements();
14817   }
14818
14819   unsigned EltBits = EltVT.getSizeInBits();
14820   LLVMContext *Context = DAG.getContext();
14821   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14822   APInt MaskElt =
14823     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14824   Constant *C = ConstantInt::get(*Context, MaskElt);
14825   C = ConstantVector::getSplat(NumElts, C);
14826   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14827   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14828   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14829   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14830                              MachinePointerInfo::getConstantPool(),
14831                              false, false, false, Alignment);
14832
14833   if (VT.isVector()) {
14834     // For a vector, cast operands to a vector type, perform the logic op,
14835     // and cast the result back to the original value type.
14836     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14837     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14838     SDValue Operand = IsFNABS ?
14839       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14840       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14841     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14842     return DAG.getNode(ISD::BITCAST, dl, VT,
14843                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14844   }
14845
14846   // If not vector, then scalar.
14847   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14848   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14849   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14850 }
14851
14852 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14853   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14854   LLVMContext *Context = DAG.getContext();
14855   SDValue Op0 = Op.getOperand(0);
14856   SDValue Op1 = Op.getOperand(1);
14857   SDLoc dl(Op);
14858   MVT VT = Op.getSimpleValueType();
14859   MVT SrcVT = Op1.getSimpleValueType();
14860
14861   // If second operand is smaller, extend it first.
14862   if (SrcVT.bitsLT(VT)) {
14863     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14864     SrcVT = VT;
14865   }
14866   // And if it is bigger, shrink it first.
14867   if (SrcVT.bitsGT(VT)) {
14868     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14869     SrcVT = VT;
14870   }
14871
14872   // At this point the operands and the result should have the same
14873   // type, and that won't be f80 since that is not custom lowered.
14874
14875   const fltSemantics &Sem =
14876       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14877   const unsigned SizeInBits = VT.getSizeInBits();
14878
14879   SmallVector<Constant *, 4> CV(
14880       VT == MVT::f64 ? 2 : 4,
14881       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14882
14883   // First, clear all bits but the sign bit from the second operand (sign).
14884   CV[0] = ConstantFP::get(*Context,
14885                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14886   Constant *C = ConstantVector::get(CV);
14887   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14888   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14889                               MachinePointerInfo::getConstantPool(),
14890                               false, false, false, 16);
14891   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14892
14893   // Next, clear the sign bit from the first operand (magnitude).
14894   // If it's a constant, we can clear it here.
14895   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14896     APFloat APF = Op0CN->getValueAPF();
14897     // If the magnitude is a positive zero, the sign bit alone is enough.
14898     if (APF.isPosZero())
14899       return SignBit;
14900     APF.clearSign();
14901     CV[0] = ConstantFP::get(*Context, APF);
14902   } else {
14903     CV[0] = ConstantFP::get(
14904         *Context,
14905         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14906   }
14907   C = ConstantVector::get(CV);
14908   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14909   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14910                             MachinePointerInfo::getConstantPool(),
14911                             false, false, false, 16);
14912   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14913   if (!isa<ConstantFPSDNode>(Op0))
14914     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14915
14916   // OR the magnitude value with the sign bit.
14917   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14918 }
14919
14920 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14921   SDValue N0 = Op.getOperand(0);
14922   SDLoc dl(Op);
14923   MVT VT = Op.getSimpleValueType();
14924
14925   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14926   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14927                                   DAG.getConstant(1, VT));
14928   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14929 }
14930
14931 // Check whether an OR'd tree is PTEST-able.
14932 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14933                                       SelectionDAG &DAG) {
14934   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14935
14936   if (!Subtarget->hasSSE41())
14937     return SDValue();
14938
14939   if (!Op->hasOneUse())
14940     return SDValue();
14941
14942   SDNode *N = Op.getNode();
14943   SDLoc DL(N);
14944
14945   SmallVector<SDValue, 8> Opnds;
14946   DenseMap<SDValue, unsigned> VecInMap;
14947   SmallVector<SDValue, 8> VecIns;
14948   EVT VT = MVT::Other;
14949
14950   // Recognize a special case where a vector is casted into wide integer to
14951   // test all 0s.
14952   Opnds.push_back(N->getOperand(0));
14953   Opnds.push_back(N->getOperand(1));
14954
14955   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14956     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14957     // BFS traverse all OR'd operands.
14958     if (I->getOpcode() == ISD::OR) {
14959       Opnds.push_back(I->getOperand(0));
14960       Opnds.push_back(I->getOperand(1));
14961       // Re-evaluate the number of nodes to be traversed.
14962       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14963       continue;
14964     }
14965
14966     // Quit if a non-EXTRACT_VECTOR_ELT
14967     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14968       return SDValue();
14969
14970     // Quit if without a constant index.
14971     SDValue Idx = I->getOperand(1);
14972     if (!isa<ConstantSDNode>(Idx))
14973       return SDValue();
14974
14975     SDValue ExtractedFromVec = I->getOperand(0);
14976     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14977     if (M == VecInMap.end()) {
14978       VT = ExtractedFromVec.getValueType();
14979       // Quit if not 128/256-bit vector.
14980       if (!VT.is128BitVector() && !VT.is256BitVector())
14981         return SDValue();
14982       // Quit if not the same type.
14983       if (VecInMap.begin() != VecInMap.end() &&
14984           VT != VecInMap.begin()->first.getValueType())
14985         return SDValue();
14986       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14987       VecIns.push_back(ExtractedFromVec);
14988     }
14989     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14990   }
14991
14992   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14993          "Not extracted from 128-/256-bit vector.");
14994
14995   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14996
14997   for (DenseMap<SDValue, unsigned>::const_iterator
14998         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14999     // Quit if not all elements are used.
15000     if (I->second != FullMask)
15001       return SDValue();
15002   }
15003
15004   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15005
15006   // Cast all vectors into TestVT for PTEST.
15007   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15008     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15009
15010   // If more than one full vectors are evaluated, OR them first before PTEST.
15011   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15012     // Each iteration will OR 2 nodes and append the result until there is only
15013     // 1 node left, i.e. the final OR'd value of all vectors.
15014     SDValue LHS = VecIns[Slot];
15015     SDValue RHS = VecIns[Slot + 1];
15016     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15017   }
15018
15019   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15020                      VecIns.back(), VecIns.back());
15021 }
15022
15023 /// \brief return true if \c Op has a use that doesn't just read flags.
15024 static bool hasNonFlagsUse(SDValue Op) {
15025   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15026        ++UI) {
15027     SDNode *User = *UI;
15028     unsigned UOpNo = UI.getOperandNo();
15029     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15030       // Look pass truncate.
15031       UOpNo = User->use_begin().getOperandNo();
15032       User = *User->use_begin();
15033     }
15034
15035     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15036         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15037       return true;
15038   }
15039   return false;
15040 }
15041
15042 /// Emit nodes that will be selected as "test Op0,Op0", or something
15043 /// equivalent.
15044 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15045                                     SelectionDAG &DAG) const {
15046   if (Op.getValueType() == MVT::i1)
15047     // KORTEST instruction should be selected
15048     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15049                        DAG.getConstant(0, Op.getValueType()));
15050
15051   // CF and OF aren't always set the way we want. Determine which
15052   // of these we need.
15053   bool NeedCF = false;
15054   bool NeedOF = false;
15055   switch (X86CC) {
15056   default: break;
15057   case X86::COND_A: case X86::COND_AE:
15058   case X86::COND_B: case X86::COND_BE:
15059     NeedCF = true;
15060     break;
15061   case X86::COND_G: case X86::COND_GE:
15062   case X86::COND_L: case X86::COND_LE:
15063   case X86::COND_O: case X86::COND_NO: {
15064     // Check if we really need to set the
15065     // Overflow flag. If NoSignedWrap is present
15066     // that is not actually needed.
15067     switch (Op->getOpcode()) {
15068     case ISD::ADD:
15069     case ISD::SUB:
15070     case ISD::MUL:
15071     case ISD::SHL: {
15072       const BinaryWithFlagsSDNode *BinNode =
15073           cast<BinaryWithFlagsSDNode>(Op.getNode());
15074       if (BinNode->hasNoSignedWrap())
15075         break;
15076     }
15077     default:
15078       NeedOF = true;
15079       break;
15080     }
15081     break;
15082   }
15083   }
15084   // See if we can use the EFLAGS value from the operand instead of
15085   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15086   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15087   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15088     // Emit a CMP with 0, which is the TEST pattern.
15089     //if (Op.getValueType() == MVT::i1)
15090     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15091     //                     DAG.getConstant(0, MVT::i1));
15092     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15093                        DAG.getConstant(0, Op.getValueType()));
15094   }
15095   unsigned Opcode = 0;
15096   unsigned NumOperands = 0;
15097
15098   // Truncate operations may prevent the merge of the SETCC instruction
15099   // and the arithmetic instruction before it. Attempt to truncate the operands
15100   // of the arithmetic instruction and use a reduced bit-width instruction.
15101   bool NeedTruncation = false;
15102   SDValue ArithOp = Op;
15103   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15104     SDValue Arith = Op->getOperand(0);
15105     // Both the trunc and the arithmetic op need to have one user each.
15106     if (Arith->hasOneUse())
15107       switch (Arith.getOpcode()) {
15108         default: break;
15109         case ISD::ADD:
15110         case ISD::SUB:
15111         case ISD::AND:
15112         case ISD::OR:
15113         case ISD::XOR: {
15114           NeedTruncation = true;
15115           ArithOp = Arith;
15116         }
15117       }
15118   }
15119
15120   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15121   // which may be the result of a CAST.  We use the variable 'Op', which is the
15122   // non-casted variable when we check for possible users.
15123   switch (ArithOp.getOpcode()) {
15124   case ISD::ADD:
15125     // Due to an isel shortcoming, be conservative if this add is likely to be
15126     // selected as part of a load-modify-store instruction. When the root node
15127     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15128     // uses of other nodes in the match, such as the ADD in this case. This
15129     // leads to the ADD being left around and reselected, with the result being
15130     // two adds in the output.  Alas, even if none our users are stores, that
15131     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15132     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15133     // climbing the DAG back to the root, and it doesn't seem to be worth the
15134     // effort.
15135     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15136          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15137       if (UI->getOpcode() != ISD::CopyToReg &&
15138           UI->getOpcode() != ISD::SETCC &&
15139           UI->getOpcode() != ISD::STORE)
15140         goto default_case;
15141
15142     if (ConstantSDNode *C =
15143         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15144       // An add of one will be selected as an INC.
15145       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15146         Opcode = X86ISD::INC;
15147         NumOperands = 1;
15148         break;
15149       }
15150
15151       // An add of negative one (subtract of one) will be selected as a DEC.
15152       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15153         Opcode = X86ISD::DEC;
15154         NumOperands = 1;
15155         break;
15156       }
15157     }
15158
15159     // Otherwise use a regular EFLAGS-setting add.
15160     Opcode = X86ISD::ADD;
15161     NumOperands = 2;
15162     break;
15163   case ISD::SHL:
15164   case ISD::SRL:
15165     // If we have a constant logical shift that's only used in a comparison
15166     // against zero turn it into an equivalent AND. This allows turning it into
15167     // a TEST instruction later.
15168     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15169         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15170       EVT VT = Op.getValueType();
15171       unsigned BitWidth = VT.getSizeInBits();
15172       unsigned ShAmt = Op->getConstantOperandVal(1);
15173       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15174         break;
15175       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15176                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15177                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15178       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15179         break;
15180       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15181                                 DAG.getConstant(Mask, VT));
15182       DAG.ReplaceAllUsesWith(Op, New);
15183       Op = New;
15184     }
15185     break;
15186
15187   case ISD::AND:
15188     // If the primary and result isn't used, don't bother using X86ISD::AND,
15189     // because a TEST instruction will be better.
15190     if (!hasNonFlagsUse(Op))
15191       break;
15192     // FALL THROUGH
15193   case ISD::SUB:
15194   case ISD::OR:
15195   case ISD::XOR:
15196     // Due to the ISEL shortcoming noted above, be conservative if this op is
15197     // likely to be selected as part of a load-modify-store instruction.
15198     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15199            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15200       if (UI->getOpcode() == ISD::STORE)
15201         goto default_case;
15202
15203     // Otherwise use a regular EFLAGS-setting instruction.
15204     switch (ArithOp.getOpcode()) {
15205     default: llvm_unreachable("unexpected operator!");
15206     case ISD::SUB: Opcode = X86ISD::SUB; break;
15207     case ISD::XOR: Opcode = X86ISD::XOR; break;
15208     case ISD::AND: Opcode = X86ISD::AND; break;
15209     case ISD::OR: {
15210       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15211         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15212         if (EFLAGS.getNode())
15213           return EFLAGS;
15214       }
15215       Opcode = X86ISD::OR;
15216       break;
15217     }
15218     }
15219
15220     NumOperands = 2;
15221     break;
15222   case X86ISD::ADD:
15223   case X86ISD::SUB:
15224   case X86ISD::INC:
15225   case X86ISD::DEC:
15226   case X86ISD::OR:
15227   case X86ISD::XOR:
15228   case X86ISD::AND:
15229     return SDValue(Op.getNode(), 1);
15230   default:
15231   default_case:
15232     break;
15233   }
15234
15235   // If we found that truncation is beneficial, perform the truncation and
15236   // update 'Op'.
15237   if (NeedTruncation) {
15238     EVT VT = Op.getValueType();
15239     SDValue WideVal = Op->getOperand(0);
15240     EVT WideVT = WideVal.getValueType();
15241     unsigned ConvertedOp = 0;
15242     // Use a target machine opcode to prevent further DAGCombine
15243     // optimizations that may separate the arithmetic operations
15244     // from the setcc node.
15245     switch (WideVal.getOpcode()) {
15246       default: break;
15247       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15248       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15249       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15250       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15251       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15252     }
15253
15254     if (ConvertedOp) {
15255       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15256       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15257         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15258         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15259         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15260       }
15261     }
15262   }
15263
15264   if (Opcode == 0)
15265     // Emit a CMP with 0, which is the TEST pattern.
15266     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15267                        DAG.getConstant(0, Op.getValueType()));
15268
15269   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15270   SmallVector<SDValue, 4> Ops;
15271   for (unsigned i = 0; i != NumOperands; ++i)
15272     Ops.push_back(Op.getOperand(i));
15273
15274   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15275   DAG.ReplaceAllUsesWith(Op, New);
15276   return SDValue(New.getNode(), 1);
15277 }
15278
15279 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15280 /// equivalent.
15281 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15282                                    SDLoc dl, SelectionDAG &DAG) const {
15283   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15284     if (C->getAPIntValue() == 0)
15285       return EmitTest(Op0, X86CC, dl, DAG);
15286
15287      if (Op0.getValueType() == MVT::i1)
15288        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15289   }
15290
15291   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15292        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15293     // Do the comparison at i32 if it's smaller, besides the Atom case.
15294     // This avoids subregister aliasing issues. Keep the smaller reference
15295     // if we're optimizing for size, however, as that'll allow better folding
15296     // of memory operations.
15297     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15298         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15299              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15300         !Subtarget->isAtom()) {
15301       unsigned ExtendOp =
15302           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15303       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15304       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15305     }
15306     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15307     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15308     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15309                               Op0, Op1);
15310     return SDValue(Sub.getNode(), 1);
15311   }
15312   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15313 }
15314
15315 /// Convert a comparison if required by the subtarget.
15316 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15317                                                  SelectionDAG &DAG) const {
15318   // If the subtarget does not support the FUCOMI instruction, floating-point
15319   // comparisons have to be converted.
15320   if (Subtarget->hasCMov() ||
15321       Cmp.getOpcode() != X86ISD::CMP ||
15322       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15323       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15324     return Cmp;
15325
15326   // The instruction selector will select an FUCOM instruction instead of
15327   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15328   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15329   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15330   SDLoc dl(Cmp);
15331   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15332   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15333   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15334                             DAG.getConstant(8, MVT::i8));
15335   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15336   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15337 }
15338
15339 /// The minimum architected relative accuracy is 2^-12. We need one
15340 /// Newton-Raphson step to have a good float result (24 bits of precision).
15341 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15342                                             DAGCombinerInfo &DCI,
15343                                             unsigned &RefinementSteps,
15344                                             bool &UseOneConstNR) const {
15345   // FIXME: We should use instruction latency models to calculate the cost of
15346   // each potential sequence, but this is very hard to do reliably because
15347   // at least Intel's Core* chips have variable timing based on the number of
15348   // significant digits in the divisor and/or sqrt operand.
15349   if (!Subtarget->useSqrtEst())
15350     return SDValue();
15351
15352   EVT VT = Op.getValueType();
15353
15354   // SSE1 has rsqrtss and rsqrtps.
15355   // TODO: Add support for AVX512 (v16f32).
15356   // It is likely not profitable to do this for f64 because a double-precision
15357   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15358   // instructions: convert to single, rsqrtss, convert back to double, refine
15359   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15360   // along with FMA, this could be a throughput win.
15361   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15362       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15363     RefinementSteps = 1;
15364     UseOneConstNR = false;
15365     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15366   }
15367   return SDValue();
15368 }
15369
15370 /// The minimum architected relative accuracy is 2^-12. We need one
15371 /// Newton-Raphson step to have a good float result (24 bits of precision).
15372 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15373                                             DAGCombinerInfo &DCI,
15374                                             unsigned &RefinementSteps) const {
15375   // FIXME: We should use instruction latency models to calculate the cost of
15376   // each potential sequence, but this is very hard to do reliably because
15377   // at least Intel's Core* chips have variable timing based on the number of
15378   // significant digits in the divisor.
15379   if (!Subtarget->useReciprocalEst())
15380     return SDValue();
15381
15382   EVT VT = Op.getValueType();
15383
15384   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15385   // TODO: Add support for AVX512 (v16f32).
15386   // It is likely not profitable to do this for f64 because a double-precision
15387   // reciprocal estimate with refinement on x86 prior to FMA requires
15388   // 15 instructions: convert to single, rcpss, convert back to double, refine
15389   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15390   // along with FMA, this could be a throughput win.
15391   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15392       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15393     RefinementSteps = ReciprocalEstimateRefinementSteps;
15394     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15395   }
15396   return SDValue();
15397 }
15398
15399 static bool isAllOnes(SDValue V) {
15400   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15401   return C && C->isAllOnesValue();
15402 }
15403
15404 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15405 /// if it's possible.
15406 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15407                                      SDLoc dl, SelectionDAG &DAG) const {
15408   SDValue Op0 = And.getOperand(0);
15409   SDValue Op1 = And.getOperand(1);
15410   if (Op0.getOpcode() == ISD::TRUNCATE)
15411     Op0 = Op0.getOperand(0);
15412   if (Op1.getOpcode() == ISD::TRUNCATE)
15413     Op1 = Op1.getOperand(0);
15414
15415   SDValue LHS, RHS;
15416   if (Op1.getOpcode() == ISD::SHL)
15417     std::swap(Op0, Op1);
15418   if (Op0.getOpcode() == ISD::SHL) {
15419     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15420       if (And00C->getZExtValue() == 1) {
15421         // If we looked past a truncate, check that it's only truncating away
15422         // known zeros.
15423         unsigned BitWidth = Op0.getValueSizeInBits();
15424         unsigned AndBitWidth = And.getValueSizeInBits();
15425         if (BitWidth > AndBitWidth) {
15426           APInt Zeros, Ones;
15427           DAG.computeKnownBits(Op0, Zeros, Ones);
15428           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15429             return SDValue();
15430         }
15431         LHS = Op1;
15432         RHS = Op0.getOperand(1);
15433       }
15434   } else if (Op1.getOpcode() == ISD::Constant) {
15435     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15436     uint64_t AndRHSVal = AndRHS->getZExtValue();
15437     SDValue AndLHS = Op0;
15438
15439     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15440       LHS = AndLHS.getOperand(0);
15441       RHS = AndLHS.getOperand(1);
15442     }
15443
15444     // Use BT if the immediate can't be encoded in a TEST instruction.
15445     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15446       LHS = AndLHS;
15447       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15448     }
15449   }
15450
15451   if (LHS.getNode()) {
15452     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15453     // instruction.  Since the shift amount is in-range-or-undefined, we know
15454     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15455     // the encoding for the i16 version is larger than the i32 version.
15456     // Also promote i16 to i32 for performance / code size reason.
15457     if (LHS.getValueType() == MVT::i8 ||
15458         LHS.getValueType() == MVT::i16)
15459       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15460
15461     // If the operand types disagree, extend the shift amount to match.  Since
15462     // BT ignores high bits (like shifts) we can use anyextend.
15463     if (LHS.getValueType() != RHS.getValueType())
15464       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15465
15466     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15467     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15468     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15469                        DAG.getConstant(Cond, MVT::i8), BT);
15470   }
15471
15472   return SDValue();
15473 }
15474
15475 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15476 /// mask CMPs.
15477 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15478                               SDValue &Op1) {
15479   unsigned SSECC;
15480   bool Swap = false;
15481
15482   // SSE Condition code mapping:
15483   //  0 - EQ
15484   //  1 - LT
15485   //  2 - LE
15486   //  3 - UNORD
15487   //  4 - NEQ
15488   //  5 - NLT
15489   //  6 - NLE
15490   //  7 - ORD
15491   switch (SetCCOpcode) {
15492   default: llvm_unreachable("Unexpected SETCC condition");
15493   case ISD::SETOEQ:
15494   case ISD::SETEQ:  SSECC = 0; break;
15495   case ISD::SETOGT:
15496   case ISD::SETGT:  Swap = true; // Fallthrough
15497   case ISD::SETLT:
15498   case ISD::SETOLT: SSECC = 1; break;
15499   case ISD::SETOGE:
15500   case ISD::SETGE:  Swap = true; // Fallthrough
15501   case ISD::SETLE:
15502   case ISD::SETOLE: SSECC = 2; break;
15503   case ISD::SETUO:  SSECC = 3; break;
15504   case ISD::SETUNE:
15505   case ISD::SETNE:  SSECC = 4; break;
15506   case ISD::SETULE: Swap = true; // Fallthrough
15507   case ISD::SETUGE: SSECC = 5; break;
15508   case ISD::SETULT: Swap = true; // Fallthrough
15509   case ISD::SETUGT: SSECC = 6; break;
15510   case ISD::SETO:   SSECC = 7; break;
15511   case ISD::SETUEQ:
15512   case ISD::SETONE: SSECC = 8; break;
15513   }
15514   if (Swap)
15515     std::swap(Op0, Op1);
15516
15517   return SSECC;
15518 }
15519
15520 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15521 // ones, and then concatenate the result back.
15522 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15523   MVT VT = Op.getSimpleValueType();
15524
15525   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15526          "Unsupported value type for operation");
15527
15528   unsigned NumElems = VT.getVectorNumElements();
15529   SDLoc dl(Op);
15530   SDValue CC = Op.getOperand(2);
15531
15532   // Extract the LHS vectors
15533   SDValue LHS = Op.getOperand(0);
15534   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15535   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15536
15537   // Extract the RHS vectors
15538   SDValue RHS = Op.getOperand(1);
15539   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15540   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15541
15542   // Issue the operation on the smaller types and concatenate the result back
15543   MVT EltVT = VT.getVectorElementType();
15544   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15545   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15546                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15547                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15548 }
15549
15550 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15551                                      const X86Subtarget *Subtarget) {
15552   SDValue Op0 = Op.getOperand(0);
15553   SDValue Op1 = Op.getOperand(1);
15554   SDValue CC = Op.getOperand(2);
15555   MVT VT = Op.getSimpleValueType();
15556   SDLoc dl(Op);
15557
15558   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15559          Op.getValueType().getScalarType() == MVT::i1 &&
15560          "Cannot set masked compare for this operation");
15561
15562   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15563   unsigned  Opc = 0;
15564   bool Unsigned = false;
15565   bool Swap = false;
15566   unsigned SSECC;
15567   switch (SetCCOpcode) {
15568   default: llvm_unreachable("Unexpected SETCC condition");
15569   case ISD::SETNE:  SSECC = 4; break;
15570   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15571   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15572   case ISD::SETLT:  Swap = true; //fall-through
15573   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15574   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15575   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15576   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15577   case ISD::SETULE: Unsigned = true; //fall-through
15578   case ISD::SETLE:  SSECC = 2; break;
15579   }
15580
15581   if (Swap)
15582     std::swap(Op0, Op1);
15583   if (Opc)
15584     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15585   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15586   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15587                      DAG.getConstant(SSECC, MVT::i8));
15588 }
15589
15590 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15591 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15592 /// return an empty value.
15593 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15594 {
15595   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15596   if (!BV)
15597     return SDValue();
15598
15599   MVT VT = Op1.getSimpleValueType();
15600   MVT EVT = VT.getVectorElementType();
15601   unsigned n = VT.getVectorNumElements();
15602   SmallVector<SDValue, 8> ULTOp1;
15603
15604   for (unsigned i = 0; i < n; ++i) {
15605     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15606     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15607       return SDValue();
15608
15609     // Avoid underflow.
15610     APInt Val = Elt->getAPIntValue();
15611     if (Val == 0)
15612       return SDValue();
15613
15614     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15615   }
15616
15617   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15618 }
15619
/// Lower a vector ISD::SETCC (operands: LHS, RHS, condition code).
///
/// The lowering proceeds through these cases, in order:
///   1. Floating-point compares become X86ISD::CMPP (or X86ISD::CMPM when
///      AVX-512 is available and the result elements are i1) with the
///      predicate from translateX86FSETCC. SETUEQ and SETONE need two
///      compares combined with OR / AND.
///   2. 256-bit integer compares without Int256 support are split in half.
///   3. With AVX-512, mask-producing compares go to LowerIntVSETCC_AVX512, or
///      are performed in the operand type and truncated for i8/i16 elements.
///   4. Remaining integer compares are built from PCMPEQ / PCMPGT, using the
///      Swap / Invert / FlipSigns flags, or from UMIN/UMAX or SUBUS followed
///      by a PCMPEQ where that is available. v2i64 compares are emulated with
///      v4i32 operations when the 64-bit instructions are missing.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    // Note: translateX86FSETCC may swap Op0 and Op1.
    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    // A vector-of-i1 result under AVX-512 uses the mask compare node instead.
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        // UEQ = UNORD (3) | EQ (0)
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        // ONE = ORD (7) & NEQ (4)
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // True when the compare produces a vector of i1 (a mask).
  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    // We are not talking about 512-bit operands in this case, these
    // types are illegal.
    // Instead, compare in the operand type and truncate the result to the
    // mask type.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  // Swap:      exchange Op0 and Op1 before emitting the compare.
  // Invert:    take the logical-not of the compare result.
  // FlipSigns: xor both operands with the sign bit so that the signed
  //            PCMPGT implements an unsigned comparison.
  // MinMax:    implement the compare as UMIN/UMAX followed by PCMPEQ.
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  // Subus:     implement the compare as SUBUS followed by PCMPEQ with zero.
  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true; // Fallthrough
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true; // Fallthrough
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    // The min/max form needs none of the PCMPGT fix-ups chosen above.
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule.  With psubus, setule does not require a swap.  This is
      // beneficial because the constant in the register is no longer
      // destructed as the destination so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        // Unsigned 64-bit compare: flip the sign bit of every 32-bit lane.
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        // Signed 64-bit compare: flip only the even (low-half) lanes.
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      // The shuffle exchanges the two 32-bit lanes of each 64-bit element.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  // Opc is one of PCMPEQ, PCMPGT, UMIN, UMAX or SUBUS at this point.
  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  // Op0 u<= Op1 iff umin(Op0, Op1) == Op0; Op0 u>= Op1 iff umax(Op0, Op1) == Op0.
  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  // The saturating subtract is zero exactly when Op0 u<= Op1.
  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
15857
15858 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15859
15860   MVT VT = Op.getSimpleValueType();
15861
15862   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15863
15864   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15865          && "SetCC type must be 8-bit or 1-bit integer");
15866   SDValue Op0 = Op.getOperand(0);
15867   SDValue Op1 = Op.getOperand(1);
15868   SDLoc dl(Op);
15869   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15870
15871   // Optimize to BT if possible.
15872   // Lower (X & (1 << N)) == 0 to BT(X, N).
15873   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15874   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15875   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15876       Op1.getOpcode() == ISD::Constant &&
15877       cast<ConstantSDNode>(Op1)->isNullValue() &&
15878       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15879     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15880     if (NewSetCC.getNode()) {
15881       if (VT == MVT::i1)
15882         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15883       return NewSetCC;
15884     }
15885   }
15886
15887   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15888   // these.
15889   if (Op1.getOpcode() == ISD::Constant &&
15890       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15891        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15892       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15893
15894     // If the input is a setcc, then reuse the input setcc or use a new one with
15895     // the inverted condition.
15896     if (Op0.getOpcode() == X86ISD::SETCC) {
15897       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15898       bool Invert = (CC == ISD::SETNE) ^
15899         cast<ConstantSDNode>(Op1)->isNullValue();
15900       if (!Invert)
15901         return Op0;
15902
15903       CCode = X86::GetOppositeBranchCondition(CCode);
15904       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15905                                   DAG.getConstant(CCode, MVT::i8),
15906                                   Op0.getOperand(1));
15907       if (VT == MVT::i1)
15908         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15909       return SetCC;
15910     }
15911   }
15912   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15913       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15914       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15915
15916     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15917     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15918   }
15919
15920   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15921   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15922   if (X86CC == X86::COND_INVALID)
15923     return SDValue();
15924
15925   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15926   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15927   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15928                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15929   if (VT == MVT::i1)
15930     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15931   return SetCC;
15932 }
15933
15934 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15935 static bool isX86LogicalCmp(SDValue Op) {
15936   unsigned Opc = Op.getNode()->getOpcode();
15937   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15938       Opc == X86ISD::SAHF)
15939     return true;
15940   if (Op.getResNo() == 1 &&
15941       (Opc == X86ISD::ADD ||
15942        Opc == X86ISD::SUB ||
15943        Opc == X86ISD::ADC ||
15944        Opc == X86ISD::SBB ||
15945        Opc == X86ISD::SMUL ||
15946        Opc == X86ISD::UMUL ||
15947        Opc == X86ISD::INC ||
15948        Opc == X86ISD::DEC ||
15949        Opc == X86ISD::OR ||
15950        Opc == X86ISD::XOR ||
15951        Opc == X86ISD::AND))
15952     return true;
15953
15954   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15955     return true;
15956
15957   return false;
15958 }
15959
15960 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15961   if (V.getOpcode() != ISD::TRUNCATE)
15962     return false;
15963
15964   SDValue VOp0 = V.getOperand(0);
15965   unsigned InBits = VOp0.getValueSizeInBits();
15966   unsigned Bits = V.getValueSizeInBits();
15967   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15968 }
15969
15970 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15971   bool addTest = true;
15972   SDValue Cond  = Op.getOperand(0);
15973   SDValue Op1 = Op.getOperand(1);
15974   SDValue Op2 = Op.getOperand(2);
15975   SDLoc DL(Op);
15976   EVT VT = Op1.getValueType();
15977   SDValue CC;
15978
15979   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15980   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15981   // sequence later on.
15982   if (Cond.getOpcode() == ISD::SETCC &&
15983       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15984        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15985       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15986     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15987     int SSECC = translateX86FSETCC(
15988         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15989
15990     if (SSECC != 8) {
15991       if (Subtarget->hasAVX512()) {
15992         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15993                                   DAG.getConstant(SSECC, MVT::i8));
15994         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15995       }
15996       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15997                                 DAG.getConstant(SSECC, MVT::i8));
15998       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15999       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16000       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16001     }
16002   }
16003
16004   if (Cond.getOpcode() == ISD::SETCC) {
16005     SDValue NewCond = LowerSETCC(Cond, DAG);
16006     if (NewCond.getNode())
16007       Cond = NewCond;
16008   }
16009
16010   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16011   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16012   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16013   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16014   if (Cond.getOpcode() == X86ISD::SETCC &&
16015       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16016       isZero(Cond.getOperand(1).getOperand(1))) {
16017     SDValue Cmp = Cond.getOperand(1);
16018
16019     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16020
16021     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16022         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16023       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16024
16025       SDValue CmpOp0 = Cmp.getOperand(0);
16026       // Apply further optimizations for special cases
16027       // (select (x != 0), -1, 0) -> neg & sbb
16028       // (select (x == 0), 0, -1) -> neg & sbb
16029       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16030         if (YC->isNullValue() &&
16031             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16032           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16033           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16034                                     DAG.getConstant(0, CmpOp0.getValueType()),
16035                                     CmpOp0);
16036           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16037                                     DAG.getConstant(X86::COND_B, MVT::i8),
16038                                     SDValue(Neg.getNode(), 1));
16039           return Res;
16040         }
16041
16042       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16043                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16044       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16045
16046       SDValue Res =   // Res = 0 or -1.
16047         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16048                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16049
16050       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16051         Res = DAG.getNOT(DL, Res, Res.getValueType());
16052
16053       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16054       if (!N2C || !N2C->isNullValue())
16055         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16056       return Res;
16057     }
16058   }
16059
16060   // Look past (and (setcc_carry (cmp ...)), 1).
16061   if (Cond.getOpcode() == ISD::AND &&
16062       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16063     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16064     if (C && C->getAPIntValue() == 1)
16065       Cond = Cond.getOperand(0);
16066   }
16067
16068   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16069   // setting operand in place of the X86ISD::SETCC.
16070   unsigned CondOpcode = Cond.getOpcode();
16071   if (CondOpcode == X86ISD::SETCC ||
16072       CondOpcode == X86ISD::SETCC_CARRY) {
16073     CC = Cond.getOperand(0);
16074
16075     SDValue Cmp = Cond.getOperand(1);
16076     unsigned Opc = Cmp.getOpcode();
16077     MVT VT = Op.getSimpleValueType();
16078
16079     bool IllegalFPCMov = false;
16080     if (VT.isFloatingPoint() && !VT.isVector() &&
16081         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16082       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16083
16084     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16085         Opc == X86ISD::BT) { // FIXME
16086       Cond = Cmp;
16087       addTest = false;
16088     }
16089   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16090              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16091              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16092               Cond.getOperand(0).getValueType() != MVT::i8)) {
16093     SDValue LHS = Cond.getOperand(0);
16094     SDValue RHS = Cond.getOperand(1);
16095     unsigned X86Opcode;
16096     unsigned X86Cond;
16097     SDVTList VTs;
16098     switch (CondOpcode) {
16099     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16100     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16101     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16102     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16103     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16104     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16105     default: llvm_unreachable("unexpected overflowing operator");
16106     }
16107     if (CondOpcode == ISD::UMULO)
16108       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16109                           MVT::i32);
16110     else
16111       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16112
16113     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16114
16115     if (CondOpcode == ISD::UMULO)
16116       Cond = X86Op.getValue(2);
16117     else
16118       Cond = X86Op.getValue(1);
16119
16120     CC = DAG.getConstant(X86Cond, MVT::i8);
16121     addTest = false;
16122   }
16123
16124   if (addTest) {
16125     // Look pass the truncate if the high bits are known zero.
16126     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16127         Cond = Cond.getOperand(0);
16128
16129     // We know the result of AND is compared against zero. Try to match
16130     // it to BT.
16131     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16132       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16133       if (NewSetCC.getNode()) {
16134         CC = NewSetCC.getOperand(0);
16135         Cond = NewSetCC.getOperand(1);
16136         addTest = false;
16137       }
16138     }
16139   }
16140
16141   if (addTest) {
16142     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16143     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16144   }
16145
16146   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16147   // a <  b ?  0 : -1 -> RES = setcc_carry
16148   // a >= b ? -1 :  0 -> RES = setcc_carry
16149   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16150   if (Cond.getOpcode() == X86ISD::SUB) {
16151     Cond = ConvertCmpIfNecessary(Cond, DAG);
16152     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16153
16154     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16155         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16156       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16157                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16158       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16159         return DAG.getNOT(DL, Res, Res.getValueType());
16160       return Res;
16161     }
16162   }
16163
16164   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16165   // widen the cmov and push the truncate through. This avoids introducing a new
16166   // branch during isel and doesn't add any extensions.
16167   if (Op.getValueType() == MVT::i8 &&
16168       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16169     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16170     if (T1.getValueType() == T2.getValueType() &&
16171         // Blacklist CopyFromReg to avoid partial register stalls.
16172         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16173       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16174       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16175       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16176     }
16177   }
16178
16179   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16180   // condition is true.
16181   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16182   SDValue Ops[] = { Op2, Op1, CC, Cond };
16183   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16184 }
16185
16186 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16187                                        SelectionDAG &DAG) {
16188   MVT VT = Op->getSimpleValueType(0);
16189   SDValue In = Op->getOperand(0);
16190   MVT InVT = In.getSimpleValueType();
16191   MVT VTElt = VT.getVectorElementType();
16192   MVT InVTElt = InVT.getVectorElementType();
16193   SDLoc dl(Op);
16194
16195   // SKX processor
16196   if ((InVTElt == MVT::i1) &&
16197       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16198         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16199
16200        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16201         VTElt.getSizeInBits() <= 16)) ||
16202
16203        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16204         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16205
16206        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16207         VTElt.getSizeInBits() >= 32))))
16208     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16209
16210   unsigned int NumElts = VT.getVectorNumElements();
16211
16212   if (NumElts != 8 && NumElts != 16)
16213     return SDValue();
16214
16215   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16216     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16217       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16218     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16219   }
16220
16221   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16222   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16223
16224   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16225   Constant *C = ConstantInt::get(*DAG.getContext(),
16226     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16227
16228   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16229   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16230   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16231                           MachinePointerInfo::getConstantPool(),
16232                           false, false, false, Alignment);
16233   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16234   if (VT.is512BitVector())
16235     return Brcst;
16236   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16237 }
16238
16239 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16240                                 SelectionDAG &DAG) {
16241   MVT VT = Op->getSimpleValueType(0);
16242   SDValue In = Op->getOperand(0);
16243   MVT InVT = In.getSimpleValueType();
16244   SDLoc dl(Op);
16245
16246   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16247     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16248
16249   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16250       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16251       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16252     return SDValue();
16253
16254   if (Subtarget->hasInt256())
16255     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16256
16257   // Optimize vectors in AVX mode
16258   // Sign extend  v8i16 to v8i32 and
16259   //              v4i32 to v4i64
16260   //
16261   // Divide input vector into two parts
16262   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16263   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16264   // concat the vectors to original VT
16265
16266   unsigned NumElems = InVT.getVectorNumElements();
16267   SDValue Undef = DAG.getUNDEF(InVT);
16268
16269   SmallVector<int,8> ShufMask1(NumElems, -1);
16270   for (unsigned i = 0; i != NumElems/2; ++i)
16271     ShufMask1[i] = i;
16272
16273   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16274
16275   SmallVector<int,8> ShufMask2(NumElems, -1);
16276   for (unsigned i = 0; i != NumElems/2; ++i)
16277     ShufMask2[i] = i + NumElems/2;
16278
16279   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16280
16281   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16282                                 VT.getVectorNumElements()/2);
16283
16284   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16285   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16286
16287   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16288 }
16289
16290 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16291 // may emit an illegal shuffle but the expansion is still better than scalar
16292 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16293 // we'll emit a shuffle and a arithmetic shift.
16294 // TODO: It is possible to support ZExt by zeroing the undef values during
16295 // the shuffle phase or after the shuffle.
16296 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16297                                  SelectionDAG &DAG) {
16298   MVT RegVT = Op.getSimpleValueType();
16299   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16300   assert(RegVT.isInteger() &&
16301          "We only custom lower integer vector sext loads.");
16302
16303   // Nothing useful we can do without SSE2 shuffles.
16304   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16305
16306   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16307   SDLoc dl(Ld);
16308   EVT MemVT = Ld->getMemoryVT();
16309   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16310   unsigned RegSz = RegVT.getSizeInBits();
16311
16312   ISD::LoadExtType Ext = Ld->getExtensionType();
16313
16314   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16315          && "Only anyext and sext are currently implemented.");
16316   assert(MemVT != RegVT && "Cannot extend to the same type");
16317   assert(MemVT.isVector() && "Must load a vector from memory");
16318
16319   unsigned NumElems = RegVT.getVectorNumElements();
16320   unsigned MemSz = MemVT.getSizeInBits();
16321   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16322
16323   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16324     // The only way in which we have a legal 256-bit vector result but not the
16325     // integer 256-bit operations needed to directly lower a sextload is if we
16326     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16327     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16328     // correctly legalized. We do this late to allow the canonical form of
16329     // sextload to persist throughout the rest of the DAG combiner -- it wants
16330     // to fold together any extensions it can, and so will fuse a sign_extend
16331     // of an sextload into a sextload targeting a wider value.
16332     SDValue Load;
16333     if (MemSz == 128) {
16334       // Just switch this to a normal load.
16335       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16336                                        "it must be a legal 128-bit vector "
16337                                        "type!");
16338       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16339                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16340                   Ld->isInvariant(), Ld->getAlignment());
16341     } else {
16342       assert(MemSz < 128 &&
16343              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16344       // Do an sext load to a 128-bit vector type. We want to use the same
16345       // number of elements, but elements half as wide. This will end up being
16346       // recursively lowered by this routine, but will succeed as we definitely
16347       // have all the necessary features if we're using AVX1.
16348       EVT HalfEltVT =
16349           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16350       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16351       Load =
16352           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16353                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16354                          Ld->isNonTemporal(), Ld->isInvariant(),
16355                          Ld->getAlignment());
16356     }
16357
16358     // Replace chain users with the new chain.
16359     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16360     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16361
16362     // Finally, do a normal sign-extend to the desired register.
16363     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16364   }
16365
16366   // All sizes must be a power of two.
16367   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16368          "Non-power-of-two elements are not custom lowered!");
16369
16370   // Attempt to load the original value using scalar loads.
16371   // Find the largest scalar type that divides the total loaded size.
16372   MVT SclrLoadTy = MVT::i8;
16373   for (MVT Tp : MVT::integer_valuetypes()) {
16374     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16375       SclrLoadTy = Tp;
16376     }
16377   }
16378
16379   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16380   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16381       (64 <= MemSz))
16382     SclrLoadTy = MVT::f64;
16383
16384   // Calculate the number of scalar loads that we need to perform
16385   // in order to load our vector from memory.
16386   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16387
16388   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16389          "Can only lower sext loads with a single scalar load!");
16390
16391   unsigned loadRegZize = RegSz;
16392   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16393     loadRegZize /= 2;
16394
16395   // Represent our vector as a sequence of elements which are the
16396   // largest scalar that we can load.
16397   EVT LoadUnitVecVT = EVT::getVectorVT(
16398       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16399
16400   // Represent the data using the same element type that is stored in
16401   // memory. In practice, we ''widen'' MemVT.
16402   EVT WideVecVT =
16403       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16404                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16405
16406   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16407          "Invalid vector type");
16408
16409   // We can't shuffle using an illegal type.
16410   assert(TLI.isTypeLegal(WideVecVT) &&
16411          "We only lower types that form legal widened vector types");
16412
16413   SmallVector<SDValue, 8> Chains;
16414   SDValue Ptr = Ld->getBasePtr();
16415   SDValue Increment =
16416       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16417   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16418
16419   for (unsigned i = 0; i < NumLoads; ++i) {
16420     // Perform a single load.
16421     SDValue ScalarLoad =
16422         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16423                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16424                     Ld->getAlignment());
16425     Chains.push_back(ScalarLoad.getValue(1));
16426     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16427     // another round of DAGCombining.
16428     if (i == 0)
16429       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16430     else
16431       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16432                         ScalarLoad, DAG.getIntPtrConstant(i));
16433
16434     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16435   }
16436
16437   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16438
16439   // Bitcast the loaded value to a vector of the original element type, in
16440   // the size of the target vector type.
16441   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16442   unsigned SizeRatio = RegSz / MemSz;
16443
16444   if (Ext == ISD::SEXTLOAD) {
16445     // If we have SSE4.1, we can directly emit a VSEXT node.
16446     if (Subtarget->hasSSE41()) {
16447       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16448       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16449       return Sext;
16450     }
16451
16452     // Otherwise we'll shuffle the small elements in the high bits of the
16453     // larger type and perform an arithmetic shift. If the shift is not legal
16454     // it's better to scalarize.
16455     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16456            "We can't implement a sext load without an arithmetic right shift!");
16457
16458     // Redistribute the loaded elements into the different locations.
16459     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16460     for (unsigned i = 0; i != NumElems; ++i)
16461       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16462
16463     SDValue Shuff = DAG.getVectorShuffle(
16464         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16465
16466     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16467
16468     // Build the arithmetic shift.
16469     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16470                    MemVT.getVectorElementType().getSizeInBits();
16471     Shuff =
16472         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16473
16474     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16475     return Shuff;
16476   }
16477
16478   // Redistribute the loaded elements into the different locations.
16479   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16480   for (unsigned i = 0; i != NumElems; ++i)
16481     ShuffleVec[i * SizeRatio] = i;
16482
16483   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16484                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16485
16486   // Bitcast to the requested type.
16487   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16488   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16489   return Shuff;
16490 }
16491
16492 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
16493 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16494 // from the AND / OR.
16495 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16496   Opc = Op.getOpcode();
16497   if (Opc != ISD::OR && Opc != ISD::AND)
16498     return false;
16499   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16500           Op.getOperand(0).hasOneUse() &&
16501           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16502           Op.getOperand(1).hasOneUse());
16503 }
16504
16505 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16506 // 1 and that the SETCC node has a single use.
16507 static bool isXor1OfSetCC(SDValue Op) {
16508   if (Op.getOpcode() != ISD::XOR)
16509     return false;
16510   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16511   if (N1C && N1C->getAPIntValue() == 1) {
16512     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16513       Op.getOperand(0).hasOneUse();
16514   }
16515   return false;
16516 }
16517
16518 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16519   bool addTest = true;
16520   SDValue Chain = Op.getOperand(0);
16521   SDValue Cond  = Op.getOperand(1);
16522   SDValue Dest  = Op.getOperand(2);
16523   SDLoc dl(Op);
16524   SDValue CC;
16525   bool Inverted = false;
16526
16527   if (Cond.getOpcode() == ISD::SETCC) {
16528     // Check for setcc([su]{add,sub,mul}o == 0).
16529     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16530         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16531         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16532         Cond.getOperand(0).getResNo() == 1 &&
16533         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16534          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16535          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16536          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16537          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16538          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16539       Inverted = true;
16540       Cond = Cond.getOperand(0);
16541     } else {
16542       SDValue NewCond = LowerSETCC(Cond, DAG);
16543       if (NewCond.getNode())
16544         Cond = NewCond;
16545     }
16546   }
16547 #if 0
16548   // FIXME: LowerXALUO doesn't handle these!!
16549   else if (Cond.getOpcode() == X86ISD::ADD  ||
16550            Cond.getOpcode() == X86ISD::SUB  ||
16551            Cond.getOpcode() == X86ISD::SMUL ||
16552            Cond.getOpcode() == X86ISD::UMUL)
16553     Cond = LowerXALUO(Cond, DAG);
16554 #endif
16555
16556   // Look pass (and (setcc_carry (cmp ...)), 1).
16557   if (Cond.getOpcode() == ISD::AND &&
16558       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16559     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16560     if (C && C->getAPIntValue() == 1)
16561       Cond = Cond.getOperand(0);
16562   }
16563
16564   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16565   // setting operand in place of the X86ISD::SETCC.
16566   unsigned CondOpcode = Cond.getOpcode();
16567   if (CondOpcode == X86ISD::SETCC ||
16568       CondOpcode == X86ISD::SETCC_CARRY) {
16569     CC = Cond.getOperand(0);
16570
16571     SDValue Cmp = Cond.getOperand(1);
16572     unsigned Opc = Cmp.getOpcode();
16573     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16574     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16575       Cond = Cmp;
16576       addTest = false;
16577     } else {
16578       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16579       default: break;
16580       case X86::COND_O:
16581       case X86::COND_B:
16582         // These can only come from an arithmetic instruction with overflow,
16583         // e.g. SADDO, UADDO.
16584         Cond = Cond.getNode()->getOperand(1);
16585         addTest = false;
16586         break;
16587       }
16588     }
16589   }
16590   CondOpcode = Cond.getOpcode();
16591   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16592       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16593       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16594        Cond.getOperand(0).getValueType() != MVT::i8)) {
16595     SDValue LHS = Cond.getOperand(0);
16596     SDValue RHS = Cond.getOperand(1);
16597     unsigned X86Opcode;
16598     unsigned X86Cond;
16599     SDVTList VTs;
16600     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16601     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16602     // X86ISD::INC).
16603     switch (CondOpcode) {
16604     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16605     case ISD::SADDO:
16606       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16607         if (C->isOne()) {
16608           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16609           break;
16610         }
16611       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16612     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16613     case ISD::SSUBO:
16614       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16615         if (C->isOne()) {
16616           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16617           break;
16618         }
16619       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16620     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16621     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16622     default: llvm_unreachable("unexpected overflowing operator");
16623     }
16624     if (Inverted)
16625       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16626     if (CondOpcode == ISD::UMULO)
16627       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16628                           MVT::i32);
16629     else
16630       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16631
16632     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16633
16634     if (CondOpcode == ISD::UMULO)
16635       Cond = X86Op.getValue(2);
16636     else
16637       Cond = X86Op.getValue(1);
16638
16639     CC = DAG.getConstant(X86Cond, MVT::i8);
16640     addTest = false;
16641   } else {
16642     unsigned CondOpc;
16643     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16644       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16645       if (CondOpc == ISD::OR) {
16646         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16647         // two branches instead of an explicit OR instruction with a
16648         // separate test.
16649         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16650             isX86LogicalCmp(Cmp)) {
16651           CC = Cond.getOperand(0).getOperand(0);
16652           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16653                               Chain, Dest, CC, Cmp);
16654           CC = Cond.getOperand(1).getOperand(0);
16655           Cond = Cmp;
16656           addTest = false;
16657         }
16658       } else { // ISD::AND
16659         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16660         // two branches instead of an explicit AND instruction with a
16661         // separate test. However, we only do this if this block doesn't
16662         // have a fall-through edge, because this requires an explicit
16663         // jmp when the condition is false.
16664         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16665             isX86LogicalCmp(Cmp) &&
16666             Op.getNode()->hasOneUse()) {
16667           X86::CondCode CCode =
16668             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16669           CCode = X86::GetOppositeBranchCondition(CCode);
16670           CC = DAG.getConstant(CCode, MVT::i8);
16671           SDNode *User = *Op.getNode()->use_begin();
16672           // Look for an unconditional branch following this conditional branch.
16673           // We need this because we need to reverse the successors in order
16674           // to implement FCMP_OEQ.
16675           if (User->getOpcode() == ISD::BR) {
16676             SDValue FalseBB = User->getOperand(1);
16677             SDNode *NewBR =
16678               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16679             assert(NewBR == User);
16680             (void)NewBR;
16681             Dest = FalseBB;
16682
16683             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16684                                 Chain, Dest, CC, Cmp);
16685             X86::CondCode CCode =
16686               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16687             CCode = X86::GetOppositeBranchCondition(CCode);
16688             CC = DAG.getConstant(CCode, MVT::i8);
16689             Cond = Cmp;
16690             addTest = false;
16691           }
16692         }
16693       }
16694     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16695       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16696       // It should be transformed during dag combiner except when the condition
16697       // is set by a arithmetics with overflow node.
16698       X86::CondCode CCode =
16699         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16700       CCode = X86::GetOppositeBranchCondition(CCode);
16701       CC = DAG.getConstant(CCode, MVT::i8);
16702       Cond = Cond.getOperand(0).getOperand(1);
16703       addTest = false;
16704     } else if (Cond.getOpcode() == ISD::SETCC &&
16705                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16706       // For FCMP_OEQ, we can emit
16707       // two branches instead of an explicit AND instruction with a
16708       // separate test. However, we only do this if this block doesn't
16709       // have a fall-through edge, because this requires an explicit
16710       // jmp when the condition is false.
16711       if (Op.getNode()->hasOneUse()) {
16712         SDNode *User = *Op.getNode()->use_begin();
16713         // Look for an unconditional branch following this conditional branch.
16714         // We need this because we need to reverse the successors in order
16715         // to implement FCMP_OEQ.
16716         if (User->getOpcode() == ISD::BR) {
16717           SDValue FalseBB = User->getOperand(1);
16718           SDNode *NewBR =
16719             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16720           assert(NewBR == User);
16721           (void)NewBR;
16722           Dest = FalseBB;
16723
16724           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16725                                     Cond.getOperand(0), Cond.getOperand(1));
16726           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16727           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16728           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16729                               Chain, Dest, CC, Cmp);
16730           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16731           Cond = Cmp;
16732           addTest = false;
16733         }
16734       }
16735     } else if (Cond.getOpcode() == ISD::SETCC &&
16736                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16737       // For FCMP_UNE, we can emit
16738       // two branches instead of an explicit AND instruction with a
16739       // separate test. However, we only do this if this block doesn't
16740       // have a fall-through edge, because this requires an explicit
16741       // jmp when the condition is false.
16742       if (Op.getNode()->hasOneUse()) {
16743         SDNode *User = *Op.getNode()->use_begin();
16744         // Look for an unconditional branch following this conditional branch.
16745         // We need this because we need to reverse the successors in order
16746         // to implement FCMP_UNE.
16747         if (User->getOpcode() == ISD::BR) {
16748           SDValue FalseBB = User->getOperand(1);
16749           SDNode *NewBR =
16750             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16751           assert(NewBR == User);
16752           (void)NewBR;
16753
16754           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16755                                     Cond.getOperand(0), Cond.getOperand(1));
16756           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16757           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16758           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16759                               Chain, Dest, CC, Cmp);
16760           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16761           Cond = Cmp;
16762           addTest = false;
16763           Dest = FalseBB;
16764         }
16765       }
16766     }
16767   }
16768
16769   if (addTest) {
16770     // Look pass the truncate if the high bits are known zero.
16771     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16772         Cond = Cond.getOperand(0);
16773
16774     // We know the result of AND is compared against zero. Try to match
16775     // it to BT.
16776     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16777       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16778       if (NewSetCC.getNode()) {
16779         CC = NewSetCC.getOperand(0);
16780         Cond = NewSetCC.getOperand(1);
16781         addTest = false;
16782       }
16783     }
16784   }
16785
16786   if (addTest) {
16787     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16788     CC = DAG.getConstant(X86Cond, MVT::i8);
16789     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16790   }
16791   Cond = ConvertCmpIfNecessary(Cond, DAG);
16792   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16793                      Chain, Dest, CC, Cond);
16794 }
16795
16796 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16797 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16798 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16799 // that the guard pages used by the OS virtual memory manager are allocated in
16800 // correct sequence.
16801 SDValue
16802 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16803                                            SelectionDAG &DAG) const {
16804   MachineFunction &MF = DAG.getMachineFunction();
16805   bool SplitStack = MF.shouldSplitStack();
16806   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16807                SplitStack;
16808   SDLoc dl(Op);
16809
16810   if (!Lower) {
16811     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16812     SDNode* Node = Op.getNode();
16813
16814     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16815     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16816         " not tell us which reg is the stack pointer!");
16817     EVT VT = Node->getValueType(0);
16818     SDValue Tmp1 = SDValue(Node, 0);
16819     SDValue Tmp2 = SDValue(Node, 1);
16820     SDValue Tmp3 = Node->getOperand(2);
16821     SDValue Chain = Tmp1.getOperand(0);
16822
16823     // Chain the dynamic stack allocation so that it doesn't modify the stack
16824     // pointer when other instructions are using the stack.
16825     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16826         SDLoc(Node));
16827
16828     SDValue Size = Tmp2.getOperand(1);
16829     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16830     Chain = SP.getValue(1);
16831     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16832     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16833     unsigned StackAlign = TFI.getStackAlignment();
16834     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16835     if (Align > StackAlign)
16836       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16837           DAG.getConstant(-(uint64_t)Align, VT));
16838     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16839
16840     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16841         DAG.getIntPtrConstant(0, true), SDValue(),
16842         SDLoc(Node));
16843
16844     SDValue Ops[2] = { Tmp1, Tmp2 };
16845     return DAG.getMergeValues(Ops, dl);
16846   }
16847
16848   // Get the inputs.
16849   SDValue Chain = Op.getOperand(0);
16850   SDValue Size  = Op.getOperand(1);
16851   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16852   EVT VT = Op.getNode()->getValueType(0);
16853
16854   bool Is64Bit = Subtarget->is64Bit();
16855   EVT SPTy = getPointerTy();
16856
16857   if (SplitStack) {
16858     MachineRegisterInfo &MRI = MF.getRegInfo();
16859
16860     if (Is64Bit) {
16861       // The 64 bit implementation of segmented stacks needs to clobber both r10
16862       // r11. This makes it impossible to use it along with nested parameters.
16863       const Function *F = MF.getFunction();
16864
16865       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16866            I != E; ++I)
16867         if (I->hasNestAttr())
16868           report_fatal_error("Cannot use segmented stacks with functions that "
16869                              "have nested arguments.");
16870     }
16871
16872     const TargetRegisterClass *AddrRegClass =
16873       getRegClassFor(getPointerTy());
16874     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16875     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16876     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16877                                 DAG.getRegister(Vreg, SPTy));
16878     SDValue Ops1[2] = { Value, Chain };
16879     return DAG.getMergeValues(Ops1, dl);
16880   } else {
16881     SDValue Flag;
16882     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16883
16884     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16885     Flag = Chain.getValue(1);
16886     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16887
16888     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16889
16890     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16891     unsigned SPReg = RegInfo->getStackRegister();
16892     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16893     Chain = SP.getValue(1);
16894
16895     if (Align) {
16896       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16897                        DAG.getConstant(-(uint64_t)Align, VT));
16898       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16899     }
16900
16901     SDValue Ops1[2] = { SP, Chain };
16902     return DAG.getMergeValues(Ops1, dl);
16903   }
16904 }
16905
16906 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16907   MachineFunction &MF = DAG.getMachineFunction();
16908   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16909
16910   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16911   SDLoc DL(Op);
16912
16913   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16914     // vastart just stores the address of the VarArgsFrameIndex slot into the
16915     // memory location argument.
16916     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16917                                    getPointerTy());
16918     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16919                         MachinePointerInfo(SV), false, false, 0);
16920   }
16921
16922   // __va_list_tag:
16923   //   gp_offset         (0 - 6 * 8)
16924   //   fp_offset         (48 - 48 + 8 * 16)
16925   //   overflow_arg_area (point to parameters coming in memory).
16926   //   reg_save_area
16927   SmallVector<SDValue, 8> MemOps;
16928   SDValue FIN = Op.getOperand(1);
16929   // Store gp_offset
16930   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16931                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16932                                                MVT::i32),
16933                                FIN, MachinePointerInfo(SV), false, false, 0);
16934   MemOps.push_back(Store);
16935
16936   // Store fp_offset
16937   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16938                     FIN, DAG.getIntPtrConstant(4));
16939   Store = DAG.getStore(Op.getOperand(0), DL,
16940                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16941                                        MVT::i32),
16942                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16943   MemOps.push_back(Store);
16944
16945   // Store ptr to overflow_arg_area
16946   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16947                     FIN, DAG.getIntPtrConstant(4));
16948   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16949                                     getPointerTy());
16950   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16951                        MachinePointerInfo(SV, 8),
16952                        false, false, 0);
16953   MemOps.push_back(Store);
16954
16955   // Store ptr to reg_save_area.
16956   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16957                     FIN, DAG.getIntPtrConstant(8));
16958   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16959                                     getPointerTy());
16960   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16961                        MachinePointerInfo(SV, 16), false, false, 0);
16962   MemOps.push_back(Store);
16963   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16964 }
16965
16966 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16967   assert(Subtarget->is64Bit() &&
16968          "LowerVAARG only handles 64-bit va_arg!");
16969   assert((Subtarget->isTargetLinux() ||
16970           Subtarget->isTargetDarwin()) &&
16971           "Unhandled target in LowerVAARG");
16972   assert(Op.getNode()->getNumOperands() == 4);
16973   SDValue Chain = Op.getOperand(0);
16974   SDValue SrcPtr = Op.getOperand(1);
16975   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16976   unsigned Align = Op.getConstantOperandVal(3);
16977   SDLoc dl(Op);
16978
16979   EVT ArgVT = Op.getNode()->getValueType(0);
16980   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16981   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16982   uint8_t ArgMode;
16983
16984   // Decide which area this value should be read from.
16985   // TODO: Implement the AMD64 ABI in its entirety. This simple
16986   // selection mechanism works only for the basic types.
16987   if (ArgVT == MVT::f80) {
16988     llvm_unreachable("va_arg for f80 not yet implemented");
16989   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16990     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16991   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16992     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16993   } else {
16994     llvm_unreachable("Unhandled argument type in LowerVAARG");
16995   }
16996
16997   if (ArgMode == 2) {
16998     // Sanity Check: Make sure using fp_offset makes sense.
16999     assert(!DAG.getTarget().Options.UseSoftFloat &&
17000            !(DAG.getMachineFunction()
17001                 .getFunction()->getAttributes()
17002                 .hasAttribute(AttributeSet::FunctionIndex,
17003                               Attribute::NoImplicitFloat)) &&
17004            Subtarget->hasSSE1());
17005   }
17006
17007   // Insert VAARG_64 node into the DAG
17008   // VAARG_64 returns two values: Variable Argument Address, Chain
17009   SmallVector<SDValue, 11> InstOps;
17010   InstOps.push_back(Chain);
17011   InstOps.push_back(SrcPtr);
17012   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17013   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17014   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17015   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
17016   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17017                                           VTs, InstOps, MVT::i64,
17018                                           MachinePointerInfo(SV),
17019                                           /*Align=*/0,
17020                                           /*Volatile=*/false,
17021                                           /*ReadMem=*/true,
17022                                           /*WriteMem=*/true);
17023   Chain = VAARG.getValue(1);
17024
17025   // Load the next argument and return it
17026   return DAG.getLoad(ArgVT, dl,
17027                      Chain,
17028                      VAARG,
17029                      MachinePointerInfo(),
17030                      false, false, false, 0);
17031 }
17032
17033 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17034                            SelectionDAG &DAG) {
17035   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17036   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17037   SDValue Chain = Op.getOperand(0);
17038   SDValue DstPtr = Op.getOperand(1);
17039   SDValue SrcPtr = Op.getOperand(2);
17040   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17041   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17042   SDLoc DL(Op);
17043
17044   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17045                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17046                        false,
17047                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17048 }
17049
17050 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17051 // amount is a constant. Takes immediate version of shift as input.
17052 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17053                                           SDValue SrcOp, uint64_t ShiftAmt,
17054                                           SelectionDAG &DAG) {
17055   MVT ElementType = VT.getVectorElementType();
17056
17057   // Fold this packed shift into its first operand if ShiftAmt is 0.
17058   if (ShiftAmt == 0)
17059     return SrcOp;
17060
17061   // Check for ShiftAmt >= element width
17062   if (ShiftAmt >= ElementType.getSizeInBits()) {
17063     if (Opc == X86ISD::VSRAI)
17064       ShiftAmt = ElementType.getSizeInBits() - 1;
17065     else
17066       return DAG.getConstant(0, VT);
17067   }
17068
17069   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17070          && "Unknown target vector shift-by-constant node");
17071
17072   // Fold this packed vector shift into a build vector if SrcOp is a
17073   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17074   if (VT == SrcOp.getSimpleValueType() &&
17075       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17076     SmallVector<SDValue, 8> Elts;
17077     unsigned NumElts = SrcOp->getNumOperands();
17078     ConstantSDNode *ND;
17079
17080     switch(Opc) {
17081     default: llvm_unreachable(nullptr);
17082     case X86ISD::VSHLI:
17083       for (unsigned i=0; i!=NumElts; ++i) {
17084         SDValue CurrentOp = SrcOp->getOperand(i);
17085         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17086           Elts.push_back(CurrentOp);
17087           continue;
17088         }
17089         ND = cast<ConstantSDNode>(CurrentOp);
17090         const APInt &C = ND->getAPIntValue();
17091         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17092       }
17093       break;
17094     case X86ISD::VSRLI:
17095       for (unsigned i=0; i!=NumElts; ++i) {
17096         SDValue CurrentOp = SrcOp->getOperand(i);
17097         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17098           Elts.push_back(CurrentOp);
17099           continue;
17100         }
17101         ND = cast<ConstantSDNode>(CurrentOp);
17102         const APInt &C = ND->getAPIntValue();
17103         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17104       }
17105       break;
17106     case X86ISD::VSRAI:
17107       for (unsigned i=0; i!=NumElts; ++i) {
17108         SDValue CurrentOp = SrcOp->getOperand(i);
17109         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17110           Elts.push_back(CurrentOp);
17111           continue;
17112         }
17113         ND = cast<ConstantSDNode>(CurrentOp);
17114         const APInt &C = ND->getAPIntValue();
17115         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17116       }
17117       break;
17118     }
17119
17120     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17121   }
17122
17123   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17124 }
17125
17126 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17127 // may or may not be a constant. Takes immediate version of shift as input.
17128 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17129                                    SDValue SrcOp, SDValue ShAmt,
17130                                    SelectionDAG &DAG) {
17131   MVT SVT = ShAmt.getSimpleValueType();
17132   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17133
17134   // Catch shift-by-constant.
17135   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17136     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17137                                       CShAmt->getZExtValue(), DAG);
17138
17139   // Change opcode to non-immediate version
17140   switch (Opc) {
17141     default: llvm_unreachable("Unknown target vector shift node");
17142     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17143     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17144     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17145   }
17146
17147   const X86Subtarget &Subtarget =
17148       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17149   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17150       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17151     // Let the shuffle legalizer expand this shift amount node.
17152     SDValue Op0 = ShAmt.getOperand(0);
17153     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17154     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17155   } else {
17156     // Need to build a vector containing shift amount.
17157     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17158     SmallVector<SDValue, 4> ShOps;
17159     ShOps.push_back(ShAmt);
17160     if (SVT == MVT::i32) {
17161       ShOps.push_back(DAG.getConstant(0, SVT));
17162       ShOps.push_back(DAG.getUNDEF(SVT));
17163     }
17164     ShOps.push_back(DAG.getUNDEF(SVT));
17165
17166     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17167     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17168   }
17169
17170   // The return type has to be a 128-bit type with the same element
17171   // type as the input type.
17172   MVT EltVT = VT.getVectorElementType();
17173   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17174
17175   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17176   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17177 }
17178
17179 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17180 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17181 /// necessary casting for \p Mask when lowering masking intrinsics.
17182 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17183                                     SDValue PreservedSrc,
17184                                     const X86Subtarget *Subtarget,
17185                                     SelectionDAG &DAG) {
17186     EVT VT = Op.getValueType();
17187     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17188                                   MVT::i1, VT.getVectorNumElements());
17189     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17190                                      Mask.getValueType().getSizeInBits());
17191     SDLoc dl(Op);
17192
17193     assert(MaskVT.isSimple() && "invalid mask type");
17194
17195     if (isAllOnes(Mask))
17196       return Op;
17197
17198     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17199     // are extracted by EXTRACT_SUBVECTOR.
17200     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17201                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17202                               DAG.getIntPtrConstant(0));
17203
17204     switch (Op.getOpcode()) {
17205       default: break;
17206       case X86ISD::PCMPEQM:
17207       case X86ISD::PCMPGTM:
17208       case X86ISD::CMPM:
17209       case X86ISD::CMPMU:
17210         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17211     }
17212     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17213       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17214     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17215 }
17216
17217 /// \brief Creates an SDNode for a predicated scalar operation.
17218 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17219 /// The mask is comming as MVT::i8 and it should be truncated
17220 /// to MVT::i1 while lowering masking intrinsics.
17221 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17222 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17223 /// a scalar instruction.
17224 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17225                                     SDValue PreservedSrc,
17226                                     const X86Subtarget *Subtarget,
17227                                     SelectionDAG &DAG) {
17228     if (isAllOnes(Mask))
17229       return Op;
17230
17231     EVT VT = Op.getValueType();
17232     SDLoc dl(Op);
17233     // The mask should be of type MVT::i1
17234     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17235
17236     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17237       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17238     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17239 }
17240
17241 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17242                                        SelectionDAG &DAG) {
17243   SDLoc dl(Op);
17244   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17245   EVT VT = Op.getValueType();
17246   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17247   if (IntrData) {
17248     switch(IntrData->Type) {
17249     case INTR_TYPE_1OP:
17250       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17251     case INTR_TYPE_2OP:
17252       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17253         Op.getOperand(2));
17254     case INTR_TYPE_3OP:
17255       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17256         Op.getOperand(2), Op.getOperand(3));
17257     case INTR_TYPE_1OP_MASK_RM: {
17258       SDValue Src = Op.getOperand(1);
17259       SDValue Src0 = Op.getOperand(2);
17260       SDValue Mask = Op.getOperand(3);
17261       SDValue RoundingMode = Op.getOperand(4);
17262       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17263                                               RoundingMode),
17264                                   Mask, Src0, Subtarget, DAG);
17265     }
17266     case INTR_TYPE_SCALAR_MASK_RM: {
17267       SDValue Src1 = Op.getOperand(1);
17268       SDValue Src2 = Op.getOperand(2);
17269       SDValue Src0 = Op.getOperand(3);
17270       SDValue Mask = Op.getOperand(4);
17271       SDValue RoundingMode = Op.getOperand(5);
17272       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17273                                               RoundingMode),
17274                                   Mask, Src0, Subtarget, DAG);
17275     }
17276     case INTR_TYPE_2OP_MASK: {
17277       SDValue Mask = Op.getOperand(4);
17278       SDValue PassThru = Op.getOperand(3);
17279       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17280       if (IntrWithRoundingModeOpcode != 0) {
17281         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17282         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17283           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17284                                       dl, Op.getValueType(),
17285                                       Op.getOperand(1), Op.getOperand(2),
17286                                       Op.getOperand(3), Op.getOperand(5)),
17287                                       Mask, PassThru, Subtarget, DAG);
17288         }
17289       }
17290       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17291                                               Op.getOperand(1),
17292                                               Op.getOperand(2)),
17293                                   Mask, PassThru, Subtarget, DAG);
17294     }
17295     case FMA_OP_MASK: {
17296       SDValue Src1 = Op.getOperand(1);
17297       SDValue Src2 = Op.getOperand(2);
17298       SDValue Src3 = Op.getOperand(3);
17299       SDValue Mask = Op.getOperand(4);
17300       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17301       if (IntrWithRoundingModeOpcode != 0) {
17302         SDValue Rnd = Op.getOperand(5);
17303         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17304             X86::STATIC_ROUNDING::CUR_DIRECTION)
17305           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17306                                                   dl, Op.getValueType(),
17307                                                   Src1, Src2, Src3, Rnd),
17308                                       Mask, Src1, Subtarget, DAG);
17309       }
17310       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17311                                               dl, Op.getValueType(),
17312                                               Src1, Src2, Src3),
17313                                   Mask, Src1, Subtarget, DAG);
17314     }
17315     case CMP_MASK:
17316     case CMP_MASK_CC: {
17317       // Comparison intrinsics with masks.
17318       // Example of transformation:
17319       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17320       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17321       // (i8 (bitcast
17322       //   (v8i1 (insert_subvector undef,
17323       //           (v2i1 (and (PCMPEQM %a, %b),
17324       //                      (extract_subvector
17325       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17326       EVT VT = Op.getOperand(1).getValueType();
17327       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17328                                     VT.getVectorNumElements());
17329       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17330       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17331                                        Mask.getValueType().getSizeInBits());
17332       SDValue Cmp;
17333       if (IntrData->Type == CMP_MASK_CC) {
17334         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17335                     Op.getOperand(2), Op.getOperand(3));
17336       } else {
17337         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17338         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17339                     Op.getOperand(2));
17340       }
17341       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17342                                              DAG.getTargetConstant(0, MaskVT),
17343                                              Subtarget, DAG);
17344       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17345                                 DAG.getUNDEF(BitcastVT), CmpMask,
17346                                 DAG.getIntPtrConstant(0));
17347       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17348     }
17349     case COMI: { // Comparison intrinsics
17350       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17351       SDValue LHS = Op.getOperand(1);
17352       SDValue RHS = Op.getOperand(2);
17353       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17354       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17355       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17356       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17357                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17358       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17359     }
17360     case VSHIFT:
17361       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17362                                  Op.getOperand(1), Op.getOperand(2), DAG);
17363     case VSHIFT_MASK:
17364       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17365                                                       Op.getSimpleValueType(),
17366                                                       Op.getOperand(1),
17367                                                       Op.getOperand(2), DAG),
17368                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17369                                   DAG);
17370     case COMPRESS_EXPAND_IN_REG: {
17371       SDValue Mask = Op.getOperand(3);
17372       SDValue DataToCompress = Op.getOperand(1);
17373       SDValue PassThru = Op.getOperand(2);
17374       if (isAllOnes(Mask)) // return data as is
17375         return Op.getOperand(1);
17376       EVT VT = Op.getValueType();
17377       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17378                                     VT.getVectorNumElements());
17379       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17380                                        Mask.getValueType().getSizeInBits());
17381       SDLoc dl(Op);
17382       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17383                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17384                                   DAG.getIntPtrConstant(0));
17385
17386       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17387                          PassThru);
17388     }
17389     case BLEND: {
17390       SDValue Mask = Op.getOperand(3);
17391       EVT VT = Op.getValueType();
17392       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17393                                     VT.getVectorNumElements());
17394       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17395                                        Mask.getValueType().getSizeInBits());
17396       SDLoc dl(Op);
17397       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17398                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17399                                   DAG.getIntPtrConstant(0));
17400       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17401                          Op.getOperand(2));
17402     }
17403     default:
17404       break;
17405     }
17406   }
17407
17408   switch (IntNo) {
17409   default: return SDValue();    // Don't custom lower most intrinsics.
17410
17411   case Intrinsic::x86_avx512_mask_valign_q_512:
17412   case Intrinsic::x86_avx512_mask_valign_d_512:
17413     // Vector source operands are swapped.
17414     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17415                                             Op.getValueType(), Op.getOperand(2),
17416                                             Op.getOperand(1),
17417                                             Op.getOperand(3)),
17418                                 Op.getOperand(5), Op.getOperand(4),
17419                                 Subtarget, DAG);
17420
17421   // ptest and testp intrinsics. The intrinsic these come from are designed to
17422   // return an integer value, not just an instruction so lower it to the ptest
17423   // or testp pattern and a setcc for the result.
17424   case Intrinsic::x86_sse41_ptestz:
17425   case Intrinsic::x86_sse41_ptestc:
17426   case Intrinsic::x86_sse41_ptestnzc:
17427   case Intrinsic::x86_avx_ptestz_256:
17428   case Intrinsic::x86_avx_ptestc_256:
17429   case Intrinsic::x86_avx_ptestnzc_256:
17430   case Intrinsic::x86_avx_vtestz_ps:
17431   case Intrinsic::x86_avx_vtestc_ps:
17432   case Intrinsic::x86_avx_vtestnzc_ps:
17433   case Intrinsic::x86_avx_vtestz_pd:
17434   case Intrinsic::x86_avx_vtestc_pd:
17435   case Intrinsic::x86_avx_vtestnzc_pd:
17436   case Intrinsic::x86_avx_vtestz_ps_256:
17437   case Intrinsic::x86_avx_vtestc_ps_256:
17438   case Intrinsic::x86_avx_vtestnzc_ps_256:
17439   case Intrinsic::x86_avx_vtestz_pd_256:
17440   case Intrinsic::x86_avx_vtestc_pd_256:
17441   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17442     bool IsTestPacked = false;
17443     unsigned X86CC;
17444     switch (IntNo) {
17445     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17446     case Intrinsic::x86_avx_vtestz_ps:
17447     case Intrinsic::x86_avx_vtestz_pd:
17448     case Intrinsic::x86_avx_vtestz_ps_256:
17449     case Intrinsic::x86_avx_vtestz_pd_256:
17450       IsTestPacked = true; // Fallthrough
17451     case Intrinsic::x86_sse41_ptestz:
17452     case Intrinsic::x86_avx_ptestz_256:
17453       // ZF = 1
17454       X86CC = X86::COND_E;
17455       break;
17456     case Intrinsic::x86_avx_vtestc_ps:
17457     case Intrinsic::x86_avx_vtestc_pd:
17458     case Intrinsic::x86_avx_vtestc_ps_256:
17459     case Intrinsic::x86_avx_vtestc_pd_256:
17460       IsTestPacked = true; // Fallthrough
17461     case Intrinsic::x86_sse41_ptestc:
17462     case Intrinsic::x86_avx_ptestc_256:
17463       // CF = 1
17464       X86CC = X86::COND_B;
17465       break;
17466     case Intrinsic::x86_avx_vtestnzc_ps:
17467     case Intrinsic::x86_avx_vtestnzc_pd:
17468     case Intrinsic::x86_avx_vtestnzc_ps_256:
17469     case Intrinsic::x86_avx_vtestnzc_pd_256:
17470       IsTestPacked = true; // Fallthrough
17471     case Intrinsic::x86_sse41_ptestnzc:
17472     case Intrinsic::x86_avx_ptestnzc_256:
17473       // ZF and CF = 0
17474       X86CC = X86::COND_A;
17475       break;
17476     }
17477
17478     SDValue LHS = Op.getOperand(1);
17479     SDValue RHS = Op.getOperand(2);
17480     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17481     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17482     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17483     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17484     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17485   }
17486   case Intrinsic::x86_avx512_kortestz_w:
17487   case Intrinsic::x86_avx512_kortestc_w: {
17488     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17489     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17490     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17491     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17492     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17493     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17494     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17495   }
17496
17497   case Intrinsic::x86_sse42_pcmpistria128:
17498   case Intrinsic::x86_sse42_pcmpestria128:
17499   case Intrinsic::x86_sse42_pcmpistric128:
17500   case Intrinsic::x86_sse42_pcmpestric128:
17501   case Intrinsic::x86_sse42_pcmpistrio128:
17502   case Intrinsic::x86_sse42_pcmpestrio128:
17503   case Intrinsic::x86_sse42_pcmpistris128:
17504   case Intrinsic::x86_sse42_pcmpestris128:
17505   case Intrinsic::x86_sse42_pcmpistriz128:
17506   case Intrinsic::x86_sse42_pcmpestriz128: {
17507     unsigned Opcode;
17508     unsigned X86CC;
17509     switch (IntNo) {
17510     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17511     case Intrinsic::x86_sse42_pcmpistria128:
17512       Opcode = X86ISD::PCMPISTRI;
17513       X86CC = X86::COND_A;
17514       break;
17515     case Intrinsic::x86_sse42_pcmpestria128:
17516       Opcode = X86ISD::PCMPESTRI;
17517       X86CC = X86::COND_A;
17518       break;
17519     case Intrinsic::x86_sse42_pcmpistric128:
17520       Opcode = X86ISD::PCMPISTRI;
17521       X86CC = X86::COND_B;
17522       break;
17523     case Intrinsic::x86_sse42_pcmpestric128:
17524       Opcode = X86ISD::PCMPESTRI;
17525       X86CC = X86::COND_B;
17526       break;
17527     case Intrinsic::x86_sse42_pcmpistrio128:
17528       Opcode = X86ISD::PCMPISTRI;
17529       X86CC = X86::COND_O;
17530       break;
17531     case Intrinsic::x86_sse42_pcmpestrio128:
17532       Opcode = X86ISD::PCMPESTRI;
17533       X86CC = X86::COND_O;
17534       break;
17535     case Intrinsic::x86_sse42_pcmpistris128:
17536       Opcode = X86ISD::PCMPISTRI;
17537       X86CC = X86::COND_S;
17538       break;
17539     case Intrinsic::x86_sse42_pcmpestris128:
17540       Opcode = X86ISD::PCMPESTRI;
17541       X86CC = X86::COND_S;
17542       break;
17543     case Intrinsic::x86_sse42_pcmpistriz128:
17544       Opcode = X86ISD::PCMPISTRI;
17545       X86CC = X86::COND_E;
17546       break;
17547     case Intrinsic::x86_sse42_pcmpestriz128:
17548       Opcode = X86ISD::PCMPESTRI;
17549       X86CC = X86::COND_E;
17550       break;
17551     }
17552     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17553     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17554     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17555     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17556                                 DAG.getConstant(X86CC, MVT::i8),
17557                                 SDValue(PCMP.getNode(), 1));
17558     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17559   }
17560
17561   case Intrinsic::x86_sse42_pcmpistri128:
17562   case Intrinsic::x86_sse42_pcmpestri128: {
17563     unsigned Opcode;
17564     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17565       Opcode = X86ISD::PCMPISTRI;
17566     else
17567       Opcode = X86ISD::PCMPESTRI;
17568
17569     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17570     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17571     return DAG.getNode(Opcode, dl, VTs, NewOps);
17572   }
17573   }
17574 }
17575
17576 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17577                               SDValue Src, SDValue Mask, SDValue Base,
17578                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17579                               const X86Subtarget * Subtarget) {
17580   SDLoc dl(Op);
17581   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17582   assert(C && "Invalid scale type");
17583   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17584   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17585                              Index.getSimpleValueType().getVectorNumElements());
17586   SDValue MaskInReg;
17587   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17588   if (MaskC)
17589     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17590   else
17591     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17592   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17593   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17594   SDValue Segment = DAG.getRegister(0, MVT::i32);
17595   if (Src.getOpcode() == ISD::UNDEF)
17596     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17597   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17598   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17599   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17600   return DAG.getMergeValues(RetOps, dl);
17601 }
17602
17603 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17604                                SDValue Src, SDValue Mask, SDValue Base,
17605                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17606   SDLoc dl(Op);
17607   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17608   assert(C && "Invalid scale type");
17609   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17610   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17611   SDValue Segment = DAG.getRegister(0, MVT::i32);
17612   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17613                              Index.getSimpleValueType().getVectorNumElements());
17614   SDValue MaskInReg;
17615   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17616   if (MaskC)
17617     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17618   else
17619     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17620   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17621   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17622   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17623   return SDValue(Res, 1);
17624 }
17625
17626 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17627                                SDValue Mask, SDValue Base, SDValue Index,
17628                                SDValue ScaleOp, SDValue Chain) {
17629   SDLoc dl(Op);
17630   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17631   assert(C && "Invalid scale type");
17632   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17633   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17634   SDValue Segment = DAG.getRegister(0, MVT::i32);
17635   EVT MaskVT =
17636     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17637   SDValue MaskInReg;
17638   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17639   if (MaskC)
17640     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17641   else
17642     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17643   //SDVTList VTs = DAG.getVTList(MVT::Other);
17644   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17645   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17646   return SDValue(Res, 0);
17647 }
17648
17649 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17650 // read performance monitor counters (x86_rdpmc).
17651 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17652                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17653                               SmallVectorImpl<SDValue> &Results) {
17654   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17655   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17656   SDValue LO, HI;
17657
17658   // The ECX register is used to select the index of the performance counter
17659   // to read.
17660   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17661                                    N->getOperand(2));
17662   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17663
17664   // Reads the content of a 64-bit performance counter and returns it in the
17665   // registers EDX:EAX.
17666   if (Subtarget->is64Bit()) {
17667     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17668     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17669                             LO.getValue(2));
17670   } else {
17671     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17672     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17673                             LO.getValue(2));
17674   }
17675   Chain = HI.getValue(1);
17676
17677   if (Subtarget->is64Bit()) {
17678     // The EAX register is loaded with the low-order 32 bits. The EDX register
17679     // is loaded with the supported high-order bits of the counter.
17680     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17681                               DAG.getConstant(32, MVT::i8));
17682     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17683     Results.push_back(Chain);
17684     return;
17685   }
17686
17687   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17688   SDValue Ops[] = { LO, HI };
17689   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17690   Results.push_back(Pair);
17691   Results.push_back(Chain);
17692 }
17693
17694 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17695 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17696 // also used to custom lower READCYCLECOUNTER nodes.
17697 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17698                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17699                               SmallVectorImpl<SDValue> &Results) {
17700   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17701   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17702   SDValue LO, HI;
17703
17704   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17705   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17706   // and the EAX register is loaded with the low-order 32 bits.
17707   if (Subtarget->is64Bit()) {
17708     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17709     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17710                             LO.getValue(2));
17711   } else {
17712     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17713     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17714                             LO.getValue(2));
17715   }
17716   SDValue Chain = HI.getValue(1);
17717
17718   if (Opcode == X86ISD::RDTSCP_DAG) {
17719     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17720
17721     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17722     // the ECX register. Add 'ecx' explicitly to the chain.
17723     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17724                                      HI.getValue(2));
17725     // Explicitly store the content of ECX at the location passed in input
17726     // to the 'rdtscp' intrinsic.
17727     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17728                          MachinePointerInfo(), false, false, 0);
17729   }
17730
17731   if (Subtarget->is64Bit()) {
17732     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17733     // the EAX register is loaded with the low-order 32 bits.
17734     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17735                               DAG.getConstant(32, MVT::i8));
17736     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17737     Results.push_back(Chain);
17738     return;
17739   }
17740
17741   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17742   SDValue Ops[] = { LO, HI };
17743   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17744   Results.push_back(Pair);
17745   Results.push_back(Chain);
17746 }
17747
17748 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17749                                      SelectionDAG &DAG) {
17750   SmallVector<SDValue, 2> Results;
17751   SDLoc DL(Op);
17752   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17753                           Results);
17754   return DAG.getMergeValues(Results, DL);
17755 }
17756
17757
17758 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17759                                       SelectionDAG &DAG) {
17760   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17761
17762   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17763   if (!IntrData)
17764     return SDValue();
17765
17766   SDLoc dl(Op);
17767   switch(IntrData->Type) {
17768   default:
17769     llvm_unreachable("Unknown Intrinsic Type");
17770     break;
17771   case RDSEED:
17772   case RDRAND: {
17773     // Emit the node with the right value type.
17774     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17775     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17776
17777     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17778     // Otherwise return the value from Rand, which is always 0, casted to i32.
17779     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17780                       DAG.getConstant(1, Op->getValueType(1)),
17781                       DAG.getConstant(X86::COND_B, MVT::i32),
17782                       SDValue(Result.getNode(), 1) };
17783     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17784                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17785                                   Ops);
17786
17787     // Return { result, isValid, chain }.
17788     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17789                        SDValue(Result.getNode(), 2));
17790   }
17791   case GATHER: {
17792   //gather(v1, mask, index, base, scale);
17793     SDValue Chain = Op.getOperand(0);
17794     SDValue Src   = Op.getOperand(2);
17795     SDValue Base  = Op.getOperand(3);
17796     SDValue Index = Op.getOperand(4);
17797     SDValue Mask  = Op.getOperand(5);
17798     SDValue Scale = Op.getOperand(6);
17799     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17800                           Subtarget);
17801   }
17802   case SCATTER: {
17803   //scatter(base, mask, index, v1, scale);
17804     SDValue Chain = Op.getOperand(0);
17805     SDValue Base  = Op.getOperand(2);
17806     SDValue Mask  = Op.getOperand(3);
17807     SDValue Index = Op.getOperand(4);
17808     SDValue Src   = Op.getOperand(5);
17809     SDValue Scale = Op.getOperand(6);
17810     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17811   }
17812   case PREFETCH: {
17813     SDValue Hint = Op.getOperand(6);
17814     unsigned HintVal;
17815     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17816         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17817       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17818     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17819     SDValue Chain = Op.getOperand(0);
17820     SDValue Mask  = Op.getOperand(2);
17821     SDValue Index = Op.getOperand(3);
17822     SDValue Base  = Op.getOperand(4);
17823     SDValue Scale = Op.getOperand(5);
17824     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17825   }
17826   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17827   case RDTSC: {
17828     SmallVector<SDValue, 2> Results;
17829     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17830     return DAG.getMergeValues(Results, dl);
17831   }
17832   // Read Performance Monitoring Counters.
17833   case RDPMC: {
17834     SmallVector<SDValue, 2> Results;
17835     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17836     return DAG.getMergeValues(Results, dl);
17837   }
17838   // XTEST intrinsics.
17839   case XTEST: {
17840     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17841     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17842     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17843                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17844                                 InTrans);
17845     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17846     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17847                        Ret, SDValue(InTrans.getNode(), 1));
17848   }
17849   // ADC/ADCX/SBB
17850   case ADX: {
17851     SmallVector<SDValue, 2> Results;
17852     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17853     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17854     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17855                                 DAG.getConstant(-1, MVT::i8));
17856     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17857                               Op.getOperand(4), GenCF.getValue(1));
17858     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17859                                  Op.getOperand(5), MachinePointerInfo(),
17860                                  false, false, 0);
17861     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17862                                 DAG.getConstant(X86::COND_B, MVT::i8),
17863                                 Res.getValue(1));
17864     Results.push_back(SetCC);
17865     Results.push_back(Store);
17866     return DAG.getMergeValues(Results, dl);
17867   }
17868   case COMPRESS_TO_MEM: {
17869     SDLoc dl(Op);
17870     SDValue Mask = Op.getOperand(4);
17871     SDValue DataToCompress = Op.getOperand(3);
17872     SDValue Addr = Op.getOperand(2);
17873     SDValue Chain = Op.getOperand(0);
17874
17875     if (isAllOnes(Mask)) // return just a store
17876       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17877                           MachinePointerInfo(), false, false, 0);
17878
17879     EVT VT = DataToCompress.getValueType();
17880     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17881                                   VT.getVectorNumElements());
17882     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17883                                      Mask.getValueType().getSizeInBits());
17884     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17885                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17886                                 DAG.getIntPtrConstant(0));
17887
17888     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17889                                       DataToCompress, DAG.getUNDEF(VT));
17890     return DAG.getStore(Chain, dl, Compressed, Addr,
17891                         MachinePointerInfo(), false, false, 0);
17892   }
17893   case EXPAND_FROM_MEM: {
17894     SDLoc dl(Op);
17895     SDValue Mask = Op.getOperand(4);
17896     SDValue PathThru = Op.getOperand(3);
17897     SDValue Addr = Op.getOperand(2);
17898     SDValue Chain = Op.getOperand(0);
17899     EVT VT = Op.getValueType();
17900
17901     if (isAllOnes(Mask)) // return just a load
17902       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17903                          false, 0);
17904     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17905                                   VT.getVectorNumElements());
17906     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17907                                      Mask.getValueType().getSizeInBits());
17908     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17909                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17910                                 DAG.getIntPtrConstant(0));
17911
17912     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17913                                    false, false, false, 0);
17914
17915     SmallVector<SDValue, 2> Results;
17916     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17917                                   PathThru));
17918     Results.push_back(Chain);
17919     return DAG.getMergeValues(Results, dl);
17920   }
17921   }
17922 }
17923
17924 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17925                                            SelectionDAG &DAG) const {
17926   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17927   MFI->setReturnAddressIsTaken(true);
17928
17929   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17930     return SDValue();
17931
17932   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17933   SDLoc dl(Op);
17934   EVT PtrVT = getPointerTy();
17935
17936   if (Depth > 0) {
17937     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17938     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17939     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17940     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17941                        DAG.getNode(ISD::ADD, dl, PtrVT,
17942                                    FrameAddr, Offset),
17943                        MachinePointerInfo(), false, false, false, 0);
17944   }
17945
17946   // Just load the return address.
17947   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17948   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17949                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17950 }
17951
17952 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17953   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17954   MFI->setFrameAddressIsTaken(true);
17955
17956   EVT VT = Op.getValueType();
17957   SDLoc dl(Op);  // FIXME probably not meaningful
17958   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17959   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17960   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17961       DAG.getMachineFunction());
17962   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17963           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17964          "Invalid Frame Register!");
17965   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17966   while (Depth--)
17967     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17968                             MachinePointerInfo(),
17969                             false, false, false, 0);
17970   return FrameAddr;
17971 }
17972
17973 // FIXME? Maybe this could be a TableGen attribute on some registers and
17974 // this table could be generated automatically from RegInfo.
17975 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17976                                               EVT VT) const {
17977   unsigned Reg = StringSwitch<unsigned>(RegName)
17978                        .Case("esp", X86::ESP)
17979                        .Case("rsp", X86::RSP)
17980                        .Default(0);
17981   if (Reg)
17982     return Reg;
17983   report_fatal_error("Invalid register name global variable");
17984 }
17985
17986 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17987                                                      SelectionDAG &DAG) const {
17988   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17989   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17990 }
17991
17992 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17993   SDValue Chain     = Op.getOperand(0);
17994   SDValue Offset    = Op.getOperand(1);
17995   SDValue Handler   = Op.getOperand(2);
17996   SDLoc dl      (Op);
17997
17998   EVT PtrVT = getPointerTy();
17999   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18000   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18001   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18002           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18003          "Invalid Frame Register!");
18004   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18005   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18006
18007   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18008                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18009   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18010   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18011                        false, false, 0);
18012   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18013
18014   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18015                      DAG.getRegister(StoreAddrReg, PtrVT));
18016 }
18017
18018 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18019                                                SelectionDAG &DAG) const {
18020   SDLoc DL(Op);
18021   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18022                      DAG.getVTList(MVT::i32, MVT::Other),
18023                      Op.getOperand(0), Op.getOperand(1));
18024 }
18025
18026 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18027                                                 SelectionDAG &DAG) const {
18028   SDLoc DL(Op);
18029   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18030                      Op.getOperand(0), Op.getOperand(1));
18031 }
18032
18033 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18034   return Op.getOperand(0);
18035 }
18036
18037 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18038                                                 SelectionDAG &DAG) const {
18039   SDValue Root = Op.getOperand(0);
18040   SDValue Trmp = Op.getOperand(1); // trampoline
18041   SDValue FPtr = Op.getOperand(2); // nested function
18042   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18043   SDLoc dl (Op);
18044
18045   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18046   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18047
18048   if (Subtarget->is64Bit()) {
18049     SDValue OutChains[6];
18050
18051     // Large code-model.
18052     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18053     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18054
18055     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18056     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18057
18058     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18059
18060     // Load the pointer to the nested function into R11.
18061     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18062     SDValue Addr = Trmp;
18063     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18064                                 Addr, MachinePointerInfo(TrmpAddr),
18065                                 false, false, 0);
18066
18067     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18068                        DAG.getConstant(2, MVT::i64));
18069     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18070                                 MachinePointerInfo(TrmpAddr, 2),
18071                                 false, false, 2);
18072
18073     // Load the 'nest' parameter value into R10.
18074     // R10 is specified in X86CallingConv.td
18075     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18076     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18077                        DAG.getConstant(10, MVT::i64));
18078     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18079                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18080                                 false, false, 0);
18081
18082     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18083                        DAG.getConstant(12, MVT::i64));
18084     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18085                                 MachinePointerInfo(TrmpAddr, 12),
18086                                 false, false, 2);
18087
18088     // Jump to the nested function.
18089     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18090     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18091                        DAG.getConstant(20, MVT::i64));
18092     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18093                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18094                                 false, false, 0);
18095
18096     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18097     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18098                        DAG.getConstant(22, MVT::i64));
18099     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18100                                 MachinePointerInfo(TrmpAddr, 22),
18101                                 false, false, 0);
18102
18103     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18104   } else {
18105     const Function *Func =
18106       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18107     CallingConv::ID CC = Func->getCallingConv();
18108     unsigned NestReg;
18109
18110     switch (CC) {
18111     default:
18112       llvm_unreachable("Unsupported calling convention");
18113     case CallingConv::C:
18114     case CallingConv::X86_StdCall: {
18115       // Pass 'nest' parameter in ECX.
18116       // Must be kept in sync with X86CallingConv.td
18117       NestReg = X86::ECX;
18118
18119       // Check that ECX wasn't needed by an 'inreg' parameter.
18120       FunctionType *FTy = Func->getFunctionType();
18121       const AttributeSet &Attrs = Func->getAttributes();
18122
18123       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18124         unsigned InRegCount = 0;
18125         unsigned Idx = 1;
18126
18127         for (FunctionType::param_iterator I = FTy->param_begin(),
18128              E = FTy->param_end(); I != E; ++I, ++Idx)
18129           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18130             // FIXME: should only count parameters that are lowered to integers.
18131             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18132
18133         if (InRegCount > 2) {
18134           report_fatal_error("Nest register in use - reduce number of inreg"
18135                              " parameters!");
18136         }
18137       }
18138       break;
18139     }
18140     case CallingConv::X86_FastCall:
18141     case CallingConv::X86_ThisCall:
18142     case CallingConv::Fast:
18143       // Pass 'nest' parameter in EAX.
18144       // Must be kept in sync with X86CallingConv.td
18145       NestReg = X86::EAX;
18146       break;
18147     }
18148
18149     SDValue OutChains[4];
18150     SDValue Addr, Disp;
18151
18152     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18153                        DAG.getConstant(10, MVT::i32));
18154     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18155
18156     // This is storing the opcode for MOV32ri.
18157     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18158     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18159     OutChains[0] = DAG.getStore(Root, dl,
18160                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18161                                 Trmp, MachinePointerInfo(TrmpAddr),
18162                                 false, false, 0);
18163
18164     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18165                        DAG.getConstant(1, MVT::i32));
18166     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18167                                 MachinePointerInfo(TrmpAddr, 1),
18168                                 false, false, 1);
18169
18170     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18171     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18172                        DAG.getConstant(5, MVT::i32));
18173     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18174                                 MachinePointerInfo(TrmpAddr, 5),
18175                                 false, false, 1);
18176
18177     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18178                        DAG.getConstant(6, MVT::i32));
18179     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18180                                 MachinePointerInfo(TrmpAddr, 6),
18181                                 false, false, 1);
18182
18183     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18184   }
18185 }
18186
18187 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18188                                             SelectionDAG &DAG) const {
18189   /*
18190    The rounding mode is in bits 11:10 of FPSR, and has the following
18191    settings:
18192      00 Round to nearest
18193      01 Round to -inf
18194      10 Round to +inf
18195      11 Round to 0
18196
18197   FLT_ROUNDS, on the other hand, expects the following:
18198     -1 Undefined
18199      0 Round to 0
18200      1 Round to nearest
18201      2 Round to +inf
18202      3 Round to -inf
18203
18204   To perform the conversion, we do:
18205     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18206   */
18207
18208   MachineFunction &MF = DAG.getMachineFunction();
18209   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18210   unsigned StackAlignment = TFI.getStackAlignment();
18211   MVT VT = Op.getSimpleValueType();
18212   SDLoc DL(Op);
18213
18214   // Save FP Control Word to stack slot
18215   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18216   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18217
18218   MachineMemOperand *MMO =
18219    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18220                            MachineMemOperand::MOStore, 2, 2);
18221
18222   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18223   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18224                                           DAG.getVTList(MVT::Other),
18225                                           Ops, MVT::i16, MMO);
18226
18227   // Load FP Control Word from stack slot
18228   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18229                             MachinePointerInfo(), false, false, false, 0);
18230
18231   // Transform as necessary
18232   SDValue CWD1 =
18233     DAG.getNode(ISD::SRL, DL, MVT::i16,
18234                 DAG.getNode(ISD::AND, DL, MVT::i16,
18235                             CWD, DAG.getConstant(0x800, MVT::i16)),
18236                 DAG.getConstant(11, MVT::i8));
18237   SDValue CWD2 =
18238     DAG.getNode(ISD::SRL, DL, MVT::i16,
18239                 DAG.getNode(ISD::AND, DL, MVT::i16,
18240                             CWD, DAG.getConstant(0x400, MVT::i16)),
18241                 DAG.getConstant(9, MVT::i8));
18242
18243   SDValue RetVal =
18244     DAG.getNode(ISD::AND, DL, MVT::i16,
18245                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18246                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18247                             DAG.getConstant(1, MVT::i16)),
18248                 DAG.getConstant(3, MVT::i16));
18249
18250   return DAG.getNode((VT.getSizeInBits() < 16 ?
18251                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18252 }
18253
18254 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18255   MVT VT = Op.getSimpleValueType();
18256   EVT OpVT = VT;
18257   unsigned NumBits = VT.getSizeInBits();
18258   SDLoc dl(Op);
18259
18260   Op = Op.getOperand(0);
18261   if (VT == MVT::i8) {
18262     // Zero extend to i32 since there is not an i8 bsr.
18263     OpVT = MVT::i32;
18264     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18265   }
18266
18267   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18268   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18269   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18270
18271   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18272   SDValue Ops[] = {
18273     Op,
18274     DAG.getConstant(NumBits+NumBits-1, OpVT),
18275     DAG.getConstant(X86::COND_E, MVT::i8),
18276     Op.getValue(1)
18277   };
18278   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18279
18280   // Finally xor with NumBits-1.
18281   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18282
18283   if (VT == MVT::i8)
18284     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18285   return Op;
18286 }
18287
18288 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18289   MVT VT = Op.getSimpleValueType();
18290   EVT OpVT = VT;
18291   unsigned NumBits = VT.getSizeInBits();
18292   SDLoc dl(Op);
18293
18294   Op = Op.getOperand(0);
18295   if (VT == MVT::i8) {
18296     // Zero extend to i32 since there is not an i8 bsr.
18297     OpVT = MVT::i32;
18298     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18299   }
18300
18301   // Issue a bsr (scan bits in reverse).
18302   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18303   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18304
18305   // And xor with NumBits-1.
18306   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18307
18308   if (VT == MVT::i8)
18309     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18310   return Op;
18311 }
18312
18313 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18314   MVT VT = Op.getSimpleValueType();
18315   unsigned NumBits = VT.getSizeInBits();
18316   SDLoc dl(Op);
18317   Op = Op.getOperand(0);
18318
18319   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18320   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18321   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18322
18323   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18324   SDValue Ops[] = {
18325     Op,
18326     DAG.getConstant(NumBits, VT),
18327     DAG.getConstant(X86::COND_E, MVT::i8),
18328     Op.getValue(1)
18329   };
18330   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18331 }
18332
18333 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18334 // ones, and then concatenate the result back.
18335 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18336   MVT VT = Op.getSimpleValueType();
18337
18338   assert(VT.is256BitVector() && VT.isInteger() &&
18339          "Unsupported value type for operation");
18340
18341   unsigned NumElems = VT.getVectorNumElements();
18342   SDLoc dl(Op);
18343
18344   // Extract the LHS vectors
18345   SDValue LHS = Op.getOperand(0);
18346   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18347   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18348
18349   // Extract the RHS vectors
18350   SDValue RHS = Op.getOperand(1);
18351   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18352   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18353
18354   MVT EltVT = VT.getVectorElementType();
18355   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18356
18357   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18358                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18359                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18360 }
18361
18362 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18363   assert(Op.getSimpleValueType().is256BitVector() &&
18364          Op.getSimpleValueType().isInteger() &&
18365          "Only handle AVX 256-bit vector integer operation");
18366   return Lower256IntArith(Op, DAG);
18367 }
18368
18369 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18370   assert(Op.getSimpleValueType().is256BitVector() &&
18371          Op.getSimpleValueType().isInteger() &&
18372          "Only handle AVX 256-bit vector integer operation");
18373   return Lower256IntArith(Op, DAG);
18374 }
18375
18376 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18377                         SelectionDAG &DAG) {
18378   SDLoc dl(Op);
18379   MVT VT = Op.getSimpleValueType();
18380
18381   // Decompose 256-bit ops into smaller 128-bit ops.
18382   if (VT.is256BitVector() && !Subtarget->hasInt256())
18383     return Lower256IntArith(Op, DAG);
18384
18385   SDValue A = Op.getOperand(0);
18386   SDValue B = Op.getOperand(1);
18387
18388   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18389   if (VT == MVT::v4i32) {
18390     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18391            "Should not custom lower when pmuldq is available!");
18392
18393     // Extract the odd parts.
18394     static const int UnpackMask[] = { 1, -1, 3, -1 };
18395     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18396     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18397
18398     // Multiply the even parts.
18399     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18400     // Now multiply odd parts.
18401     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18402
18403     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18404     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18405
18406     // Merge the two vectors back together with a shuffle. This expands into 2
18407     // shuffles.
18408     static const int ShufMask[] = { 0, 4, 2, 6 };
18409     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18410   }
18411
18412   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18413          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18414
18415   //  Ahi = psrlqi(a, 32);
18416   //  Bhi = psrlqi(b, 32);
18417   //
18418   //  AloBlo = pmuludq(a, b);
18419   //  AloBhi = pmuludq(a, Bhi);
18420   //  AhiBlo = pmuludq(Ahi, b);
18421
18422   //  AloBhi = psllqi(AloBhi, 32);
18423   //  AhiBlo = psllqi(AhiBlo, 32);
18424   //  return AloBlo + AloBhi + AhiBlo;
18425
18426   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18427   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18428
18429   // Bit cast to 32-bit vectors for MULUDQ
18430   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18431                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18432   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18433   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18434   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18435   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18436
18437   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18438   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18439   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18440
18441   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18442   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18443
18444   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18445   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18446 }
18447
18448 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18449   assert(Subtarget->isTargetWin64() && "Unexpected target");
18450   EVT VT = Op.getValueType();
18451   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18452          "Unexpected return type for lowering");
18453
18454   RTLIB::Libcall LC;
18455   bool isSigned;
18456   switch (Op->getOpcode()) {
18457   default: llvm_unreachable("Unexpected request for libcall!");
18458   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18459   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18460   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18461   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18462   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18463   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18464   }
18465
18466   SDLoc dl(Op);
18467   SDValue InChain = DAG.getEntryNode();
18468
18469   TargetLowering::ArgListTy Args;
18470   TargetLowering::ArgListEntry Entry;
18471   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18472     EVT ArgVT = Op->getOperand(i).getValueType();
18473     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18474            "Unexpected argument type for lowering");
18475     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18476     Entry.Node = StackPtr;
18477     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18478                            false, false, 16);
18479     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18480     Entry.Ty = PointerType::get(ArgTy,0);
18481     Entry.isSExt = false;
18482     Entry.isZExt = false;
18483     Args.push_back(Entry);
18484   }
18485
18486   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18487                                          getPointerTy());
18488
18489   TargetLowering::CallLoweringInfo CLI(DAG);
18490   CLI.setDebugLoc(dl).setChain(InChain)
18491     .setCallee(getLibcallCallingConv(LC),
18492                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18493                Callee, std::move(Args), 0)
18494     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18495
18496   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18497   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18498 }
18499
18500 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18501                              SelectionDAG &DAG) {
18502   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18503   EVT VT = Op0.getValueType();
18504   SDLoc dl(Op);
18505
18506   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18507          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18508
18509   // PMULxD operations multiply each even value (starting at 0) of LHS with
18510   // the related value of RHS and produce a widen result.
18511   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18512   // => <2 x i64> <ae|cg>
18513   //
18514   // In other word, to have all the results, we need to perform two PMULxD:
18515   // 1. one with the even values.
18516   // 2. one with the odd values.
18517   // To achieve #2, with need to place the odd values at an even position.
18518   //
18519   // Place the odd value at an even position (basically, shift all values 1
18520   // step to the left):
18521   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18522   // <a|b|c|d> => <b|undef|d|undef>
18523   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18524   // <e|f|g|h> => <f|undef|h|undef>
18525   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18526
18527   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18528   // ints.
18529   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18530   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18531   unsigned Opcode =
18532       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18533   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18534   // => <2 x i64> <ae|cg>
18535   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18536                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18537   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18538   // => <2 x i64> <bf|dh>
18539   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18540                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18541
18542   // Shuffle it back into the right order.
18543   SDValue Highs, Lows;
18544   if (VT == MVT::v8i32) {
18545     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18546     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18547     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18548     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18549   } else {
18550     const int HighMask[] = {1, 5, 3, 7};
18551     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18552     const int LowMask[] = {0, 4, 2, 6};
18553     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18554   }
18555
18556   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18557   // unsigned multiply.
18558   if (IsSigned && !Subtarget->hasSSE41()) {
18559     SDValue ShAmt =
18560         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18561     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18562                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18563     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18564                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18565
18566     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18567     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18568   }
18569
18570   // The first result of MUL_LOHI is actually the low value, followed by the
18571   // high value.
18572   SDValue Ops[] = {Lows, Highs};
18573   return DAG.getMergeValues(Ops, dl);
18574 }
18575
18576 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18577                                          const X86Subtarget *Subtarget) {
18578   MVT VT = Op.getSimpleValueType();
18579   SDLoc dl(Op);
18580   SDValue R = Op.getOperand(0);
18581   SDValue Amt = Op.getOperand(1);
18582
18583   // Optimize shl/srl/sra with constant shift amount.
18584   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18585     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18586       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18587
18588       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18589           (Subtarget->hasInt256() &&
18590            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18591           (Subtarget->hasAVX512() &&
18592            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18593         if (Op.getOpcode() == ISD::SHL)
18594           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18595                                             DAG);
18596         if (Op.getOpcode() == ISD::SRL)
18597           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18598                                             DAG);
18599         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18600           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18601                                             DAG);
18602       }
18603
18604       if (VT == MVT::v16i8) {
18605         if (Op.getOpcode() == ISD::SHL) {
18606           // Make a large shift.
18607           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18608                                                    MVT::v8i16, R, ShiftAmt,
18609                                                    DAG);
18610           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18611           // Zero out the rightmost bits.
18612           SmallVector<SDValue, 16> V(16,
18613                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18614                                                      MVT::i8));
18615           return DAG.getNode(ISD::AND, dl, VT, SHL,
18616                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18617         }
18618         if (Op.getOpcode() == ISD::SRL) {
18619           // Make a large shift.
18620           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18621                                                    MVT::v8i16, R, ShiftAmt,
18622                                                    DAG);
18623           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18624           // Zero out the leftmost bits.
18625           SmallVector<SDValue, 16> V(16,
18626                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18627                                                      MVT::i8));
18628           return DAG.getNode(ISD::AND, dl, VT, SRL,
18629                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18630         }
18631         if (Op.getOpcode() == ISD::SRA) {
18632           if (ShiftAmt == 7) {
18633             // R s>> 7  ===  R s< 0
18634             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18635             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18636           }
18637
18638           // R s>> a === ((R u>> a) ^ m) - m
18639           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18640           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18641                                                          MVT::i8));
18642           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18643           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18644           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18645           return Res;
18646         }
18647         llvm_unreachable("Unknown shift opcode.");
18648       }
18649
18650       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18651         if (Op.getOpcode() == ISD::SHL) {
18652           // Make a large shift.
18653           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18654                                                    MVT::v16i16, R, ShiftAmt,
18655                                                    DAG);
18656           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18657           // Zero out the rightmost bits.
18658           SmallVector<SDValue, 32> V(32,
18659                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18660                                                      MVT::i8));
18661           return DAG.getNode(ISD::AND, dl, VT, SHL,
18662                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18663         }
18664         if (Op.getOpcode() == ISD::SRL) {
18665           // Make a large shift.
18666           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18667                                                    MVT::v16i16, R, ShiftAmt,
18668                                                    DAG);
18669           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18670           // Zero out the leftmost bits.
18671           SmallVector<SDValue, 32> V(32,
18672                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18673                                                      MVT::i8));
18674           return DAG.getNode(ISD::AND, dl, VT, SRL,
18675                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18676         }
18677         if (Op.getOpcode() == ISD::SRA) {
18678           if (ShiftAmt == 7) {
18679             // R s>> 7  ===  R s< 0
18680             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18681             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18682           }
18683
18684           // R s>> a === ((R u>> a) ^ m) - m
18685           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18686           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18687                                                          MVT::i8));
18688           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18689           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18690           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18691           return Res;
18692         }
18693         llvm_unreachable("Unknown shift opcode.");
18694       }
18695     }
18696   }
18697
18698   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18699   if (!Subtarget->is64Bit() &&
18700       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18701       Amt.getOpcode() == ISD::BITCAST &&
18702       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18703     Amt = Amt.getOperand(0);
18704     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18705                      VT.getVectorNumElements();
18706     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18707     uint64_t ShiftAmt = 0;
18708     for (unsigned i = 0; i != Ratio; ++i) {
18709       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18710       if (!C)
18711         return SDValue();
18712       // 6 == Log2(64)
18713       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18714     }
18715     // Check remaining shift amounts.
18716     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18717       uint64_t ShAmt = 0;
18718       for (unsigned j = 0; j != Ratio; ++j) {
18719         ConstantSDNode *C =
18720           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18721         if (!C)
18722           return SDValue();
18723         // 6 == Log2(64)
18724         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18725       }
18726       if (ShAmt != ShiftAmt)
18727         return SDValue();
18728     }
18729     switch (Op.getOpcode()) {
18730     default:
18731       llvm_unreachable("Unknown shift opcode!");
18732     case ISD::SHL:
18733       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18734                                         DAG);
18735     case ISD::SRL:
18736       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18737                                         DAG);
18738     case ISD::SRA:
18739       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18740                                         DAG);
18741     }
18742   }
18743
18744   return SDValue();
18745 }
18746
18747 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18748                                         const X86Subtarget* Subtarget) {
        // Tries to lower a vector SHL/SRL/SRA whose shift amount is the same in
        // every lane (a splat) into a single X86 shift node that takes one
        // uniform amount. Returns an empty SDValue when neither pattern below
        // matches, so that the caller can try something else.
18749   MVT VT = Op.getSimpleValueType();
18750   SDLoc dl(Op);
18751   SDValue R = Op.getOperand(0);
18752   SDValue Amt = Op.getOperand(1);
18753
        // First pattern: look for a splat amount. v2i64/v4i32/v8i16 are always
        // considered, v4i64/v8i32/v16i16 need hasInt256() and v8i64/v16i32 need
        // hasAVX512(). Arithmetic right shifts of v2i64 and v4i64 are excluded.
18754   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18755       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18756       (Subtarget->hasInt256() &&
18757        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18758         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18759        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
          // Scalar shift amount shared by all lanes; stays empty if none is found.
18760     SDValue BaseShAmt;
18761     EVT EltVT = VT.getVectorElementType();
18762
18763     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18764       // Check if this build_vector node is doing a splat.
18765       // If so, then set BaseShAmt equal to the splat value.
18766       BaseShAmt = BV->getSplatValue();
            // An undef splat value is not a usable amount; treat it as "no splat".
18767       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18768         BaseShAmt = SDValue();
18769     } else {
            // Look through an EXTRACT_SUBVECTOR to the vector it extracts from.
18770       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18771         Amt = Amt.getOperand(0);
18772
18773       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18774       if (SVN && SVN->isSplat()) {
18775         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18776         SDValue InVec = Amt.getOperand(0);
              // If the splatted lane is directly a BUILD_VECTOR operand, or the
              // value inserted by an INSERT_VECTOR_ELT at that same constant
              // index, use that scalar as the amount.
18777         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18778           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18779                  "Unexpected shuffle index found!");
18780           BaseShAmt = InVec.getOperand(SplatIdx);
18781         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18782            if (ConstantSDNode *C =
18783                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18784              if (C->getZExtValue() == SplatIdx)
18785                BaseShAmt = InVec.getOperand(1);
18786            }
18787         }
18788
18789         if (!BaseShAmt)
18790           // Avoid introducing an extract element from a shuffle.
18791           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18792                                     DAG.getIntPtrConstant(SplatIdx));
18793       }
18794     }
18795
18796     if (BaseShAmt.getNode()) {
18797       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
            // Widen the scalar amount by zero extension: element types narrower
            // than 32 bits go to i32, element types strictly between 32 and 64
            // bits go to i64. i32 and i64 amounts are used as they are.
18798       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18799         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18800       else if (EltVT.bitsLT(MVT::i32))
18801         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18802
            // Every inner switch returns on all paths, so the outer cases never
            // fall through into each other.
18803       switch (Op.getOpcode()) {
18804       default:
18805         llvm_unreachable("Unknown shift opcode!");
18806       case ISD::SHL:
18807         switch (VT.SimpleTy) {
18808         default: return SDValue();
18809         case MVT::v2i64:
18810         case MVT::v4i32:
18811         case MVT::v8i16:
18812         case MVT::v4i64:
18813         case MVT::v8i32:
18814         case MVT::v16i16:
18815         case MVT::v16i32:
18816         case MVT::v8i64:
18817           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18818         }
18819       case ISD::SRA:
18820         switch (VT.SimpleTy) {
18821         default: return SDValue();
18822         case MVT::v4i32:
18823         case MVT::v8i16:
18824         case MVT::v8i32:
18825         case MVT::v16i16:
18826         case MVT::v16i32:
18827         case MVT::v8i64:
18828           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18829         }
18830       case ISD::SRL:
18831         switch (VT.SimpleTy) {
18832         default: return SDValue();
18833         case MVT::v2i64:
18834         case MVT::v4i32:
18835         case MVT::v8i16:
18836         case MVT::v4i64:
18837         case MVT::v8i32:
18838         case MVT::v16i16:
18839         case MVT::v16i32:
18840         case MVT::v8i64:
18841           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18842         }
18843       }
18844     }
18845   }
18846
18847   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18848   if (!Subtarget->is64Bit() &&
18849       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18850       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18851       Amt.getOpcode() == ISD::BITCAST &&
18852       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
          // Amt is a bitcast of a BUILD_VECTOR that has Ratio operands per element
          // of VT. Remember the first group of Ratio operands and give up unless
          // every following group is identical to it, i.e. unless the amount is a
          // splat when viewed at VT's element granularity.
18853     Amt = Amt.getOperand(0);
18854     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18855                      VT.getVectorNumElements();
18856     std::vector<SDValue> Vals(Ratio);
18857     for (unsigned i = 0; i != Ratio; ++i)
18858       Vals[i] = Amt.getOperand(i);
18859     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18860       for (unsigned j = 0; j != Ratio; ++j)
18861         if (Vals[j] != Amt.getOperand(i + j))
18862           return SDValue();
18863     }
          // Note that the shift nodes below receive the original amount operand
          // (Op.getOperand(1), still the bitcast), not the stripped BUILD_VECTOR.
18864     switch (Op.getOpcode()) {
18865     default:
18866       llvm_unreachable("Unknown shift opcode!");
18867     case ISD::SHL:
18868       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18869     case ISD::SRL:
18870       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18871     case ISD::SRA:
18872       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18873     }
18874   }
18875
18876   return SDValue();
18877 }
18878
18879 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18880                           SelectionDAG &DAG) {
        // Custom lowering entry point for vector SHL/SRL/SRA. A series of
        // strategies is tried in order and the first one that applies returns.
        // An empty SDValue is returned when none of them applies.
18881   MVT VT = Op.getSimpleValueType();
18882   SDLoc dl(Op);
18883   SDValue R = Op.getOperand(0);
18884   SDValue Amt = Op.getOperand(1);
18885   SDValue V;
18886
18887   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18888   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18889
        // Try the uniform-amount helpers first. LowerScalarImmediateShift is
        // defined earlier in the file (presumably it handles constant splat
        // amounts -- confirm there).
18890   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18891   if (V.getNode())
18892     return V;
18893
18894   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18895   if (V.getNode())
18896       return V;
18897
        // Returning Op itself keeps the generic shift node unchanged for these
        // types (presumably so it is selected directly -- confirm).
18898   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18899     return Op;
18900   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18901   if (Subtarget->hasInt256()) {
18902     if (Op.getOpcode() == ISD::SRL &&
18903         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18904          VT == MVT::v4i64 || VT == MVT::v8i32))
18905       return Op;
18906     if (Op.getOpcode() == ISD::SHL &&
18907         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18908          VT == MVT::v4i64 || VT == MVT::v8i32))
18909       return Op;
18910     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18911       return Op;
18912   }
18913
18914   // If possible, lower this packed shift into a vector multiply instead of
18915   // expanding it into a sequence of scalar shifts.
18916   // Do this only if the vector shift count is a constant build_vector.
18917   if (Op.getOpcode() == ISD::SHL &&
18918       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18919        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18920       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18921     SmallVector<SDValue, 8> Elts;
18922     EVT SVT = VT.getScalarType();
18923     unsigned SVTBits = SVT.getSizeInBits();
18924     const APInt &One = APInt(SVTBits, 1);
18925     unsigned NumElems = VT.getVectorNumElements();
18926
          // Build the per-lane multiplier (1 << amount). Undef amounts stay undef
          // and amounts >= the element width become undef multipliers.
18927     for (unsigned i=0; i !=NumElems; ++i) {
            // Note: this local 'Op' shadows the function parameter inside the loop.
18928       SDValue Op = Amt->getOperand(i);
18929       if (Op->getOpcode() == ISD::UNDEF) {
18930         Elts.push_back(Op);
18931         continue;
18932       }
18933
18934       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18935       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18936       uint64_t ShAmt = C.getZExtValue();
18937       if (ShAmt >= SVTBits) {
18938         Elts.push_back(DAG.getUNDEF(SVT));
18939         continue;
18940       }
18941       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18942     }
18943     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18944     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18945   }
18946
18947   // Lower SHL with variable shift amount.
        // The multiplier 2^Amt is built through floating point: Amt << 23 puts the
        // amount into the exponent field of a single-precision float, adding
        // 0x3f800000 (the bit pattern of 1.0f) applies the exponent bias, and
        // FP_TO_SINT converts the resulting float back to an integer. The shift
        // is then R * 2^Amt.
18948   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18949     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18950
18951     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18952     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18953     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18954     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18955   }
18956
18957   // If possible, lower this shift as a sequence of two shifts by
18958   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18959   // Example:
18960   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18961   //
18962   // Could be rewritten as:
18963   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18964   //
18965   // The advantage is that the two shifts from the example would be
18966   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18967   // the vector shift into four scalar shifts plus four pairs of vector
18968   // insert/extract.
18969   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18970       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18971     unsigned TargetOpcode = X86ISD::MOVSS;
18972     bool CanBeSimplified;
18973     // The splat value for the first packed shift (the 'X' from the example).
18974     SDValue Amt1 = Amt->getOperand(0);
18975     // The splat value for the second packed shift (the 'Y' from the example).
18976     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18977                                         Amt->getOperand(2);
18978
18979     // See if it is possible to replace this node with a sequence of
18980     // two shifts followed by a MOVSS/MOVSD
18981     if (VT == MVT::v4i32) {
18982       // Check if it is legal to use a MOVSS.
18983       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18984                         Amt2 == Amt->getOperand(3);
18985       if (!CanBeSimplified) {
18986         // Otherwise, check if we can still simplify this node using a MOVSD.
18987         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18988                           Amt->getOperand(2) == Amt->getOperand(3);
18989         TargetOpcode = X86ISD::MOVSD;
18990         Amt2 = Amt->getOperand(2);
18991       }
18992     } else {
18993       // Do similar checks for the case where the machine value type
18994       // is MVT::v8i16.
            // MOVSS form: lanes 0-1 share Amt1 and lanes 2-7 share Amt2.
18995       CanBeSimplified = Amt1 == Amt->getOperand(1);
18996       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18997         CanBeSimplified = Amt2 == Amt->getOperand(i);
18998
            // MOVSD form: lanes 0-3 share Amt1 and lanes 4-7 share Amt2.
18999       if (!CanBeSimplified) {
19000         TargetOpcode = X86ISD::MOVSD;
19001         CanBeSimplified = true;
19002         Amt2 = Amt->getOperand(4);
19003         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19004           CanBeSimplified = Amt1 == Amt->getOperand(i);
19005         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19006           CanBeSimplified = Amt2 == Amt->getOperand(j);
19007       }
19008     }
19009
          // The isa<> checks reject the case where one of the two amounts is undef.
19010     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19011         isa<ConstantSDNode>(Amt2)) {
19012       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19013       EVT CastVT = MVT::v4i32;
19014       SDValue Splat1 =
19015         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19016       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19017       SDValue Splat2 =
19018         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19019       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19020       if (TargetOpcode == X86ISD::MOVSD)
19021         CastVT = MVT::v2i64;
            // The shuffle takes the 'Y' shift first and the 'X' shift second.
19022       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19023       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19024       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19025                                             BitCast1, DAG);
19026       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19027     }
19028   }
19029
        // v16i8 SHL: the amount is moved into the top bits of each byte, then the
        // value is conditionally shifted by 4, by 2 and by 1, each step selected
        // per lane by comparing the current top bit of the amount against 0x80.
19030   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19031     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19032
19033     // a = a << 5;
19034     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
          // NOTE(review): this bitcast is from VT to VT.
19035     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19036
19037     // Turn 'a' into a mask suitable for VSELECT
19038     SDValue VSelM = DAG.getConstant(0x80, VT);
19039     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19040     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19041
          // Masks applied before the 16-bit-wide shifts by 4 and by 2 below.
19042     SDValue CM1 = DAG.getConstant(0x0f, VT);
19043     SDValue CM2 = DAG.getConstant(0x3f, VT);
19044
19045     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19046     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19047     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19048     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19049     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19050
19051     // a += a
19052     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19053     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19054     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19055
19056     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19057     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19058     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19059     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19060     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19061
19062     // a += a
19063     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19064     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19065     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19066
19067     // return VSELECT(r, r+r, a);
19068     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19069                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19070     return R;
19071   }
19072
19073   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19074   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19075   // solution better.
19076   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
          // NOTE(review): VT is always v8i16 here, so NewVT is always v8i32 and the
          // v16i16 arm of this conditional is never taken.
19077     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
          // Arithmetic shifts need the sign bits in the widened lanes; the others
          // are widened with zeros.
19078     unsigned ExtOpc =
19079         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19080     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19081     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19082     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19083                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19084     }
19085
19086   // Decompose 256-bit shifts into smaller 128-bit shifts.
19087   if (VT.is256BitVector()) {
19088     unsigned NumElems = VT.getVectorNumElements();
19089     MVT EltVT = VT.getVectorElementType();
19090     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19091
19092     // Extract the two vectors
19093     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19094     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19095
19096     // Recreate the shift amount vectors
19097     SDValue Amt1, Amt2;
19098     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19099       // Constant shift amount
            // The BUILD_VECTOR operands are split into a low half and a high half.
19100       SmallVector<SDValue, 4> Amt1Csts;
19101       SmallVector<SDValue, 4> Amt2Csts;
19102       for (unsigned i = 0; i != NumElems/2; ++i)
19103         Amt1Csts.push_back(Amt->getOperand(i));
19104       for (unsigned i = NumElems/2; i != NumElems; ++i)
19105         Amt2Csts.push_back(Amt->getOperand(i));
19106
19107       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19108       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19109     } else {
19110       // Variable shift amount
19111       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19112       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19113     }
19114
19115     // Issue new vector shifts for the smaller types
19116     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19117     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19118
19119     // Concatenate the result back
19120     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19121   }
19122
19123   return SDValue();
19124 }
19125
19126 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19127   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19128   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19129   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19130   // has only one use.
19131   SDNode *N = Op.getNode();
19132   SDValue LHS = N->getOperand(0);
19133   SDValue RHS = N->getOperand(1);
        // X86ISD opcode of the flag-producing arithmetic node, and the X86
        // condition code that the overflow result is read with.
19134   unsigned BaseOp = 0;
19135   unsigned Cond = 0;
19136   SDLoc DL(Op);
19137   switch (Op.getOpcode()) {
19138   default: llvm_unreachable("Unknown ovf instruction!");
19139   case ISD::SADDO:
19140     // An add of one will be selected as an INC. Note that INC doesn't
19141     // set CF, so we can't do this for UADDO.
19142     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19143       if (C->isOne()) {
19144         BaseOp = X86ISD::INC;
19145         Cond = X86::COND_O;
19146         break;
19147       }
19148     BaseOp = X86ISD::ADD;
19149     Cond = X86::COND_O;
19150     break;
19151   case ISD::UADDO:
19152     BaseOp = X86ISD::ADD;
19153     Cond = X86::COND_B;
19154     break;
19155   case ISD::SSUBO:
19156     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19157     // set CF, so we can't do this for USUBO.
19158     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19159       if (C->isOne()) {
19160         BaseOp = X86ISD::DEC;
19161         Cond = X86::COND_O;
19162         break;
19163       }
19164     BaseOp = X86ISD::SUB;
19165     Cond = X86::COND_O;
19166     break;
19167   case ISD::USUBO:
19168     BaseOp = X86ISD::SUB;
19169     Cond = X86::COND_B;
19170     break;
19171   case ISD::SMULO:
          // i8 multiplies use a dedicated node.
19172     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19173     Cond = X86::COND_O;
19174     break;
19175   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19176     if (N->getValueType(0) == MVT::i8) {
19177       BaseOp = X86ISD::UMUL8;
19178       Cond = X86::COND_O;
19179       break;
19180     }
          // The non-i8 UMUL node has three results (two of the operand type plus
          // i32 flags), so it is built here instead of by the common code below;
          // the flags are result number 2.
19181     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19182                                  MVT::i32);
19183     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19184
19185     SDValue SetCC =
19186       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19187                   DAG.getConstant(X86::COND_O, MVT::i32),
19188                   SDValue(Sum.getNode(), 2));
19189
19190     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19191   }
19192   }
19193
19194   // Also sets EFLAGS.
        // Common path: a two-result node (value, i32 flags) followed by a SETCC on
        // the flags; both are merged to match the result list of the original node.
19195   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19196   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19197
19198   SDValue SetCC =
19199     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19200                 DAG.getConstant(Cond, MVT::i32),
19201                 SDValue(Sum.getNode(), 1));
19202
19203   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19204 }
19205
19206 // Sign extension of the low part of vector elements. This may be used either
19207 // when sign extend instructions are not available or if the vector element
19208 // sizes already match the sign-extended size. If the vector elements are in
19209 // their pre-extended size and sign extend instructions are available, that will
19210 // be handled by LowerSIGN_EXTEND.
19211 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19212                                                   SelectionDAG &DAG) const {
19213   SDLoc dl(Op);
        // Operand 1 carries the type whose sign bit is to be extended.
19214   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19215   MVT VT = Op.getSimpleValueType();
19216
        // Only vector types are handled, and only when SSE2 is available.
19217   if (!Subtarget->hasSSE2() || !VT.isVector())
19218     return SDValue();
19219
        // Number of high bits per element that must become copies of the sign bit.
19220   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19221                       ExtraVT.getScalarType().getSizeInBits();
19222
19223   switch (VT.SimpleTy) {
19224     default: return SDValue();
19225     case MVT::v8i32:
19226     case MVT::v16i16:
19227       if (!Subtarget->hasFp256())
19228         return SDValue();
19229       if (!Subtarget->hasInt256()) {
19230         // needs to be split
              // Halve both the result type and ExtraVT, apply the same opcode to
              // each 128-bit half of the input, and concatenate the two results.
19231         unsigned NumElems = VT.getVectorNumElements();
19232
19233         // Extract the LHS vectors
19234         SDValue LHS = Op.getOperand(0);
19235         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19236         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19237
19238         MVT EltVT = VT.getVectorElementType();
19239         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19240
19241         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19242         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19243         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19244                                    ExtraNumElems/2);
19245         SDValue Extra = DAG.getValueType(ExtraVT);
19246
19247         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19248         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19249
19250         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19251       }
19252       // fall through
19253     case MVT::v4i32:
19254     case MVT::v8i16: {
19255       SDValue Op0 = Op.getOperand(0);
19256
19257       // This is a sign extension of some low part of vector elements without
19258       // changing the size of the vector elements themselves:
19259       // Shift-Left + Shift-Right-Algebraic.
19260       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19261                                                BitsDiff, DAG);
19262       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19263                                         DAG);
19264     }
19265   }
19266 }
19267
19268 /// Decides whether an atomic access of type \p MemType has to go through
19269 /// cmpxchg8b/cmpxchg16b: true for a 64-bit type when not targeting x86-64, and
19270 /// for a 128-bit type when cmpxchg16b is available. Any other width yields
19271 /// false (such operations are left to become __sync_fetch_and_... calls).
19272 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19273   switch (MemType->getPrimitiveSizeInBits()) {
19274   case 64:
19275     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19276   case 128:
19277     return Subtarget->hasCmpxchg16b();
19278   default:
19279     return false;
19280   }
19281 }
19282
19283 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
        // The decision depends only on the type of the value being stored.
        const Type *StoredTy = SI->getValueOperand()->getType();
19284   return needsCmpXchgNb(StoredTy);
19285 }
19286
19287 // Note: as a consequence, large loads are turned into lock cmpxchg8b/16b.
19288 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19289 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
        // The loaded type is taken from the pointee type of the pointer operand.
19290   const Type *PtrTy = LI->getPointerOperand()->getType();
19291   return needsCmpXchgNb(cast<PointerType>(PtrTy)->getElementType());
19292 }
19293
19294 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19295   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19296   const Type *MemType = AI->getType();
19297
19298   // If the operand is too big, we must see if cmpxchg8/16b is available
19299   // and default to library calls otherwise.
19300   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19301     return needsCmpXchgNb(MemType);
19302
19303   AtomicRMWInst::BinOp Op = AI->getOperation();
19304   switch (Op) {
19305   default:
19306     llvm_unreachable("Unknown atomic operation");
19307   case AtomicRMWInst::Xchg:
19308   case AtomicRMWInst::Add:
19309   case AtomicRMWInst::Sub:
19310     // It's better to use xadd, xsub or xchg for these in all cases.
19311     return false;
19312   case AtomicRMWInst::Or:
19313   case AtomicRMWInst::And:
19314   case AtomicRMWInst::Xor:
19315     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19316     // prefix to a normal instruction for these operations.
19317     return !AI->use_empty();
19318   case AtomicRMWInst::Nand:
19319   case AtomicRMWInst::Max:
19320   case AtomicRMWInst::Min:
19321   case AtomicRMWInst::UMax:
19322   case AtomicRMWInst::UMin:
19323     // These always require a non-trivial set of data operations on x86. We must
19324     // use a cmpxchg loop.
19325     return true;
19326   }
19327 }
19328
19329 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19330   // mfence is used with SSE2, and also whenever we are on x86-64 (even if
19331   // no-sse2 was asked for): there isn't any reason to disable it if the
19332   // target processor supports it.
19333   if (Subtarget.hasSSE2())
          return true;
        return Subtarget.is64Bit();
19334 }
19335
19336 LoadInst *
19337 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19338   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19339   const Type *MemType = AI->getType();
19340   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19341   // there is no benefit in turning such RMWs into loads, and it is actually
19342   // harmful as it introduces a mfence.
19343   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19344     return nullptr;
19345
19346   auto Builder = IRBuilder<>(AI);
19347   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19348   auto SynchScope = AI->getSynchScope();
19349   // We must restrict the ordering to avoid generating loads with Release or
19350   // ReleaseAcquire orderings.
19351   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19352   auto Ptr = AI->getPointerOperand();
19353
19354   // Before the load we need a fence. Here is an example lifted from
19355   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19356   // is required:
19357   // Thread 0:
19358   //   x.store(1, relaxed);
19359   //   r1 = y.fetch_add(0, release);
19360   // Thread 1:
19361   //   y.fetch_add(42, acquire);
19362   //   r2 = x.load(relaxed);
19363   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19364   // lowered to just a load without a fence. A mfence flushes the store buffer,
19365   // making the optimization clearly correct.
19366   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19367   // otherwise, we might be able to be more agressive on relaxed idempotent
19368   // rmw. In practice, they do not look useful, so we don't try to be
19369   // especially clever.
19370   if (SynchScope == SingleThread) {
19371     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19372     // the IR level, so we must wrap it in an intrinsic.
19373     return nullptr;
19374   } else if (hasMFENCE(*Subtarget)) {
19375     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19376             Intrinsic::x86_sse2_mfence);
19377     Builder.CreateCall(MFence);
19378   } else {
19379     // FIXME: it might make sense to use a locked operation here but on a
19380     // different cache-line to prevent cache-line bouncing. In practice it
19381     // is probably a small win, and x86 processors without mfence are rare
19382     // enough that we do not bother.
19383     return nullptr;
19384   }
19385
19386   // Finally we can emit the atomic load.
19387   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19388           AI->getType()->getPrimitiveSizeInBits());
19389   Loaded->setAtomic(Order, SynchScope);
19390   AI->replaceAllUsesWith(Loaded);
19391   AI->eraseFromParent();
19392   return Loaded;
19393 }
19394
19395 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19396                                  SelectionDAG &DAG) {
        // Lowers a fence node. Operand 0 is the chain, operand 1 the atomic
        // ordering and operand 2 the synchronization scope (both constants).
19397   SDLoc dl(Op);
19398   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19399     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19400   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19401     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19402
19403   // The only fence that needs an instruction is a sequentially-consistent
19404   // cross-thread fence.
19405   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19406     if (hasMFENCE(*Subtarget))
19407       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19408
          // Without mfence, emit a locked OR of the constant zero into the memory
          // location addressed by ESP (scale 1, no index, displacement 0, no
          // segment register).
19409     SDValue Chain = Op.getOperand(0);
19410     SDValue Zero = DAG.getConstant(0, MVT::i32);
19411     SDValue Ops[] = {
19412       DAG.getRegister(X86::ESP, MVT::i32), // Base
19413       DAG.getTargetConstant(1, MVT::i8),   // Scale
19414       DAG.getRegister(0, MVT::i32),        // Index
19415       DAG.getTargetConstant(0, MVT::i32),  // Disp
19416       DAG.getRegister(0, MVT::i32),        // Segment.
19417       Zero,
19418       Chain
19419     };
19420     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19421     return SDValue(Res, 0);
19422   }
19423
19424   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19425   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19426 }
19427
19428 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19429                              SelectionDAG &DAG) {
        // Lowers a compare-and-swap node to an X86ISD::LCMPXCHG_DAG node. The
        // three results of Op are replaced in place and an empty SDValue is
        // returned.
19430   MVT T = Op.getSimpleValueType();
19431   SDLoc DL(Op);
        // Accumulator register matching the operand width, and that width in bytes.
19432   unsigned Reg = 0;
19433   unsigned size = 0;
19434   switch(T.SimpleTy) {
19435   default: llvm_unreachable("Invalid value type!");
19436   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19437   case MVT::i16: Reg = X86::AX;  size = 2; break;
19438   case MVT::i32: Reg = X86::EAX; size = 4; break;
19439   case MVT::i64:
19440     assert(Subtarget->is64Bit() && "Node not type legal!");
19441     Reg = X86::RAX; size = 8;
19442     break;
19443   }
        // Copy operand 2 into the accumulator register (presumably the expected
        // value -- confirm against the node's operand layout). The copy is glued
        // to the cmpxchg node through cpIn.getValue(1).
19444   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19445                                   Op.getOperand(2), SDValue());
        // Operands: chain, operand 1 (presumably the address), operand 3
        // (presumably the new value), the access size in bytes, and the glue.
19446   SDValue Ops[] = { cpIn.getValue(0),
19447                     Op.getOperand(1),
19448                     Op.getOperand(3),
19449                     DAG.getTargetConstant(size, MVT::i8),
19450                     cpIn.getValue(1) };
19451   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19452   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19453   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19454                                            Ops, T, MMO);
19455
        // Read the accumulator register back, then EFLAGS, each copy glued to the
        // previous node.
19456   SDValue cpOut =
19457     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19458   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19459                                       MVT::i32, cpOut.getValue(2));
        // The success result is a SETCC on the equal condition of those flags.
19460   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19461                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19462
        // Result 0 becomes the register read-back, result 1 the success flag and
        // result 2 the output chain of the EFLAGS copy.
19463   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19464   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19465   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19466   return SDValue();
19467 }
19468
19469 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19470                             SelectionDAG &DAG) {
        // Custom lowering of BITCAST. Two situations are handled: a v2i32, v4i16
        // or v8i8 source (only when the destination is f64), and the i64/MMX
        // conversions at the end. An empty SDValue means the conversion needs to
        // be expanded.
19471   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19472   MVT DstVT = Op.getSimpleValueType();
19473
19474   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19475     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19476     if (DstVT != MVT::f64)
19477       // This conversion needs to be expanded.
19478       return SDValue();
19479
19480     SDValue InVec = Op->getOperand(0);
19481     SDLoc dl(Op);
19482     unsigned NumElts = SrcVT.getVectorNumElements();
19483     EVT SVT = SrcVT.getVectorElementType();
19484
19485     // Widen the input vector to twice its number of elements.
19486     // Example: from MVT::v2i32 to MVT::v4i32.
19487     SmallVector<SDValue, 16> Elts;
19488     for (unsigned i = 0, e = NumElts; i != e; ++i)
19489       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19490                                  DAG.getIntPtrConstant(i)));
19491
19492     // Explicitly mark the extra elements as Undef.
19493     SDValue Undef = DAG.getUNDEF(SVT);
19494     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19495       Elts.push_back(Undef);
19496
          // Reinterpret the widened vector as v2f64 and take element 0, which
          // covers the bits of the original elements.
19497     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19498     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19499     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19500     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19501                        DAG.getIntPtrConstant(0));
19502   }
19503
        // Any other source type is only expected on a 64-bit target with MMX but
        // without SSE2, and only with an i64 or 64-bit vector destination.
19504   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19505          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19506   assert((DstVT == MVT::i64 ||
19507           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19508          "Unexpected custom BITCAST");
19509   // i64 <=> MMX conversions are Legal.
19510   if (SrcVT==MVT::i64 && DstVT.isVector())
19511     return Op;
19512   if (DstVT==MVT::i64 && SrcVT.isVector())
19513     return Op;
19514   // MMX <=> MMX conversions are Legal.
19515   if (SrcVT.isVector() && DstVT.isVector())
19516     return Op;
19517   // All other conversions need to be expanded.
19518   return SDValue();
19519 }
19520
19521 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19522                           SelectionDAG &DAG) {
19523   SDNode *Node = Op.getNode();
19524   SDLoc dl(Node);
19525
19526   Op = Op.getOperand(0);
19527   EVT VT = Op.getValueType();
19528   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19529          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19530
19531   unsigned NumElts = VT.getVectorNumElements();
19532   EVT EltVT = VT.getVectorElementType();
19533   unsigned Len = EltVT.getSizeInBits();
19534
19535   // This is the vectorized version of the "best" algorithm from
19536   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19537   // with a minor tweak to use a series of adds + shifts instead of vector
19538   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19539   //
19540   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19541   //  v8i32 => Always profitable
19542   //
19543   // FIXME: There a couple of possible improvements:
19544   //
19545   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19546   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19547   //
19548   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19549          "CTPOP not implemented for this vector element type.");
19550
19551   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19552   // extra legalization.
19553   bool NeedsBitcast = EltVT == MVT::i32;
19554   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19555
19556   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19557   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19558   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19559
19560   // v = v - ((v >> 1) & 0x55555555...)
19561   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19562   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19563   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19564   if (NeedsBitcast)
19565     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19566
19567   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19568   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19569   if (NeedsBitcast)
19570     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19571
19572   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19573   if (VT != And.getValueType())
19574     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19575   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19576
19577   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19578   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19579   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19580   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19581   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19582
19583   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19584   if (NeedsBitcast) {
19585     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19586     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19587     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19588   }
19589
19590   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19591   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19592   if (VT != AndRHS.getValueType()) {
19593     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19594     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19595   }
19596   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19597
19598   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19599   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19600   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19601   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19602   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19603
19604   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19605   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19606   if (NeedsBitcast) {
19607     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19608     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19609   }
19610   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19611   if (VT != And.getValueType())
19612     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19613
19614   // The algorithm mentioned above uses:
19615   //    v = (v * 0x01010101...) >> (Len - 8)
19616   //
19617   // Change it to use vector adds + vector shifts which yield faster results on
19618   // Haswell than using vector integer multiplication.
19619   //
19620   // For i32 elements:
19621   //    v = v + (v >> 8)
19622   //    v = v + (v >> 16)
19623   //
19624   // For i64 elements:
19625   //    v = v + (v >> 8)
19626   //    v = v + (v >> 16)
19627   //    v = v + (v >> 32)
19628   //
19629   Add = And;
19630   SmallVector<SDValue, 8> Csts;
19631   for (unsigned i = 8; i <= Len/2; i *= 2) {
19632     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19633     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19634     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19635     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19636     Csts.clear();
19637   }
19638
19639   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19640   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19641   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19642   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19643   if (NeedsBitcast) {
19644     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19645     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19646   }
19647   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19648   if (VT != And.getValueType())
19649     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19650
19651   return And;
19652 }
19653
19654 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19655   SDNode *Node = Op.getNode();
19656   SDLoc dl(Node);
19657   EVT T = Node->getValueType(0);
19658   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19659                               DAG.getConstant(0, T), Node->getOperand(2));
19660   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19661                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19662                        Node->getOperand(0),
19663                        Node->getOperand(1), negOp,
19664                        cast<AtomicSDNode>(Node)->getMemOperand(),
19665                        cast<AtomicSDNode>(Node)->getOrdering(),
19666                        cast<AtomicSDNode>(Node)->getSynchScope());
19667 }
19668
19669 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19670   SDNode *Node = Op.getNode();
19671   SDLoc dl(Node);
19672   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19673
19674   // Convert seq_cst store -> xchg
19675   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19676   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19677   //        (The only way to get a 16-byte store is cmpxchg16b)
19678   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19679   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19680       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19681     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19682                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19683                                  Node->getOperand(0),
19684                                  Node->getOperand(1), Node->getOperand(2),
19685                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19686                                  cast<AtomicSDNode>(Node)->getOrdering(),
19687                                  cast<AtomicSDNode>(Node)->getSynchScope());
19688     return Swap.getValue(1);
19689   }
19690   // Other atomic stores have a simple pattern.
19691   return Op;
19692 }
19693
19694 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19695   EVT VT = Op.getNode()->getSimpleValueType(0);
19696
19697   // Let legalize expand this if it isn't a legal type yet.
19698   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19699     return SDValue();
19700
19701   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19702
19703   unsigned Opc;
19704   bool ExtraOp = false;
19705   switch (Op.getOpcode()) {
19706   default: llvm_unreachable("Invalid code");
19707   case ISD::ADDC: Opc = X86ISD::ADD; break;
19708   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19709   case ISD::SUBC: Opc = X86ISD::SUB; break;
19710   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19711   }
19712
19713   if (!ExtraOp)
19714     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19715                        Op.getOperand(1));
19716   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19717                      Op.getOperand(1), Op.getOperand(2));
19718 }
19719
19720 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19721                             SelectionDAG &DAG) {
19722   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19723
19724   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19725   // which returns the values as { float, float } (in XMM0) or
19726   // { double, double } (which is returned in XMM0, XMM1).
19727   SDLoc dl(Op);
19728   SDValue Arg = Op.getOperand(0);
19729   EVT ArgVT = Arg.getValueType();
19730   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19731
19732   TargetLowering::ArgListTy Args;
19733   TargetLowering::ArgListEntry Entry;
19734
19735   Entry.Node = Arg;
19736   Entry.Ty = ArgTy;
19737   Entry.isSExt = false;
19738   Entry.isZExt = false;
19739   Args.push_back(Entry);
19740
19741   bool isF64 = ArgVT == MVT::f64;
19742   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19743   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19744   // the results are returned via SRet in memory.
19745   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19746   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19747   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19748
19749   Type *RetTy = isF64
19750     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19751     : (Type*)VectorType::get(ArgTy, 4);
19752
19753   TargetLowering::CallLoweringInfo CLI(DAG);
19754   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19755     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19756
19757   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19758
19759   if (isF64)
19760     // Returned in xmm0 and xmm1.
19761     return CallResult.first;
19762
19763   // Returned in bits 0:31 and 32:64 xmm0.
19764   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19765                                CallResult.first, DAG.getIntPtrConstant(0));
19766   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19767                                CallResult.first, DAG.getIntPtrConstant(1));
19768   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19769   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19770 }
19771
19772 /// LowerOperation - Provide custom lowering hooks for some operations.
19773 ///
      /// Switches on Op.getOpcode() and forwards Op (together with Subtarget for
      /// the helpers that take it) to the matching Lower*/lower* routine,
      /// returning that routine's result unchanged. An opcode that has no case
      /// below reaches llvm_unreachable.
19774 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19775   switch (Op.getOpcode()) {
19776   default: llvm_unreachable("Should not custom lower this!");
19777   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19778   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19779   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19780     return LowerCMP_SWAP(Op, Subtarget, DAG);
19781   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19782   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19783   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19784   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19785   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19786   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19787   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19788   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19789   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19790   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19791   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19792   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19793   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19794   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19795   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19796   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19797   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
        // The three *_PARTS shift opcodes share one lowering routine.
19798   case ISD::SHL_PARTS:
19799   case ISD::SRA_PARTS:
19800   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19801   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19802   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19803   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19804   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19805   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19806   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19807   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19808   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19809   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19810   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
        // FABS and FNEG share one lowering routine.
19811   case ISD::FABS:
19812   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19813   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19814   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19815   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19816   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19817   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19818   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19819   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19820   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19821   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19822   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
        // Both chained intrinsic forms go through the same routine.
19823   case ISD::INTRINSIC_VOID:
19824   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19825   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19826   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19827   case ISD::FRAME_TO_ARGS_OFFSET:
19828                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19829   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19830   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19831   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19832   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19833   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19834   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19835   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19836   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19837   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19838   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19839   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19840   case ISD::UMUL_LOHI:
19841   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
        // All three shift opcodes share one lowering routine.
19842   case ISD::SRA:
19843   case ISD::SRL:
19844   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
        // All add/sub/mul-with-overflow opcodes share one lowering routine.
19845   case ISD::SADDO:
19846   case ISD::UADDO:
19847   case ISD::SSUBO:
19848   case ISD::USUBO:
19849   case ISD::SMULO:
19850   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19851   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19852   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19853   case ISD::ADDC:
19854   case ISD::ADDE:
19855   case ISD::SUBC:
19856   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19857   case ISD::ADD:                return LowerADD(Op, DAG);
19858   case ISD::SUB:                return LowerSUB(Op, DAG);
19859   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19860   }
19861 }
19862
19863 /// ReplaceNodeResults - Replace a node with an illegal result type
19864 /// with a new node built out of custom code.
      ///
      /// Replacement values for N are appended to Results. Several cases return
      /// (or break out of the switch) without appending anything; the ATOMIC_*
      /// case below describes that as delegating to generic type legalization.
      /// An opcode that has no case reaches llvm_unreachable.
19865 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19866                                            SmallVectorImpl<SDValue>&Results,
19867                                            SelectionDAG &DAG) const {
19868   SDLoc dl(N);
19869   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19870   switch (N->getOpcode()) {
19871   default:
19872     llvm_unreachable("Do not know how to custom type legalize this operation!");
19873   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19874   case X86ISD::FMINC:
19875   case X86ISD::FMIN:
19876   case X86ISD::FMAXC:
19877   case X86ISD::FMAX: {
19878     EVT VT = N->getValueType(0);
19879     if (VT != MVT::v2f32)
19880       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
          // Each operand is concatenated with an undef v2f32 to form a v4f32, and
          // the same opcode is re-issued on the widened operands.
19881     SDValue UNDEF = DAG.getUNDEF(VT);
19882     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19883                               N->getOperand(0), UNDEF);
19884     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19885                               N->getOperand(1), UNDEF);
19886     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19887     return;
19888   }
19889   case ISD::SIGN_EXTEND_INREG:
19890   case ISD::ADDC:
19891   case ISD::ADDE:
19892   case ISD::SUBC:
19893   case ISD::SUBE:
19894     // We don't want to expand or promote these.
19895     return;
        // Division/remainder nodes are all forwarded to LowerWin64_i128OP and its
        // result is pushed as the single replacement value.
19896   case ISD::SDIV:
19897   case ISD::UDIV:
19898   case ISD::SREM:
19899   case ISD::UREM:
19900   case ISD::SDIVREM:
19901   case ISD::UDIVREM: {
19902     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19903     Results.push_back(V);
19904     return;
19905   }
19906   case ISD::FP_TO_SINT:
19907   case ISD::FP_TO_UINT: {
19908     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19909
          // Unsigned conversions are only handled here when isIntegerTypeFTOL
          // accepts the result type; otherwise nothing is pushed.
19910     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19911       return;
19912
19913     std::pair<SDValue,SDValue> Vals =
19914         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19915     SDValue FIST = Vals.first, StackSlot = Vals.second;
          // A null FIST means the helper produced nothing and no result is pushed.
19916     if (FIST.getNode()) {
19917       EVT VT = N->getValueType(0);
19918       // Return a load from the stack slot.
19919       if (StackSlot.getNode())
19920         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19921                                       MachinePointerInfo(),
19922                                       false, false, false, 0));
19923       else
19924         Results.push_back(FIST);
19925     }
19926     return;
19927   }
19928   case ISD::UINT_TO_FP: {
19929     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
          // Only the v2i32 -> v2f32 conversion is handled.
19930     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19931         N->getValueType(0) != MVT::v2f32)
19932       return;
          // 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing the
          // zero-extended 32-bit value into its low bits and then subtracting the
          // bias leaves the integer value as a double, which is then rounded to
          // single precision with VFPROUND (producing a v4f32).
19933     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19934                                  N->getOperand(0));
19935     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19936                                      MVT::f64);
19937     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19938     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19939                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19940     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19941     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19942     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19943     return;
19944   }
19945   case ISD::FP_ROUND: {
          // Handled only when the operand type is already legal; the result is a
          // v4f32 VFPROUND of the operand.
19946     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19947         return;
19948     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19949     Results.push_back(V);
19950     return;
19951   }
19952   case ISD::INTRINSIC_W_CHAIN: {
          // Operand 1 is read as a constant holding the intrinsic ID. Every path
          // of the inner switch returns or is unreachable, so control never falls
          // through into the next case.
19953     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19954     switch (IntNo) {
19955     default : llvm_unreachable("Do not know how to custom type "
19956                                "legalize this intrinsic operation!");
19957     case Intrinsic::x86_rdtsc:
19958       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19959                                      Results);
19960     case Intrinsic::x86_rdtscp:
19961       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19962                                      Results);
19963     case Intrinsic::x86_rdpmc:
19964       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19965     }
19966   }
19967   case ISD::READCYCLECOUNTER: {
19968     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19969                                    Results);
19970   }
19971   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
          // Expand a double-width compare-and-swap (i64 via LCMPXCHG8_DAG, i128 via
          // LCMPXCHG16_DAG). The expected value (operand 2) is split into halves
          // and copied to EAX:EDX / RAX:RDX, the new value (operand 3) to
          // EBX:ECX / RBX:RCX. All copies are linked through chain and glue.
19972     EVT T = N->getValueType(0);
19973     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19974     bool Regs64bit = T == MVT::i128;
19975     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19976     SDValue cpInL, cpInH;
19977     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19978                         DAG.getConstant(0, HalfT));
19979     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19980                         DAG.getConstant(1, HalfT));
19981     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19982                              Regs64bit ? X86::RAX : X86::EAX,
19983                              cpInL, SDValue());
19984     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19985                              Regs64bit ? X86::RDX : X86::EDX,
19986                              cpInH, cpInL.getValue(1));
19987     SDValue swapInL, swapInH;
19988     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19989                           DAG.getConstant(0, HalfT));
19990     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19991                           DAG.getConstant(1, HalfT));
19992     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19993                                Regs64bit ? X86::RBX : X86::EBX,
19994                                swapInL, cpInH.getValue(1));
19995     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19996                                Regs64bit ? X86::RCX : X86::ECX,
19997                                swapInH, swapInL.getValue(1));
          // Operands of the cmpxchg node: chain, pointer (operand 1), glue.
19998     SDValue Ops[] = { swapInH.getValue(0),
19999                       N->getOperand(1),
20000                       swapInH.getValue(1) };
20001     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20002     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20003     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20004                                   X86ISD::LCMPXCHG8_DAG;
20005     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
          // The two halves of the value result are read back from EAX:EDX / RAX:RDX.
20006     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20007                                         Regs64bit ? X86::RAX : X86::EAX,
20008                                         HalfT, Result.getValue(1));
20009     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20010                                         Regs64bit ? X86::RDX : X86::EDX,
20011                                         HalfT, cpOutL.getValue(2));
20012     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20013
          // The success flag is the COND_E condition on EFLAGS, computed as i8 and
          // then zero-extended or truncated to the type of result #1.
20014     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20015                                         MVT::i32, cpOutH.getValue(2));
20016     SDValue Success =
20017         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20018                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20019     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20020
          // Results, in order: the rebuilt pair, the success flag, and the chain
          // of the EFLAGS copy.
20021     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20022     Results.push_back(Success);
20023     Results.push_back(EFLAGS.getValue(1));
20024     return;
20025   }
20026   case ISD::ATOMIC_SWAP:
20027   case ISD::ATOMIC_LOAD_ADD:
20028   case ISD::ATOMIC_LOAD_SUB:
20029   case ISD::ATOMIC_LOAD_AND:
20030   case ISD::ATOMIC_LOAD_OR:
20031   case ISD::ATOMIC_LOAD_XOR:
20032   case ISD::ATOMIC_LOAD_NAND:
20033   case ISD::ATOMIC_LOAD_MIN:
20034   case ISD::ATOMIC_LOAD_MAX:
20035   case ISD::ATOMIC_LOAD_UMIN:
20036   case ISD::ATOMIC_LOAD_UMAX:
20037   case ISD::ATOMIC_LOAD: {
20038     // Delegate to generic TypeLegalization. Situations we can really handle
20039     // should have already been dealt with by AtomicExpandPass.cpp.
20040     break;
20041   }
20042   case ISD::BITCAST: {
20043     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20044     EVT DstVT = N->getValueType(0);
20045     EVT SrcVT = N->getOperand(0)->getValueType(0);
20046
          // Only f64 -> v2i32 / v4i16 / v8i8 is handled; anything else pushes no
          // result.
20047     if (SrcVT != MVT::f64 ||
20048         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20049       return;
20050
          // Place the f64 in a v2f64 and reinterpret that as an integer vector
          // with twice as many elements as DstVT.
20051     unsigned NumElts = DstVT.getVectorNumElements();
20052     EVT SVT = DstVT.getVectorElementType();
20053     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20054     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20055                                    MVT::v2f64, N->getOperand(0));
20056     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20057
20058     if (ExperimentalVectorWideningLegalization) {
20059       // If we are legalizing vectors by widening, we already have the desired
20060       // legal vector type, just return it.
20061       Results.push_back(ToVecInt);
20062       return;
20063     }
20064
          // Otherwise rebuild a DstVT vector from the low NumElts elements.
20065     SmallVector<SDValue, 8> Elts;
20066     for (unsigned i = 0, e = NumElts; i != e; ++i)
20067       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20068                                    ToVecInt, DAG.getIntPtrConstant(i)));
20069
20070     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
          // Last case: control leaves the switch here and the function returns.
20071   }
20072   }
20073 }
20074
20075 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20076   switch (Opcode) {
20077   default: return nullptr;
20078   case X86ISD::BSF:                return "X86ISD::BSF";
20079   case X86ISD::BSR:                return "X86ISD::BSR";
20080   case X86ISD::SHLD:               return "X86ISD::SHLD";
20081   case X86ISD::SHRD:               return "X86ISD::SHRD";
20082   case X86ISD::FAND:               return "X86ISD::FAND";
20083   case X86ISD::FANDN:              return "X86ISD::FANDN";
20084   case X86ISD::FOR:                return "X86ISD::FOR";
20085   case X86ISD::FXOR:               return "X86ISD::FXOR";
20086   case X86ISD::FSRL:               return "X86ISD::FSRL";
20087   case X86ISD::FILD:               return "X86ISD::FILD";
20088   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20089   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20090   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20091   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20092   case X86ISD::FLD:                return "X86ISD::FLD";
20093   case X86ISD::FST:                return "X86ISD::FST";
20094   case X86ISD::CALL:               return "X86ISD::CALL";
20095   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20096   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20097   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20098   case X86ISD::BT:                 return "X86ISD::BT";
20099   case X86ISD::CMP:                return "X86ISD::CMP";
20100   case X86ISD::COMI:               return "X86ISD::COMI";
20101   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20102   case X86ISD::CMPM:               return "X86ISD::CMPM";
20103   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20104   case X86ISD::SETCC:              return "X86ISD::SETCC";
20105   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20106   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20107   case X86ISD::CMOV:               return "X86ISD::CMOV";
20108   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20109   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20110   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20111   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20112   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20113   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20114   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20115   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20116   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20117   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20118   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20119   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20120   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20121   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20122   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20123   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20124   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20125   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20126   case X86ISD::HADD:               return "X86ISD::HADD";
20127   case X86ISD::HSUB:               return "X86ISD::HSUB";
20128   case X86ISD::FHADD:              return "X86ISD::FHADD";
20129   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20130   case X86ISD::UMAX:               return "X86ISD::UMAX";
20131   case X86ISD::UMIN:               return "X86ISD::UMIN";
20132   case X86ISD::SMAX:               return "X86ISD::SMAX";
20133   case X86ISD::SMIN:               return "X86ISD::SMIN";
20134   case X86ISD::FMAX:               return "X86ISD::FMAX";
20135   case X86ISD::FMIN:               return "X86ISD::FMIN";
20136   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20137   case X86ISD::FMINC:              return "X86ISD::FMINC";
20138   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20139   case X86ISD::FRCP:               return "X86ISD::FRCP";
20140   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20141   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20142   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20143   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20144   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20145   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20146   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20147   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20148   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20149   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20150   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20151   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20152   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20153   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20154   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20155   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20156   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20157   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20158   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20159   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20160   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20161   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20162   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20163   case X86ISD::VSHL:               return "X86ISD::VSHL";
20164   case X86ISD::VSRL:               return "X86ISD::VSRL";
20165   case X86ISD::VSRA:               return "X86ISD::VSRA";
20166   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20167   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20168   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20169   case X86ISD::CMPP:               return "X86ISD::CMPP";
20170   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20171   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20172   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20173   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20174   case X86ISD::ADD:                return "X86ISD::ADD";
20175   case X86ISD::SUB:                return "X86ISD::SUB";
20176   case X86ISD::ADC:                return "X86ISD::ADC";
20177   case X86ISD::SBB:                return "X86ISD::SBB";
20178   case X86ISD::SMUL:               return "X86ISD::SMUL";
20179   case X86ISD::UMUL:               return "X86ISD::UMUL";
20180   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20181   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20182   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20183   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20184   case X86ISD::INC:                return "X86ISD::INC";
20185   case X86ISD::DEC:                return "X86ISD::DEC";
20186   case X86ISD::OR:                 return "X86ISD::OR";
20187   case X86ISD::XOR:                return "X86ISD::XOR";
20188   case X86ISD::AND:                return "X86ISD::AND";
20189   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20190   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20191   case X86ISD::PTEST:              return "X86ISD::PTEST";
20192   case X86ISD::TESTP:              return "X86ISD::TESTP";
20193   case X86ISD::TESTM:              return "X86ISD::TESTM";
20194   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20195   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20196   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20197   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20198   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20199   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20200   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20201   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20202   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20203   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20204   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20205   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20206   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20207   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20208   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20209   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20210   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20211   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20212   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20213   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20214   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20215   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20216   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20217   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20218   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20219   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20220   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20221   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20222   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20223   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20224   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20225   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20226   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20227   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20228   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20229   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20230   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20231   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20232   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20233   case X86ISD::SAHF:               return "X86ISD::SAHF";
20234   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20235   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20236   case X86ISD::FMADD:              return "X86ISD::FMADD";
20237   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20238   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20239   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20240   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20241   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20242   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20243   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20244   case X86ISD::XTEST:              return "X86ISD::XTEST";
20245   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20246   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20247   case X86ISD::SELECT:             return "X86ISD::SELECT";
20248   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20249   case X86ISD::RCP28:              return "X86ISD::RCP28";
20250   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20251   }
20252 }
20253
20254 // isLegalAddressingMode - Return true if the addressing mode represented
20255 // by AM is legal for this target, for a load/store of the specified type.
20256 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20257                                               Type *Ty) const {
20258   // X86 supports extremely general addressing modes.
20259   CodeModel::Model M = getTargetMachine().getCodeModel();
20260   Reloc::Model R = getTargetMachine().getRelocationModel();
20261
20262   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20263   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20264     return false;
20265
20266   if (AM.BaseGV) {
20267     unsigned GVFlags =
20268       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20269
20270     // If a reference to this global requires an extra load, we can't fold it.
20271     if (isGlobalStubReference(GVFlags))
20272       return false;
20273
20274     // If BaseGV requires a register for the PIC base, we cannot also have a
20275     // BaseReg specified.
20276     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20277       return false;
20278
20279     // If lower 4G is not available, then we must use rip-relative addressing.
20280     if ((M != CodeModel::Small || R != Reloc::Static) &&
20281         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20282       return false;
20283   }
20284
20285   switch (AM.Scale) {
20286   case 0:
20287   case 1:
20288   case 2:
20289   case 4:
20290   case 8:
20291     // These scales always work.
20292     break;
20293   case 3:
20294   case 5:
20295   case 9:
20296     // These scales are formed with basereg+scalereg.  Only accept if there is
20297     // no basereg yet.
20298     if (AM.HasBaseReg)
20299       return false;
20300     break;
20301   default:  // Other stuff never works.
20302     return false;
20303   }
20304
20305   return true;
20306 }
20307
20308 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20309   unsigned Bits = Ty->getScalarSizeInBits();
20310
20311   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20312   // particularly cheaper than those without.
20313   if (Bits == 8)
20314     return false;
20315
20316   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20317   // variable shifts just as cheap as scalar ones.
20318   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20319     return false;
20320
20321   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20322   // fully general vector.
20323   return true;
20324 }
20325
20326 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20327   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20328     return false;
20329   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20330   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20331   return NumBits1 > NumBits2;
20332 }
20333
20334 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20335   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20336     return false;
20337
20338   if (!isTypeLegal(EVT::getEVT(Ty1)))
20339     return false;
20340
20341   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20342
20343   // Assuming the caller doesn't have a zeroext or signext return parameter,
20344   // truncation all the way down to i1 is valid.
20345   return true;
20346 }
20347
20348 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20349   return isInt<32>(Imm);
20350 }
20351
20352 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20353   // Can also use sub to handle negated immediates.
20354   return isInt<32>(Imm);
20355 }
20356
20357 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20358   if (!VT1.isInteger() || !VT2.isInteger())
20359     return false;
20360   unsigned NumBits1 = VT1.getSizeInBits();
20361   unsigned NumBits2 = VT2.getSizeInBits();
20362   return NumBits1 > NumBits2;
20363 }
20364
20365 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20366   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20367   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20368 }
20369
20370 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20371   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20372   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20373 }
20374
20375 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20376   EVT VT1 = Val.getValueType();
20377   if (isZExtFree(VT1, VT2))
20378     return true;
20379
20380   if (Val.getOpcode() != ISD::LOAD)
20381     return false;
20382
20383   if (!VT1.isSimple() || !VT1.isInteger() ||
20384       !VT2.isSimple() || !VT2.isInteger())
20385     return false;
20386
20387   switch (VT1.getSimpleVT().SimpleTy) {
20388   default: break;
20389   case MVT::i8:
20390   case MVT::i16:
20391   case MVT::i32:
20392     // X86 has 8, 16, and 32-bit zero-extending loads.
20393     return true;
20394   }
20395
20396   return false;
20397 }
20398
20399 bool
20400 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20401   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20402     return false;
20403
20404   VT = VT.getScalarType();
20405
20406   if (!VT.isSimple())
20407     return false;
20408
20409   switch (VT.getSimpleVT().SimpleTy) {
20410   case MVT::f32:
20411   case MVT::f64:
20412     return true;
20413   default:
20414     break;
20415   }
20416
20417   return false;
20418 }
20419
20420 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20421   // i16 instructions are longer (0x66 prefix) and potentially slower.
20422   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20423 }
20424
20425 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20426 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20427 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20428 /// are assumed to be legal.
20429 bool
20430 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20431                                       EVT VT) const {
20432   if (!VT.isSimple())
20433     return false;
20434
20435   MVT SVT = VT.getSimpleVT();
20436
20437   // Very little shuffling can be done for 64-bit vectors right now.
20438   if (VT.getSizeInBits() == 64)
20439     return false;
20440
20441   // This is an experimental legality test that is tailored to match the
20442   // legality test of the experimental lowering more closely. They are gated
20443   // separately to ease testing of performance differences.
20444   if (ExperimentalVectorShuffleLegality)
20445     // We only care that the types being shuffled are legal. The lowering can
20446     // handle any possible shuffle mask that results.
20447     return isTypeLegal(SVT);
20448
20449   // If this is a single-input shuffle with no 128 bit lane crossings we can
20450   // lower it into pshufb.
20451   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20452       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20453     bool isLegal = true;
20454     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20455       if (M[I] >= (int)SVT.getVectorNumElements() ||
20456           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20457         isLegal = false;
20458         break;
20459       }
20460     }
20461     if (isLegal)
20462       return true;
20463   }
20464
20465   // FIXME: blends, shifts.
20466   return (SVT.getVectorNumElements() == 2 ||
20467           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20468           isMOVLMask(M, SVT) ||
20469           isCommutedMOVLMask(M, SVT) ||
20470           isMOVHLPSMask(M, SVT) ||
20471           isSHUFPMask(M, SVT) ||
20472           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20473           isPSHUFDMask(M, SVT) ||
20474           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20475           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20476           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20477           isPALIGNRMask(M, SVT, Subtarget) ||
20478           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20479           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20480           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20481           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20482           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20483           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20484 }
20485
20486 bool
20487 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20488                                           EVT VT) const {
20489   if (!VT.isSimple())
20490     return false;
20491
20492   MVT SVT = VT.getSimpleVT();
20493
20494   // This is an experimental legality test that is tailored to match the
20495   // legality test of the experimental lowering more closely. They are gated
20496   // separately to ease testing of performance differences.
20497   if (ExperimentalVectorShuffleLegality)
20498     // The new vector shuffle lowering is very good at managing zero-inputs.
20499     return isShuffleMaskLegal(Mask, VT);
20500
20501   unsigned NumElts = SVT.getVectorNumElements();
20502   // FIXME: This collection of masks seems suspect.
20503   if (NumElts == 2)
20504     return true;
20505   if (NumElts == 4 && SVT.is128BitVector()) {
20506     return (isMOVLMask(Mask, SVT)  ||
20507             isCommutedMOVLMask(Mask, SVT, true) ||
20508             isSHUFPMask(Mask, SVT) ||
20509             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20510             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20511                         Subtarget->hasInt256()));
20512   }
20513   return false;
20514 }
20515
20516 //===----------------------------------------------------------------------===//
20517 //                           X86 Scheduler Hooks
20518 //===----------------------------------------------------------------------===//
20519
20520 /// Utility function to emit xbegin specifying the start of an RTM region.
20521 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20522                                      const TargetInstrInfo *TII) {
20523   DebugLoc DL = MI->getDebugLoc();
20524
20525   const BasicBlock *BB = MBB->getBasicBlock();
20526   MachineFunction::iterator I = MBB;
20527   ++I;
20528
20529   // For the v = xbegin(), we generate
20530   //
20531   // thisMBB:
20532   //  xbegin sinkMBB
20533   //
20534   // mainMBB:
20535   //  eax = -1
20536   //
20537   // sinkMBB:
20538   //  v = eax
20539
20540   MachineBasicBlock *thisMBB = MBB;
20541   MachineFunction *MF = MBB->getParent();
20542   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20543   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20544   MF->insert(I, mainMBB);
20545   MF->insert(I, sinkMBB);
20546
20547   // Transfer the remainder of BB and its successor edges to sinkMBB.
20548   sinkMBB->splice(sinkMBB->begin(), MBB,
20549                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20550   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20551
20552   // thisMBB:
20553   //  xbegin sinkMBB
20554   //  # fallthrough to mainMBB
20555   //  # abortion to sinkMBB
20556   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20557   thisMBB->addSuccessor(mainMBB);
20558   thisMBB->addSuccessor(sinkMBB);
20559
20560   // mainMBB:
20561   //  EAX = -1
20562   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20563   mainMBB->addSuccessor(sinkMBB);
20564
20565   // sinkMBB:
20566   // EAX is live into the sinkMBB
20567   sinkMBB->addLiveIn(X86::EAX);
20568   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20569           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20570     .addReg(X86::EAX);
20571
20572   MI->eraseFromParent();
20573   return sinkMBB;
20574 }
20575
20576 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20577 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20578 // in the .td file.
20579 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20580                                        const TargetInstrInfo *TII) {
20581   unsigned Opc;
20582   switch (MI->getOpcode()) {
20583   default: llvm_unreachable("illegal opcode!");
20584   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20585   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20586   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20587   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20588   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20589   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20590   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20591   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20592   }
20593
20594   DebugLoc dl = MI->getDebugLoc();
20595   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20596
20597   unsigned NumArgs = MI->getNumOperands();
20598   for (unsigned i = 1; i < NumArgs; ++i) {
20599     MachineOperand &Op = MI->getOperand(i);
20600     if (!(Op.isReg() && Op.isImplicit()))
20601       MIB.addOperand(Op);
20602   }
20603   if (MI->hasOneMemOperand())
20604     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20605
20606   BuildMI(*BB, MI, dl,
20607     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20608     .addReg(X86::XMM0);
20609
20610   MI->eraseFromParent();
20611   return BB;
20612 }
20613
20614 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20615 // defs in an instruction pattern
20616 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20617                                        const TargetInstrInfo *TII) {
20618   unsigned Opc;
20619   switch (MI->getOpcode()) {
20620   default: llvm_unreachable("illegal opcode!");
20621   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20622   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20623   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20624   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20625   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20626   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20627   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20628   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20629   }
20630
20631   DebugLoc dl = MI->getDebugLoc();
20632   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20633
20634   unsigned NumArgs = MI->getNumOperands(); // remove the results
20635   for (unsigned i = 1; i < NumArgs; ++i) {
20636     MachineOperand &Op = MI->getOperand(i);
20637     if (!(Op.isReg() && Op.isImplicit()))
20638       MIB.addOperand(Op);
20639   }
20640   if (MI->hasOneMemOperand())
20641     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20642
20643   BuildMI(*BB, MI, dl,
20644     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20645     .addReg(X86::ECX);
20646
20647   MI->eraseFromParent();
20648   return BB;
20649 }
20650
20651 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20652                                       const X86Subtarget *Subtarget) {
20653   DebugLoc dl = MI->getDebugLoc();
20654   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20655   // Address into RAX/EAX, other two args into ECX, EDX.
20656   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20657   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20658   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20659   for (int i = 0; i < X86::AddrNumOperands; ++i)
20660     MIB.addOperand(MI->getOperand(i));
20661
20662   unsigned ValOps = X86::AddrNumOperands;
20663   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20664     .addReg(MI->getOperand(ValOps).getReg());
20665   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20666     .addReg(MI->getOperand(ValOps+1).getReg());
20667
20668   // The instruction doesn't actually take any operands though.
20669   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20670
20671   MI->eraseFromParent(); // The pseudo is gone now.
20672   return BB;
20673 }
20674
20675 MachineBasicBlock *
20676 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20677                                                  MachineBasicBlock *MBB) const {
20678   // Emit va_arg instruction on X86-64.
20679
20680   // Operands to this pseudo-instruction:
20681   // 0  ) Output        : destination address (reg)
20682   // 1-5) Input         : va_list address (addr, i64mem)
20683   // 6  ) ArgSize       : Size (in bytes) of vararg type
20684   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20685   // 8  ) Align         : Alignment of type
20686   // 9  ) EFLAGS (implicit-def)
20687
20688   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20689   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20690
20691   unsigned DestReg = MI->getOperand(0).getReg();
20692   MachineOperand &Base = MI->getOperand(1);
20693   MachineOperand &Scale = MI->getOperand(2);
20694   MachineOperand &Index = MI->getOperand(3);
20695   MachineOperand &Disp = MI->getOperand(4);
20696   MachineOperand &Segment = MI->getOperand(5);
20697   unsigned ArgSize = MI->getOperand(6).getImm();
20698   unsigned ArgMode = MI->getOperand(7).getImm();
20699   unsigned Align = MI->getOperand(8).getImm();
20700
20701   // Memory Reference
20702   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20703   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20704   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20705
20706   // Machine Information
20707   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20708   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20709   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20710   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20711   DebugLoc DL = MI->getDebugLoc();
20712
20713   // struct va_list {
20714   //   i32   gp_offset
20715   //   i32   fp_offset
20716   //   i64   overflow_area (address)
20717   //   i64   reg_save_area (address)
20718   // }
20719   // sizeof(va_list) = 24
20720   // alignment(va_list) = 8
20721
20722   unsigned TotalNumIntRegs = 6;
20723   unsigned TotalNumXMMRegs = 8;
20724   bool UseGPOffset = (ArgMode == 1);
20725   bool UseFPOffset = (ArgMode == 2);
20726   unsigned MaxOffset = TotalNumIntRegs * 8 +
20727                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20728
20729   /* Align ArgSize to a multiple of 8 */
20730   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20731   bool NeedsAlign = (Align > 8);
20732
20733   MachineBasicBlock *thisMBB = MBB;
20734   MachineBasicBlock *overflowMBB;
20735   MachineBasicBlock *offsetMBB;
20736   MachineBasicBlock *endMBB;
20737
20738   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20739   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20740   unsigned OffsetReg = 0;
20741
20742   if (!UseGPOffset && !UseFPOffset) {
20743     // If we only pull from the overflow region, we don't create a branch.
20744     // We don't need to alter control flow.
20745     OffsetDestReg = 0; // unused
20746     OverflowDestReg = DestReg;
20747
20748     offsetMBB = nullptr;
20749     overflowMBB = thisMBB;
20750     endMBB = thisMBB;
20751   } else {
20752     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20753     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20754     // If not, pull from overflow_area. (branch to overflowMBB)
20755     //
20756     //       thisMBB
20757     //         |     .
20758     //         |        .
20759     //     offsetMBB   overflowMBB
20760     //         |        .
20761     //         |     .
20762     //        endMBB
20763
20764     // Registers for the PHI in endMBB
20765     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20766     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20767
20768     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20769     MachineFunction *MF = MBB->getParent();
20770     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20771     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20772     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20773
20774     MachineFunction::iterator MBBIter = MBB;
20775     ++MBBIter;
20776
20777     // Insert the new basic blocks
20778     MF->insert(MBBIter, offsetMBB);
20779     MF->insert(MBBIter, overflowMBB);
20780     MF->insert(MBBIter, endMBB);
20781
20782     // Transfer the remainder of MBB and its successor edges to endMBB.
20783     endMBB->splice(endMBB->begin(), thisMBB,
20784                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20785     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20786
20787     // Make offsetMBB and overflowMBB successors of thisMBB
20788     thisMBB->addSuccessor(offsetMBB);
20789     thisMBB->addSuccessor(overflowMBB);
20790
20791     // endMBB is a successor of both offsetMBB and overflowMBB
20792     offsetMBB->addSuccessor(endMBB);
20793     overflowMBB->addSuccessor(endMBB);
20794
20795     // Load the offset value into a register
20796     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20797     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20798       .addOperand(Base)
20799       .addOperand(Scale)
20800       .addOperand(Index)
20801       .addDisp(Disp, UseFPOffset ? 4 : 0)
20802       .addOperand(Segment)
20803       .setMemRefs(MMOBegin, MMOEnd);
20804
20805     // Check if there is enough room left to pull this argument.
20806     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20807       .addReg(OffsetReg)
20808       .addImm(MaxOffset + 8 - ArgSizeA8);
20809
20810     // Branch to "overflowMBB" if offset >= max
20811     // Fall through to "offsetMBB" otherwise
20812     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20813       .addMBB(overflowMBB);
20814   }
20815
20816   // In offsetMBB, emit code to use the reg_save_area.
20817   if (offsetMBB) {
20818     assert(OffsetReg != 0);
20819
20820     // Read the reg_save_area address.
20821     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20822     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20823       .addOperand(Base)
20824       .addOperand(Scale)
20825       .addOperand(Index)
20826       .addDisp(Disp, 16)
20827       .addOperand(Segment)
20828       .setMemRefs(MMOBegin, MMOEnd);
20829
20830     // Zero-extend the offset
20831     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20832       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20833         .addImm(0)
20834         .addReg(OffsetReg)
20835         .addImm(X86::sub_32bit);
20836
20837     // Add the offset to the reg_save_area to get the final address.
20838     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20839       .addReg(OffsetReg64)
20840       .addReg(RegSaveReg);
20841
20842     // Compute the offset for the next argument
20843     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20844     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20845       .addReg(OffsetReg)
20846       .addImm(UseFPOffset ? 16 : 8);
20847
20848     // Store it back into the va_list.
20849     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20850       .addOperand(Base)
20851       .addOperand(Scale)
20852       .addOperand(Index)
20853       .addDisp(Disp, UseFPOffset ? 4 : 0)
20854       .addOperand(Segment)
20855       .addReg(NextOffsetReg)
20856       .setMemRefs(MMOBegin, MMOEnd);
20857
20858     // Jump to endMBB
20859     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20860       .addMBB(endMBB);
20861   }
20862
20863   //
20864   // Emit code to use overflow area
20865   //
20866
20867   // Load the overflow_area address into a register.
20868   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20869   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20870     .addOperand(Base)
20871     .addOperand(Scale)
20872     .addOperand(Index)
20873     .addDisp(Disp, 8)
20874     .addOperand(Segment)
20875     .setMemRefs(MMOBegin, MMOEnd);
20876
20877   // If we need to align it, do so. Otherwise, just copy the address
20878   // to OverflowDestReg.
20879   if (NeedsAlign) {
20880     // Align the overflow address
20881     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20882     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20883
20884     // aligned_addr = (addr + (align-1)) & ~(align-1)
20885     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20886       .addReg(OverflowAddrReg)
20887       .addImm(Align-1);
20888
20889     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20890       .addReg(TmpReg)
20891       .addImm(~(uint64_t)(Align-1));
20892   } else {
20893     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20894       .addReg(OverflowAddrReg);
20895   }
20896
20897   // Compute the next overflow address after this argument.
20898   // (the overflow address should be kept 8-byte aligned)
20899   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20900   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20901     .addReg(OverflowDestReg)
20902     .addImm(ArgSizeA8);
20903
20904   // Store the new overflow address.
20905   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20906     .addOperand(Base)
20907     .addOperand(Scale)
20908     .addOperand(Index)
20909     .addDisp(Disp, 8)
20910     .addOperand(Segment)
20911     .addReg(NextAddrReg)
20912     .setMemRefs(MMOBegin, MMOEnd);
20913
20914   // If we branched, emit the PHI to the front of endMBB.
20915   if (offsetMBB) {
20916     BuildMI(*endMBB, endMBB->begin(), DL,
20917             TII->get(X86::PHI), DestReg)
20918       .addReg(OffsetDestReg).addMBB(offsetMBB)
20919       .addReg(OverflowDestReg).addMBB(overflowMBB);
20920   }
20921
20922   // Erase the pseudo instruction
20923   MI->eraseFromParent();
20924
20925   return endMBB;
20926 }
20927
20928 MachineBasicBlock *
20929 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20930                                                  MachineInstr *MI,
20931                                                  MachineBasicBlock *MBB) const {
20932   // Emit code to save XMM registers to the stack. The ABI says that the
20933   // number of registers to save is given in %al, so it's theoretically
20934   // possible to do an indirect jump trick to avoid saving all of them,
20935   // however this code takes a simpler approach and just executes all
20936   // of the stores if %al is non-zero. It's less code, and it's probably
20937   // easier on the hardware branch predictor, and stores aren't all that
20938   // expensive anyway.
20939
20940   // Create the new basic blocks. One block contains all the XMM stores,
20941   // and one block is the final destination regardless of whether any
20942   // stores were performed.
20943   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20944   MachineFunction *F = MBB->getParent();
20945   MachineFunction::iterator MBBIter = MBB;
20946   ++MBBIter;
20947   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20948   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20949   F->insert(MBBIter, XMMSaveMBB);
20950   F->insert(MBBIter, EndMBB);
20951
20952   // Transfer the remainder of MBB and its successor edges to EndMBB.
20953   EndMBB->splice(EndMBB->begin(), MBB,
20954                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20955   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20956
20957   // The original block will now fall through to the XMM save block.
20958   MBB->addSuccessor(XMMSaveMBB);
20959   // The XMMSaveMBB will fall through to the end block.
20960   XMMSaveMBB->addSuccessor(EndMBB);
20961
20962   // Now add the instructions.
20963   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20964   DebugLoc DL = MI->getDebugLoc();
20965
20966   unsigned CountReg = MI->getOperand(0).getReg();
20967   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20968   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20969
20970   if (!Subtarget->isTargetWin64()) {
20971     // If %al is 0, branch around the XMM save block.
20972     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20973     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20974     MBB->addSuccessor(EndMBB);
20975   }
20976
20977   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20978   // that was just emitted, but clearly shouldn't be "saved".
20979   assert((MI->getNumOperands() <= 3 ||
20980           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20981           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20982          && "Expected last argument to be EFLAGS");
20983   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20984   // In the XMM save block, save all the XMM argument registers.
20985   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20986     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20987     MachineMemOperand *MMO =
20988       F->getMachineMemOperand(
20989           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20990         MachineMemOperand::MOStore,
20991         /*Size=*/16, /*Align=*/16);
20992     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20993       .addFrameIndex(RegSaveFrameIndex)
20994       .addImm(/*Scale=*/1)
20995       .addReg(/*IndexReg=*/0)
20996       .addImm(/*Disp=*/Offset)
20997       .addReg(/*Segment=*/0)
20998       .addReg(MI->getOperand(i).getReg())
20999       .addMemOperand(MMO);
21000   }
21001
21002   MI->eraseFromParent();   // The pseudo instruction is gone now.
21003
21004   return EndMBB;
21005 }
21006
21007 // The EFLAGS operand of SelectItr might be missing a kill marker
21008 // because there were multiple uses of EFLAGS, and ISel didn't know
21009 // which to mark. Figure out whether SelectItr should have had a
21010 // kill marker, and set it if it should. Returns the correct kill
21011 // marker value.
21012 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21013                                      MachineBasicBlock* BB,
21014                                      const TargetRegisterInfo* TRI) {
21015   // Scan forward through BB for a use/def of EFLAGS.
21016   MachineBasicBlock::iterator miI(std::next(SelectItr));
21017   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21018     const MachineInstr& mi = *miI;
21019     if (mi.readsRegister(X86::EFLAGS))
21020       return false;
21021     if (mi.definesRegister(X86::EFLAGS))
21022       break; // Should have kill-flag - update below.
21023   }
21024
21025   // If we hit the end of the block, check whether EFLAGS is live into a
21026   // successor.
21027   if (miI == BB->end()) {
21028     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21029                                           sEnd = BB->succ_end();
21030          sItr != sEnd; ++sItr) {
21031       MachineBasicBlock* succ = *sItr;
21032       if (succ->isLiveIn(X86::EFLAGS))
21033         return false;
21034     }
21035   }
21036
21037   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21038   // out. SelectMI should have a kill flag on EFLAGS.
21039   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21040   return true;
21041 }
21042
21043 MachineBasicBlock *
21044 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21045                                      MachineBasicBlock *BB) const {
21046   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21047   DebugLoc DL = MI->getDebugLoc();
21048
21049   // To "insert" a SELECT_CC instruction, we actually have to insert the
21050   // diamond control-flow pattern.  The incoming instruction knows the
21051   // destination vreg to set, the condition code register to branch on, the
21052   // true/false values to select between, and a branch opcode to use.
21053   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21054   MachineFunction::iterator It = BB;
21055   ++It;
21056
21057   //  thisMBB:
21058   //  ...
21059   //   TrueVal = ...
21060   //   cmpTY ccX, r1, r2
21061   //   bCC copy1MBB
21062   //   fallthrough --> copy0MBB
21063   MachineBasicBlock *thisMBB = BB;
21064   MachineFunction *F = BB->getParent();
21065   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21066   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21067   F->insert(It, copy0MBB);
21068   F->insert(It, sinkMBB);
21069
21070   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21071   // live into the sink and copy blocks.
21072   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21073   if (!MI->killsRegister(X86::EFLAGS) &&
21074       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21075     copy0MBB->addLiveIn(X86::EFLAGS);
21076     sinkMBB->addLiveIn(X86::EFLAGS);
21077   }
21078
21079   // Transfer the remainder of BB and its successor edges to sinkMBB.
21080   sinkMBB->splice(sinkMBB->begin(), BB,
21081                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21082   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21083
21084   // Add the true and fallthrough blocks as its successors.
21085   BB->addSuccessor(copy0MBB);
21086   BB->addSuccessor(sinkMBB);
21087
21088   // Create the conditional branch instruction.
21089   unsigned Opc =
21090     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21091   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21092
21093   //  copy0MBB:
21094   //   %FalseValue = ...
21095   //   # fallthrough to sinkMBB
21096   copy0MBB->addSuccessor(sinkMBB);
21097
21098   //  sinkMBB:
21099   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21100   //  ...
21101   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21102           TII->get(X86::PHI), MI->getOperand(0).getReg())
21103     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21104     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21105
21106   MI->eraseFromParent();   // The pseudo instruction is gone now.
21107   return sinkMBB;
21108 }
21109
21110 MachineBasicBlock *
21111 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21112                                         MachineBasicBlock *BB) const {
21113   MachineFunction *MF = BB->getParent();
21114   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21115   DebugLoc DL = MI->getDebugLoc();
21116   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21117
21118   assert(MF->shouldSplitStack());
21119
21120   const bool Is64Bit = Subtarget->is64Bit();
21121   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21122
21123   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21124   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21125
21126   // BB:
21127   //  ... [Till the alloca]
21128   // If stacklet is not large enough, jump to mallocMBB
21129   //
21130   // bumpMBB:
21131   //  Allocate by subtracting from RSP
21132   //  Jump to continueMBB
21133   //
21134   // mallocMBB:
21135   //  Allocate by call to runtime
21136   //
21137   // continueMBB:
21138   //  ...
21139   //  [rest of original BB]
21140   //
21141
21142   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21143   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21144   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21145
21146   MachineRegisterInfo &MRI = MF->getRegInfo();
21147   const TargetRegisterClass *AddrRegClass =
21148     getRegClassFor(getPointerTy());
21149
21150   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21151     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21152     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21153     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21154     sizeVReg = MI->getOperand(1).getReg(),
21155     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21156
21157   MachineFunction::iterator MBBIter = BB;
21158   ++MBBIter;
21159
21160   MF->insert(MBBIter, bumpMBB);
21161   MF->insert(MBBIter, mallocMBB);
21162   MF->insert(MBBIter, continueMBB);
21163
21164   continueMBB->splice(continueMBB->begin(), BB,
21165                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21166   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21167
21168   // Add code to the main basic block to check if the stack limit has been hit,
21169   // and if so, jump to mallocMBB otherwise to bumpMBB.
21170   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21171   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21172     .addReg(tmpSPVReg).addReg(sizeVReg);
21173   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21174     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21175     .addReg(SPLimitVReg);
21176   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21177
21178   // bumpMBB simply decreases the stack pointer, since we know the current
21179   // stacklet has enough space.
21180   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21181     .addReg(SPLimitVReg);
21182   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21183     .addReg(SPLimitVReg);
21184   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21185
21186   // Calls into a routine in libgcc to allocate more space from the heap.
21187   const uint32_t *RegMask =
21188       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21189   if (IsLP64) {
21190     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21191       .addReg(sizeVReg);
21192     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21193       .addExternalSymbol("__morestack_allocate_stack_space")
21194       .addRegMask(RegMask)
21195       .addReg(X86::RDI, RegState::Implicit)
21196       .addReg(X86::RAX, RegState::ImplicitDefine);
21197   } else if (Is64Bit) {
21198     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21199       .addReg(sizeVReg);
21200     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21201       .addExternalSymbol("__morestack_allocate_stack_space")
21202       .addRegMask(RegMask)
21203       .addReg(X86::EDI, RegState::Implicit)
21204       .addReg(X86::EAX, RegState::ImplicitDefine);
21205   } else {
21206     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21207       .addImm(12);
21208     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21209     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21210       .addExternalSymbol("__morestack_allocate_stack_space")
21211       .addRegMask(RegMask)
21212       .addReg(X86::EAX, RegState::ImplicitDefine);
21213   }
21214
21215   if (!Is64Bit)
21216     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21217       .addImm(16);
21218
21219   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21220     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21221   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21222
21223   // Set up the CFG correctly.
21224   BB->addSuccessor(bumpMBB);
21225   BB->addSuccessor(mallocMBB);
21226   mallocMBB->addSuccessor(continueMBB);
21227   bumpMBB->addSuccessor(continueMBB);
21228
21229   // Take care of the PHI nodes.
21230   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21231           MI->getOperand(0).getReg())
21232     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21233     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21234
21235   // Delete the original pseudo instruction.
21236   MI->eraseFromParent();
21237
21238   // And we're done.
21239   return continueMBB;
21240 }
21241
21242 MachineBasicBlock *
21243 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21244                                         MachineBasicBlock *BB) const {
21245   DebugLoc DL = MI->getDebugLoc();
21246
21247   assert(!Subtarget->isTargetMachO());
21248
21249   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21250
21251   MI->eraseFromParent();   // The pseudo instruction is gone now.
21252   return BB;
21253 }
21254
21255 MachineBasicBlock *
21256 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21257                                       MachineBasicBlock *BB) const {
21258   // This is pretty easy.  We're taking the value that we received from
21259   // our load from the relocation, sticking it in either RDI (x86-64)
21260   // or EAX and doing an indirect call.  The return value will then
21261   // be in the normal return register.
21262   MachineFunction *F = BB->getParent();
21263   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21264   DebugLoc DL = MI->getDebugLoc();
21265
21266   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21267   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21268
21269   // Get a register mask for the lowered call.
21270   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21271   // proper register mask.
21272   const uint32_t *RegMask =
21273       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21274   if (Subtarget->is64Bit()) {
21275     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21276                                       TII->get(X86::MOV64rm), X86::RDI)
21277     .addReg(X86::RIP)
21278     .addImm(0).addReg(0)
21279     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21280                       MI->getOperand(3).getTargetFlags())
21281     .addReg(0);
21282     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21283     addDirectMem(MIB, X86::RDI);
21284     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21285   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21286     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21287                                       TII->get(X86::MOV32rm), X86::EAX)
21288     .addReg(0)
21289     .addImm(0).addReg(0)
21290     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21291                       MI->getOperand(3).getTargetFlags())
21292     .addReg(0);
21293     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21294     addDirectMem(MIB, X86::EAX);
21295     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21296   } else {
21297     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21298                                       TII->get(X86::MOV32rm), X86::EAX)
21299     .addReg(TII->getGlobalBaseReg(F))
21300     .addImm(0).addReg(0)
21301     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21302                       MI->getOperand(3).getTargetFlags())
21303     .addReg(0);
21304     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21305     addDirectMem(MIB, X86::EAX);
21306     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21307   }
21308
21309   MI->eraseFromParent(); // The pseudo instruction is gone now.
21310   return BB;
21311 }
21312
21313 MachineBasicBlock *
21314 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21315                                     MachineBasicBlock *MBB) const {
21316   DebugLoc DL = MI->getDebugLoc();
21317   MachineFunction *MF = MBB->getParent();
21318   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21319   MachineRegisterInfo &MRI = MF->getRegInfo();
21320
21321   const BasicBlock *BB = MBB->getBasicBlock();
21322   MachineFunction::iterator I = MBB;
21323   ++I;
21324
21325   // Memory Reference
21326   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21327   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21328
21329   unsigned DstReg;
21330   unsigned MemOpndSlot = 0;
21331
21332   unsigned CurOp = 0;
21333
21334   DstReg = MI->getOperand(CurOp++).getReg();
21335   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21336   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21337   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21338   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21339
21340   MemOpndSlot = CurOp;
21341
21342   MVT PVT = getPointerTy();
21343   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21344          "Invalid Pointer Size!");
21345
21346   // For v = setjmp(buf), we generate
21347   //
21348   // thisMBB:
21349   //  buf[LabelOffset] = restoreMBB
21350   //  SjLjSetup restoreMBB
21351   //
21352   // mainMBB:
21353   //  v_main = 0
21354   //
21355   // sinkMBB:
21356   //  v = phi(main, restore)
21357   //
21358   // restoreMBB:
21359   //  if base pointer being used, load it from frame
21360   //  v_restore = 1
21361
21362   MachineBasicBlock *thisMBB = MBB;
21363   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21364   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21365   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21366   MF->insert(I, mainMBB);
21367   MF->insert(I, sinkMBB);
21368   MF->push_back(restoreMBB);
21369
21370   MachineInstrBuilder MIB;
21371
21372   // Transfer the remainder of BB and its successor edges to sinkMBB.
21373   sinkMBB->splice(sinkMBB->begin(), MBB,
21374                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21375   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21376
21377   // thisMBB:
21378   unsigned PtrStoreOpc = 0;
21379   unsigned LabelReg = 0;
21380   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21381   Reloc::Model RM = MF->getTarget().getRelocationModel();
21382   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21383                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21384
21385   // Prepare IP either in reg or imm.
21386   if (!UseImmLabel) {
21387     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21388     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21389     LabelReg = MRI.createVirtualRegister(PtrRC);
21390     if (Subtarget->is64Bit()) {
21391       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21392               .addReg(X86::RIP)
21393               .addImm(0)
21394               .addReg(0)
21395               .addMBB(restoreMBB)
21396               .addReg(0);
21397     } else {
21398       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21399       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21400               .addReg(XII->getGlobalBaseReg(MF))
21401               .addImm(0)
21402               .addReg(0)
21403               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21404               .addReg(0);
21405     }
21406   } else
21407     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21408   // Store IP
21409   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21410   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21411     if (i == X86::AddrDisp)
21412       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21413     else
21414       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21415   }
21416   if (!UseImmLabel)
21417     MIB.addReg(LabelReg);
21418   else
21419     MIB.addMBB(restoreMBB);
21420   MIB.setMemRefs(MMOBegin, MMOEnd);
21421   // Setup
21422   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21423           .addMBB(restoreMBB);
21424
21425   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21426   MIB.addRegMask(RegInfo->getNoPreservedMask());
21427   thisMBB->addSuccessor(mainMBB);
21428   thisMBB->addSuccessor(restoreMBB);
21429
21430   // mainMBB:
21431   //  EAX = 0
21432   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21433   mainMBB->addSuccessor(sinkMBB);
21434
21435   // sinkMBB:
21436   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21437           TII->get(X86::PHI), DstReg)
21438     .addReg(mainDstReg).addMBB(mainMBB)
21439     .addReg(restoreDstReg).addMBB(restoreMBB);
21440
21441   // restoreMBB:
21442   if (RegInfo->hasBasePointer(*MF)) {
21443     const bool Uses64BitFramePtr =
21444         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21445     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21446     X86FI->setRestoreBasePointer(MF);
21447     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21448     unsigned BasePtr = RegInfo->getBaseRegister();
21449     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21450     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21451                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21452       .setMIFlag(MachineInstr::FrameSetup);
21453   }
21454   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21455   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21456   restoreMBB->addSuccessor(sinkMBB);
21457
21458   MI->eraseFromParent();
21459   return sinkMBB;
21460 }
21461
21462 MachineBasicBlock *
21463 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21464                                      MachineBasicBlock *MBB) const {
21465   DebugLoc DL = MI->getDebugLoc();
21466   MachineFunction *MF = MBB->getParent();
21467   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21468   MachineRegisterInfo &MRI = MF->getRegInfo();
21469
21470   // Memory Reference
21471   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21472   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21473
21474   MVT PVT = getPointerTy();
21475   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21476          "Invalid Pointer Size!");
21477
21478   const TargetRegisterClass *RC =
21479     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21480   unsigned Tmp = MRI.createVirtualRegister(RC);
21481   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21482   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21483   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21484   unsigned SP = RegInfo->getStackRegister();
21485
21486   MachineInstrBuilder MIB;
21487
21488   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21489   const int64_t SPOffset = 2 * PVT.getStoreSize();
21490
21491   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21492   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21493
21494   // Reload FP
21495   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21496   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21497     MIB.addOperand(MI->getOperand(i));
21498   MIB.setMemRefs(MMOBegin, MMOEnd);
21499   // Reload IP
21500   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21501   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21502     if (i == X86::AddrDisp)
21503       MIB.addDisp(MI->getOperand(i), LabelOffset);
21504     else
21505       MIB.addOperand(MI->getOperand(i));
21506   }
21507   MIB.setMemRefs(MMOBegin, MMOEnd);
21508   // Reload SP
21509   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21510   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21511     if (i == X86::AddrDisp)
21512       MIB.addDisp(MI->getOperand(i), SPOffset);
21513     else
21514       MIB.addOperand(MI->getOperand(i));
21515   }
21516   MIB.setMemRefs(MMOBegin, MMOEnd);
21517   // Jump
21518   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21519
21520   MI->eraseFromParent();
21521   return MBB;
21522 }
21523
21524 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21525 // accumulator loops. Writing back to the accumulator allows the coalescer
21526 // to remove extra copies in the loop.
21527 MachineBasicBlock *
21528 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21529                                  MachineBasicBlock *MBB) const {
21530   MachineOperand &AddendOp = MI->getOperand(3);
21531
21532   // Bail out early if the addend isn't a register - we can't switch these.
21533   if (!AddendOp.isReg())
21534     return MBB;
21535
21536   MachineFunction &MF = *MBB->getParent();
21537   MachineRegisterInfo &MRI = MF.getRegInfo();
21538
21539   // Check whether the addend is defined by a PHI:
21540   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21541   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21542   if (!AddendDef.isPHI())
21543     return MBB;
21544
21545   // Look for the following pattern:
21546   // loop:
21547   //   %addend = phi [%entry, 0], [%loop, %result]
21548   //   ...
21549   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21550
21551   // Replace with:
21552   //   loop:
21553   //   %addend = phi [%entry, 0], [%loop, %result]
21554   //   ...
21555   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21556
21557   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21558     assert(AddendDef.getOperand(i).isReg());
21559     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21560     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21561     if (&PHISrcInst == MI) {
21562       // Found a matching instruction.
21563       unsigned NewFMAOpc = 0;
21564       switch (MI->getOpcode()) {
21565         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21566         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21567         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21568         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21569         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21570         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21571         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21572         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21573         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21574         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21575         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21576         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21577         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21578         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21579         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21580         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21581         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21582         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21583         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21584         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21585
21586         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21587         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21588         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21589         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21590         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21591         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21592         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21593         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21594         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21595         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21596         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21597         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21598         default: llvm_unreachable("Unrecognized FMA variant.");
21599       }
21600
21601       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21602       MachineInstrBuilder MIB =
21603         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21604         .addOperand(MI->getOperand(0))
21605         .addOperand(MI->getOperand(3))
21606         .addOperand(MI->getOperand(2))
21607         .addOperand(MI->getOperand(1));
21608       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21609       MI->eraseFromParent();
21610     }
21611   }
21612
21613   return MBB;
21614 }
21615
21616 MachineBasicBlock *
21617 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21618                                                MachineBasicBlock *BB) const {
21619   switch (MI->getOpcode()) {
21620   default: llvm_unreachable("Unexpected instr type to insert");
21621   case X86::TAILJMPd64:
21622   case X86::TAILJMPr64:
21623   case X86::TAILJMPm64:
21624   case X86::TAILJMPd64_REX:
21625   case X86::TAILJMPr64_REX:
21626   case X86::TAILJMPm64_REX:
21627     llvm_unreachable("TAILJMP64 would not be touched here.");
21628   case X86::TCRETURNdi64:
21629   case X86::TCRETURNri64:
21630   case X86::TCRETURNmi64:
21631     return BB;
21632   case X86::WIN_ALLOCA:
21633     return EmitLoweredWinAlloca(MI, BB);
21634   case X86::SEG_ALLOCA_32:
21635   case X86::SEG_ALLOCA_64:
21636     return EmitLoweredSegAlloca(MI, BB);
21637   case X86::TLSCall_32:
21638   case X86::TLSCall_64:
21639     return EmitLoweredTLSCall(MI, BB);
21640   case X86::CMOV_GR8:
21641   case X86::CMOV_FR32:
21642   case X86::CMOV_FR64:
21643   case X86::CMOV_V4F32:
21644   case X86::CMOV_V2F64:
21645   case X86::CMOV_V2I64:
21646   case X86::CMOV_V8F32:
21647   case X86::CMOV_V4F64:
21648   case X86::CMOV_V4I64:
21649   case X86::CMOV_V16F32:
21650   case X86::CMOV_V8F64:
21651   case X86::CMOV_V8I64:
21652   case X86::CMOV_GR16:
21653   case X86::CMOV_GR32:
21654   case X86::CMOV_RFP32:
21655   case X86::CMOV_RFP64:
21656   case X86::CMOV_RFP80:
21657     return EmitLoweredSelect(MI, BB);
21658
21659   case X86::FP32_TO_INT16_IN_MEM:
21660   case X86::FP32_TO_INT32_IN_MEM:
21661   case X86::FP32_TO_INT64_IN_MEM:
21662   case X86::FP64_TO_INT16_IN_MEM:
21663   case X86::FP64_TO_INT32_IN_MEM:
21664   case X86::FP64_TO_INT64_IN_MEM:
21665   case X86::FP80_TO_INT16_IN_MEM:
21666   case X86::FP80_TO_INT32_IN_MEM:
21667   case X86::FP80_TO_INT64_IN_MEM: {
21668     MachineFunction *F = BB->getParent();
21669     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21670     DebugLoc DL = MI->getDebugLoc();
21671
21672     // Change the floating point control register to use "round towards zero"
21673     // mode when truncating to an integer value.
21674     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21675     addFrameReference(BuildMI(*BB, MI, DL,
21676                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21677
21678     // Load the old value of the high byte of the control word...
21679     unsigned OldCW =
21680       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21681     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21682                       CWFrameIdx);
21683
21684     // Set the high part to be round to zero...
21685     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21686       .addImm(0xC7F);
21687
21688     // Reload the modified control word now...
21689     addFrameReference(BuildMI(*BB, MI, DL,
21690                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21691
21692     // Restore the memory image of control word to original value
21693     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21694       .addReg(OldCW);
21695
21696     // Get the X86 opcode to use.
21697     unsigned Opc;
21698     switch (MI->getOpcode()) {
21699     default: llvm_unreachable("illegal opcode!");
21700     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21701     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21702     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21703     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21704     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21705     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21706     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21707     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21708     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21709     }
21710
21711     X86AddressMode AM;
21712     MachineOperand &Op = MI->getOperand(0);
21713     if (Op.isReg()) {
21714       AM.BaseType = X86AddressMode::RegBase;
21715       AM.Base.Reg = Op.getReg();
21716     } else {
21717       AM.BaseType = X86AddressMode::FrameIndexBase;
21718       AM.Base.FrameIndex = Op.getIndex();
21719     }
21720     Op = MI->getOperand(1);
21721     if (Op.isImm())
21722       AM.Scale = Op.getImm();
21723     Op = MI->getOperand(2);
21724     if (Op.isImm())
21725       AM.IndexReg = Op.getImm();
21726     Op = MI->getOperand(3);
21727     if (Op.isGlobal()) {
21728       AM.GV = Op.getGlobal();
21729     } else {
21730       AM.Disp = Op.getImm();
21731     }
21732     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21733                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21734
21735     // Reload the original control word now.
21736     addFrameReference(BuildMI(*BB, MI, DL,
21737                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21738
21739     MI->eraseFromParent();   // The pseudo instruction is gone now.
21740     return BB;
21741   }
21742     // String/text processing lowering.
21743   case X86::PCMPISTRM128REG:
21744   case X86::VPCMPISTRM128REG:
21745   case X86::PCMPISTRM128MEM:
21746   case X86::VPCMPISTRM128MEM:
21747   case X86::PCMPESTRM128REG:
21748   case X86::VPCMPESTRM128REG:
21749   case X86::PCMPESTRM128MEM:
21750   case X86::VPCMPESTRM128MEM:
21751     assert(Subtarget->hasSSE42() &&
21752            "Target must have SSE4.2 or AVX features enabled");
21753     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21754
21755   // String/text processing lowering.
21756   case X86::PCMPISTRIREG:
21757   case X86::VPCMPISTRIREG:
21758   case X86::PCMPISTRIMEM:
21759   case X86::VPCMPISTRIMEM:
21760   case X86::PCMPESTRIREG:
21761   case X86::VPCMPESTRIREG:
21762   case X86::PCMPESTRIMEM:
21763   case X86::VPCMPESTRIMEM:
21764     assert(Subtarget->hasSSE42() &&
21765            "Target must have SSE4.2 or AVX features enabled");
21766     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21767
21768   // Thread synchronization.
21769   case X86::MONITOR:
21770     return EmitMonitor(MI, BB, Subtarget);
21771
21772   // xbegin
21773   case X86::XBEGIN:
21774     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21775
21776   case X86::VASTART_SAVE_XMM_REGS:
21777     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21778
21779   case X86::VAARG_64:
21780     return EmitVAARG64WithCustomInserter(MI, BB);
21781
21782   case X86::EH_SjLj_SetJmp32:
21783   case X86::EH_SjLj_SetJmp64:
21784     return emitEHSjLjSetJmp(MI, BB);
21785
21786   case X86::EH_SjLj_LongJmp32:
21787   case X86::EH_SjLj_LongJmp64:
21788     return emitEHSjLjLongJmp(MI, BB);
21789
21790   case TargetOpcode::STATEPOINT:
21791     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21792     // this point in the process.  We diverge later.
21793     return emitPatchPoint(MI, BB);
21794
21795   case TargetOpcode::STACKMAP:
21796   case TargetOpcode::PATCHPOINT:
21797     return emitPatchPoint(MI, BB);
21798
21799   case X86::VFMADDPDr213r:
21800   case X86::VFMADDPSr213r:
21801   case X86::VFMADDSDr213r:
21802   case X86::VFMADDSSr213r:
21803   case X86::VFMSUBPDr213r:
21804   case X86::VFMSUBPSr213r:
21805   case X86::VFMSUBSDr213r:
21806   case X86::VFMSUBSSr213r:
21807   case X86::VFNMADDPDr213r:
21808   case X86::VFNMADDPSr213r:
21809   case X86::VFNMADDSDr213r:
21810   case X86::VFNMADDSSr213r:
21811   case X86::VFNMSUBPDr213r:
21812   case X86::VFNMSUBPSr213r:
21813   case X86::VFNMSUBSDr213r:
21814   case X86::VFNMSUBSSr213r:
21815   case X86::VFMADDSUBPDr213r:
21816   case X86::VFMADDSUBPSr213r:
21817   case X86::VFMSUBADDPDr213r:
21818   case X86::VFMSUBADDPSr213r:
21819   case X86::VFMADDPDr213rY:
21820   case X86::VFMADDPSr213rY:
21821   case X86::VFMSUBPDr213rY:
21822   case X86::VFMSUBPSr213rY:
21823   case X86::VFNMADDPDr213rY:
21824   case X86::VFNMADDPSr213rY:
21825   case X86::VFNMSUBPDr213rY:
21826   case X86::VFNMSUBPSr213rY:
21827   case X86::VFMADDSUBPDr213rY:
21828   case X86::VFMADDSUBPSr213rY:
21829   case X86::VFMSUBADDPDr213rY:
21830   case X86::VFMSUBADDPSr213rY:
21831     return emitFMA3Instr(MI, BB);
21832   }
21833 }
21834
21835 //===----------------------------------------------------------------------===//
21836 //                           X86 Optimization Hooks
21837 //===----------------------------------------------------------------------===//
21838
21839 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21840                                                       APInt &KnownZero,
21841                                                       APInt &KnownOne,
21842                                                       const SelectionDAG &DAG,
21843                                                       unsigned Depth) const {
21844   unsigned BitWidth = KnownZero.getBitWidth();
21845   unsigned Opc = Op.getOpcode();
21846   assert((Opc >= ISD::BUILTIN_OP_END ||
21847           Opc == ISD::INTRINSIC_WO_CHAIN ||
21848           Opc == ISD::INTRINSIC_W_CHAIN ||
21849           Opc == ISD::INTRINSIC_VOID) &&
21850          "Should use MaskedValueIsZero if you don't know whether Op"
21851          " is a target node!");
21852
21853   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21854   switch (Opc) {
21855   default: break;
21856   case X86ISD::ADD:
21857   case X86ISD::SUB:
21858   case X86ISD::ADC:
21859   case X86ISD::SBB:
21860   case X86ISD::SMUL:
21861   case X86ISD::UMUL:
21862   case X86ISD::INC:
21863   case X86ISD::DEC:
21864   case X86ISD::OR:
21865   case X86ISD::XOR:
21866   case X86ISD::AND:
21867     // These nodes' second result is a boolean.
21868     if (Op.getResNo() == 0)
21869       break;
21870     // Fallthrough
21871   case X86ISD::SETCC:
21872     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21873     break;
21874   case ISD::INTRINSIC_WO_CHAIN: {
21875     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21876     unsigned NumLoBits = 0;
21877     switch (IntId) {
21878     default: break;
21879     case Intrinsic::x86_sse_movmsk_ps:
21880     case Intrinsic::x86_avx_movmsk_ps_256:
21881     case Intrinsic::x86_sse2_movmsk_pd:
21882     case Intrinsic::x86_avx_movmsk_pd_256:
21883     case Intrinsic::x86_mmx_pmovmskb:
21884     case Intrinsic::x86_sse2_pmovmskb_128:
21885     case Intrinsic::x86_avx2_pmovmskb: {
21886       // High bits of movmskp{s|d}, pmovmskb are known zero.
21887       switch (IntId) {
21888         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21889         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21890         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21891         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21892         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21893         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21894         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21895         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21896       }
21897       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21898       break;
21899     }
21900     }
21901     break;
21902   }
21903   }
21904 }
21905
21906 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21907   SDValue Op,
21908   const SelectionDAG &,
21909   unsigned Depth) const {
21910   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21911   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21912     return Op.getValueType().getScalarType().getSizeInBits();
21913
21914   // Fallback case.
21915   return 1;
21916 }
21917
21918 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21919 /// node is a GlobalAddress + offset.
21920 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21921                                        const GlobalValue* &GA,
21922                                        int64_t &Offset) const {
21923   if (N->getOpcode() == X86ISD::Wrapper) {
21924     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21925       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21926       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21927       return true;
21928     }
21929   }
21930   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21931 }
21932
21933 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21934 /// same as extracting the high 128-bit part of 256-bit vector and then
21935 /// inserting the result into the low part of a new 256-bit vector
21936 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21937   EVT VT = SVOp->getValueType(0);
21938   unsigned NumElems = VT.getVectorNumElements();
21939
21940   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21941   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21942     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21943         SVOp->getMaskElt(j) >= 0)
21944       return false;
21945
21946   return true;
21947 }
21948
21949 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21950 /// same as extracting the low 128-bit part of 256-bit vector and then
21951 /// inserting the result into the high part of a new 256-bit vector
21952 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21953   EVT VT = SVOp->getValueType(0);
21954   unsigned NumElems = VT.getVectorNumElements();
21955
21956   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21957   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21958     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21959         SVOp->getMaskElt(j) >= 0)
21960       return false;
21961
21962   return true;
21963 }
21964
21965 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21966 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21967                                         TargetLowering::DAGCombinerInfo &DCI,
21968                                         const X86Subtarget* Subtarget) {
21969   SDLoc dl(N);
21970   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21971   SDValue V1 = SVOp->getOperand(0);
21972   SDValue V2 = SVOp->getOperand(1);
21973   EVT VT = SVOp->getValueType(0);
21974   unsigned NumElems = VT.getVectorNumElements();
21975
21976   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21977       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21978     //
21979     //                   0,0,0,...
21980     //                      |
21981     //    V      UNDEF    BUILD_VECTOR    UNDEF
21982     //     \      /           \           /
21983     //  CONCAT_VECTOR         CONCAT_VECTOR
21984     //         \                  /
21985     //          \                /
21986     //          RESULT: V + zero extended
21987     //
21988     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21989         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21990         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21991       return SDValue();
21992
21993     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21994       return SDValue();
21995
21996     // To match the shuffle mask, the first half of the mask should
21997     // be exactly the first vector, and all the rest a splat with the
21998     // first element of the second one.
21999     for (unsigned i = 0; i != NumElems/2; ++i)
22000       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22001           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22002         return SDValue();
22003
22004     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22005     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22006       if (Ld->hasNUsesOfValue(1, 0)) {
22007         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22008         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22009         SDValue ResNode =
22010           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22011                                   Ld->getMemoryVT(),
22012                                   Ld->getPointerInfo(),
22013                                   Ld->getAlignment(),
22014                                   false/*isVolatile*/, true/*ReadMem*/,
22015                                   false/*WriteMem*/);
22016
22017         // Make sure the newly-created LOAD is in the same position as Ld in
22018         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22019         // and update uses of Ld's output chain to use the TokenFactor.
22020         if (Ld->hasAnyUseOfValue(1)) {
22021           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22022                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22023           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22024           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22025                                  SDValue(ResNode.getNode(), 1));
22026         }
22027
22028         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22029       }
22030     }
22031
22032     // Emit a zeroed vector and insert the desired subvector on its
22033     // first half.
22034     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22035     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22036     return DCI.CombineTo(N, InsV);
22037   }
22038
22039   //===--------------------------------------------------------------------===//
22040   // Combine some shuffles into subvector extracts and inserts:
22041   //
22042
22043   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22044   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22045     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22046     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22047     return DCI.CombineTo(N, InsV);
22048   }
22049
22050   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22051   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22052     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22053     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22054     return DCI.CombineTo(N, InsV);
22055   }
22056
22057   return SDValue();
22058 }
22059
22060 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22061 /// possible.
22062 ///
22063 /// This is the leaf of the recursive combinine below. When we have found some
22064 /// chain of single-use x86 shuffle instructions and accumulated the combined
22065 /// shuffle mask represented by them, this will try to pattern match that mask
22066 /// into either a single instruction if there is a special purpose instruction
22067 /// for this operation, or into a PSHUFB instruction which is a fully general
22068 /// instruction but should only be used to replace chains over a certain depth.
22069 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22070                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22071                                    TargetLowering::DAGCombinerInfo &DCI,
22072                                    const X86Subtarget *Subtarget) {
22073   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22074
22075   // Find the operand that enters the chain. Note that multiple uses are OK
22076   // here, we're not going to remove the operand we find.
22077   SDValue Input = Op.getOperand(0);
22078   while (Input.getOpcode() == ISD::BITCAST)
22079     Input = Input.getOperand(0);
22080
22081   MVT VT = Input.getSimpleValueType();
22082   MVT RootVT = Root.getSimpleValueType();
22083   SDLoc DL(Root);
22084
22085   // Just remove no-op shuffle masks.
22086   if (Mask.size() == 1) {
22087     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22088                   /*AddTo*/ true);
22089     return true;
22090   }
22091
22092   // Use the float domain if the operand type is a floating point type.
22093   bool FloatDomain = VT.isFloatingPoint();
22094
22095   // For floating point shuffles, we don't have free copies in the shuffle
22096   // instructions or the ability to load as part of the instruction, so
22097   // canonicalize their shuffles to UNPCK or MOV variants.
22098   //
22099   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22100   // vectors because it can have a load folded into it that UNPCK cannot. This
22101   // doesn't preclude something switching to the shorter encoding post-RA.
22102   if (FloatDomain) {
22103     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22104       bool Lo = Mask.equals(0, 0);
22105       unsigned Shuffle;
22106       MVT ShuffleVT;
22107       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22108       // is no slower than UNPCKLPD but has the option to fold the input operand
22109       // into even an unaligned memory load.
22110       if (Lo && Subtarget->hasSSE3()) {
22111         Shuffle = X86ISD::MOVDDUP;
22112         ShuffleVT = MVT::v2f64;
22113       } else {
22114         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22115         // than the UNPCK variants.
22116         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22117         ShuffleVT = MVT::v4f32;
22118       }
22119       if (Depth == 1 && Root->getOpcode() == Shuffle)
22120         return false; // Nothing to do!
22121       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22122       DCI.AddToWorklist(Op.getNode());
22123       if (Shuffle == X86ISD::MOVDDUP)
22124         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22125       else
22126         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22127       DCI.AddToWorklist(Op.getNode());
22128       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22129                     /*AddTo*/ true);
22130       return true;
22131     }
22132     if (Subtarget->hasSSE3() &&
22133         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22134       bool Lo = Mask.equals(0, 0, 2, 2);
22135       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22136       MVT ShuffleVT = MVT::v4f32;
22137       if (Depth == 1 && Root->getOpcode() == Shuffle)
22138         return false; // Nothing to do!
22139       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22140       DCI.AddToWorklist(Op.getNode());
22141       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22142       DCI.AddToWorklist(Op.getNode());
22143       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22144                     /*AddTo*/ true);
22145       return true;
22146     }
22147     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22148       bool Lo = Mask.equals(0, 0, 1, 1);
22149       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22150       MVT ShuffleVT = MVT::v4f32;
22151       if (Depth == 1 && Root->getOpcode() == Shuffle)
22152         return false; // Nothing to do!
22153       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22154       DCI.AddToWorklist(Op.getNode());
22155       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22156       DCI.AddToWorklist(Op.getNode());
22157       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22158                     /*AddTo*/ true);
22159       return true;
22160     }
22161   }
22162
22163   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22164   // variants as none of these have single-instruction variants that are
22165   // superior to the UNPCK formulation.
22166   if (!FloatDomain &&
22167       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22168        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22169        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22170        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22171                    15))) {
22172     bool Lo = Mask[0] == 0;
22173     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22174     if (Depth == 1 && Root->getOpcode() == Shuffle)
22175       return false; // Nothing to do!
22176     MVT ShuffleVT;
22177     switch (Mask.size()) {
22178     case 8:
22179       ShuffleVT = MVT::v8i16;
22180       break;
22181     case 16:
22182       ShuffleVT = MVT::v16i8;
22183       break;
22184     default:
22185       llvm_unreachable("Impossible mask size!");
22186     };
22187     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22188     DCI.AddToWorklist(Op.getNode());
22189     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22190     DCI.AddToWorklist(Op.getNode());
22191     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22192                   /*AddTo*/ true);
22193     return true;
22194   }
22195
22196   // Don't try to re-form single instruction chains under any circumstances now
22197   // that we've done encoding canonicalization for them.
22198   if (Depth < 2)
22199     return false;
22200
22201   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22202   // can replace them with a single PSHUFB instruction profitably. Intel's
22203   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22204   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22205   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22206     SmallVector<SDValue, 16> PSHUFBMask;
22207     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22208     int Ratio = 16 / Mask.size();
22209     for (unsigned i = 0; i < 16; ++i) {
22210       if (Mask[i / Ratio] == SM_SentinelUndef) {
22211         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22212         continue;
22213       }
22214       int M = Mask[i / Ratio] != SM_SentinelZero
22215                   ? Ratio * Mask[i / Ratio] + i % Ratio
22216                   : 255;
22217       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22218     }
22219     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22220     DCI.AddToWorklist(Op.getNode());
22221     SDValue PSHUFBMaskOp =
22222         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22223     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22224     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22225     DCI.AddToWorklist(Op.getNode());
22226     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22227                   /*AddTo*/ true);
22228     return true;
22229   }
22230
22231   // Failed to find any combines.
22232   return false;
22233 }
22234
22235 /// \brief Fully generic combining of x86 shuffle instructions.
22236 ///
22237 /// This should be the last combine run over the x86 shuffle instructions. Once
22238 /// they have been fully optimized, this will recursively consider all chains
22239 /// of single-use shuffle instructions, build a generic model of the cumulative
22240 /// shuffle operation, and check for simpler instructions which implement this
22241 /// operation. We use this primarily for two purposes:
22242 ///
22243 /// 1) Collapse generic shuffles to specialized single instructions when
22244 ///    equivalent. In most cases, this is just an encoding size win, but
22245 ///    sometimes we will collapse multiple generic shuffles into a single
22246 ///    special-purpose shuffle.
22247 /// 2) Look for sequences of shuffle instructions with 3 or more total
22248 ///    instructions, and replace them with the slightly more expensive SSSE3
22249 ///    PSHUFB instruction if available. We do this as the last combining step
22250 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22251 ///    a suitable short sequence of other instructions. The PSHUFB will either
22252 ///    use a register or have to read from memory and so is slightly (but only
22253 ///    slightly) more expensive than the other shuffle instructions.
22254 ///
22255 /// Because this is inherently a quadratic operation (for each shuffle in
22256 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22257 /// This should never be an issue in practice as the shuffle lowering doesn't
22258 /// produce sequences of more than 8 instructions.
22259 ///
22260 /// FIXME: We will currently miss some cases where the redundant shuffling
22261 /// would simplify under the threshold for PSHUFB formation because of
22262 /// combine-ordering. To fix this, we should do the redundant instruction
22263 /// combining in this recursive walk.
22264 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22265                                           ArrayRef<int> RootMask,
22266                                           int Depth, bool HasPSHUFB,
22267                                           SelectionDAG &DAG,
22268                                           TargetLowering::DAGCombinerInfo &DCI,
22269                                           const X86Subtarget *Subtarget) {
22270   // Bound the depth of our recursive combine because this is ultimately
22271   // quadratic in nature.
22272   if (Depth > 8)
22273     return false;
22274
22275   // Directly rip through bitcasts to find the underlying operand.
22276   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22277     Op = Op.getOperand(0);
22278
22279   MVT VT = Op.getSimpleValueType();
22280   if (!VT.isVector())
22281     return false; // Bail if we hit a non-vector.
22282   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22283   // version should be added.
22284   if (VT.getSizeInBits() != 128)
22285     return false;
22286
22287   assert(Root.getSimpleValueType().isVector() &&
22288          "Shuffles operate on vector types!");
22289   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22290          "Can only combine shuffles of the same vector register size.");
22291
22292   if (!isTargetShuffle(Op.getOpcode()))
22293     return false;
22294   SmallVector<int, 16> OpMask;
22295   bool IsUnary;
22296   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22297   // We only can combine unary shuffles which we can decode the mask for.
22298   if (!HaveMask || !IsUnary)
22299     return false;
22300
22301   assert(VT.getVectorNumElements() == OpMask.size() &&
22302          "Different mask size from vector size!");
22303   assert(((RootMask.size() > OpMask.size() &&
22304            RootMask.size() % OpMask.size() == 0) ||
22305           (OpMask.size() > RootMask.size() &&
22306            OpMask.size() % RootMask.size() == 0) ||
22307           OpMask.size() == RootMask.size()) &&
22308          "The smaller number of elements must divide the larger.");
22309   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22310   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22311   assert(((RootRatio == 1 && OpRatio == 1) ||
22312           (RootRatio == 1) != (OpRatio == 1)) &&
22313          "Must not have a ratio for both incoming and op masks!");
22314
22315   SmallVector<int, 16> Mask;
22316   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22317
22318   // Merge this shuffle operation's mask into our accumulated mask. Note that
22319   // this shuffle's mask will be the first applied to the input, followed by the
22320   // root mask to get us all the way to the root value arrangement. The reason
22321   // for this order is that we are recursing up the operation chain.
22322   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22323     int RootIdx = i / RootRatio;
22324     if (RootMask[RootIdx] < 0) {
22325       // This is a zero or undef lane, we're done.
22326       Mask.push_back(RootMask[RootIdx]);
22327       continue;
22328     }
22329
22330     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22331     int OpIdx = RootMaskedIdx / OpRatio;
22332     if (OpMask[OpIdx] < 0) {
22333       // The incoming lanes are zero or undef, it doesn't matter which ones we
22334       // are using.
22335       Mask.push_back(OpMask[OpIdx]);
22336       continue;
22337     }
22338
22339     // Ok, we have non-zero lanes, map them through.
22340     Mask.push_back(OpMask[OpIdx] * OpRatio +
22341                    RootMaskedIdx % OpRatio);
22342   }
22343
22344   // See if we can recurse into the operand to combine more things.
22345   switch (Op.getOpcode()) {
22346     case X86ISD::PSHUFB:
22347       HasPSHUFB = true;
22348     case X86ISD::PSHUFD:
22349     case X86ISD::PSHUFHW:
22350     case X86ISD::PSHUFLW:
22351       if (Op.getOperand(0).hasOneUse() &&
22352           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22353                                         HasPSHUFB, DAG, DCI, Subtarget))
22354         return true;
22355       break;
22356
22357     case X86ISD::UNPCKL:
22358     case X86ISD::UNPCKH:
22359       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22360       // We can't check for single use, we have to check that this shuffle is the only user.
22361       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22362           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22363                                         HasPSHUFB, DAG, DCI, Subtarget))
22364           return true;
22365       break;
22366   }
22367
22368   // Minor canonicalization of the accumulated shuffle mask to make it easier
22369   // to match below. All this does is detect masks with squential pairs of
22370   // elements, and shrink them to the half-width mask. It does this in a loop
22371   // so it will reduce the size of the mask to the minimal width mask which
22372   // performs an equivalent shuffle.
22373   SmallVector<int, 16> WidenedMask;
22374   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22375     Mask = std::move(WidenedMask);
22376     WidenedMask.clear();
22377   }
22378
22379   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22380                                 Subtarget);
22381 }
22382
22383 /// \brief Get the PSHUF-style mask from PSHUF node.
22384 ///
22385 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22386 /// PSHUF-style masks that can be reused with such instructions.
22387 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22388   SmallVector<int, 4> Mask;
22389   bool IsUnary;
22390   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22391   (void)HaveMask;
22392   assert(HaveMask);
22393
22394   switch (N.getOpcode()) {
22395   case X86ISD::PSHUFD:
22396     return Mask;
22397   case X86ISD::PSHUFLW:
22398     Mask.resize(4);
22399     return Mask;
22400   case X86ISD::PSHUFHW:
22401     Mask.erase(Mask.begin(), Mask.begin() + 4);
22402     for (int &M : Mask)
22403       M -= 4;
22404     return Mask;
22405   default:
22406     llvm_unreachable("No valid shuffle instruction found!");
22407   }
22408 }
22409
22410 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22411 ///
22412 /// We walk up the chain and look for a combinable shuffle, skipping over
22413 /// shuffles that we could hoist this shuffle's transformation past without
22414 /// altering anything.
22415 static SDValue
22416 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22417                              SelectionDAG &DAG,
22418                              TargetLowering::DAGCombinerInfo &DCI) {
22419   assert(N.getOpcode() == X86ISD::PSHUFD &&
22420          "Called with something other than an x86 128-bit half shuffle!");
22421   SDLoc DL(N);
22422
22423   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22424   // of the shuffles in the chain so that we can form a fresh chain to replace
22425   // this one.
22426   SmallVector<SDValue, 8> Chain;
22427   SDValue V = N.getOperand(0);
22428   for (; V.hasOneUse(); V = V.getOperand(0)) {
22429     switch (V.getOpcode()) {
22430     default:
22431       return SDValue(); // Nothing combined!
22432
22433     case ISD::BITCAST:
22434       // Skip bitcasts as we always know the type for the target specific
22435       // instructions.
22436       continue;
22437
22438     case X86ISD::PSHUFD:
22439       // Found another dword shuffle.
22440       break;
22441
22442     case X86ISD::PSHUFLW:
22443       // Check that the low words (being shuffled) are the identity in the
22444       // dword shuffle, and the high words are self-contained.
22445       if (Mask[0] != 0 || Mask[1] != 1 ||
22446           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22447         return SDValue();
22448
22449       Chain.push_back(V);
22450       continue;
22451
22452     case X86ISD::PSHUFHW:
22453       // Check that the high words (being shuffled) are the identity in the
22454       // dword shuffle, and the low words are self-contained.
22455       if (Mask[2] != 2 || Mask[3] != 3 ||
22456           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22457         return SDValue();
22458
22459       Chain.push_back(V);
22460       continue;
22461
22462     case X86ISD::UNPCKL:
22463     case X86ISD::UNPCKH:
22464       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22465       // shuffle into a preceding word shuffle.
22466       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22467         return SDValue();
22468
22469       // Search for a half-shuffle which we can combine with.
22470       unsigned CombineOp =
22471           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22472       if (V.getOperand(0) != V.getOperand(1) ||
22473           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22474         return SDValue();
22475       Chain.push_back(V);
22476       V = V.getOperand(0);
22477       do {
22478         switch (V.getOpcode()) {
22479         default:
22480           return SDValue(); // Nothing to combine.
22481
22482         case X86ISD::PSHUFLW:
22483         case X86ISD::PSHUFHW:
22484           if (V.getOpcode() == CombineOp)
22485             break;
22486
22487           Chain.push_back(V);
22488
22489           // Fallthrough!
22490         case ISD::BITCAST:
22491           V = V.getOperand(0);
22492           continue;
22493         }
22494         break;
22495       } while (V.hasOneUse());
22496       break;
22497     }
22498     // Break out of the loop if we break out of the switch.
22499     break;
22500   }
22501
22502   if (!V.hasOneUse())
22503     // We fell out of the loop without finding a viable combining instruction.
22504     return SDValue();
22505
22506   // Merge this node's mask and our incoming mask.
22507   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22508   for (int &M : Mask)
22509     M = VMask[M];
22510   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22511                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22512
22513   // Rebuild the chain around this new shuffle.
22514   while (!Chain.empty()) {
22515     SDValue W = Chain.pop_back_val();
22516
22517     if (V.getValueType() != W.getOperand(0).getValueType())
22518       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22519
22520     switch (W.getOpcode()) {
22521     default:
22522       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22523
22524     case X86ISD::UNPCKL:
22525     case X86ISD::UNPCKH:
22526       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22527       break;
22528
22529     case X86ISD::PSHUFD:
22530     case X86ISD::PSHUFLW:
22531     case X86ISD::PSHUFHW:
22532       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22533       break;
22534     }
22535   }
22536   if (V.getValueType() != N.getValueType())
22537     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22538
22539   // Return the new chain to replace N.
22540   return V;
22541 }
22542
22543 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22544 ///
22545 /// We walk up the chain, skipping shuffles of the other half and looking
22546 /// through shuffles which switch halves trying to find a shuffle of the same
22547 /// pair of dwords.
22548 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22549                                         SelectionDAG &DAG,
22550                                         TargetLowering::DAGCombinerInfo &DCI) {
22551   assert(
22552       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22553       "Called with something other than an x86 128-bit half shuffle!");
22554   SDLoc DL(N);
22555   unsigned CombineOpcode = N.getOpcode();
22556
22557   // Walk up a single-use chain looking for a combinable shuffle.
22558   SDValue V = N.getOperand(0);
22559   for (; V.hasOneUse(); V = V.getOperand(0)) {
22560     switch (V.getOpcode()) {
22561     default:
22562       return false; // Nothing combined!
22563
22564     case ISD::BITCAST:
22565       // Skip bitcasts as we always know the type for the target specific
22566       // instructions.
22567       continue;
22568
22569     case X86ISD::PSHUFLW:
22570     case X86ISD::PSHUFHW:
22571       if (V.getOpcode() == CombineOpcode)
22572         break;
22573
22574       // Other-half shuffles are no-ops.
22575       continue;
22576     }
22577     // Break out of the loop if we break out of the switch.
22578     break;
22579   }
22580
22581   if (!V.hasOneUse())
22582     // We fell out of the loop without finding a viable combining instruction.
22583     return false;
22584
22585   // Combine away the bottom node as its shuffle will be accumulated into
22586   // a preceding shuffle.
22587   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22588
22589   // Record the old value.
22590   SDValue Old = V;
22591
22592   // Merge this node's mask and our incoming mask (adjusted to account for all
22593   // the pshufd instructions encountered).
22594   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22595   for (int &M : Mask)
22596     M = VMask[M];
22597   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22598                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22599
22600   // Check that the shuffles didn't cancel each other out. If not, we need to
22601   // combine to the new one.
22602   if (Old != V)
22603     // Replace the combinable shuffle with the combined one, updating all users
22604     // so that we re-evaluate the chain here.
22605     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22606
22607   return true;
22608 }
22609
22610 /// \brief Try to combine x86 target specific shuffles.
22611 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22612                                            TargetLowering::DAGCombinerInfo &DCI,
22613                                            const X86Subtarget *Subtarget) {
22614   SDLoc DL(N);
22615   MVT VT = N.getSimpleValueType();
22616   SmallVector<int, 4> Mask;
22617
22618   switch (N.getOpcode()) {
22619   case X86ISD::PSHUFD:
22620   case X86ISD::PSHUFLW:
22621   case X86ISD::PSHUFHW:
22622     Mask = getPSHUFShuffleMask(N);
22623     assert(Mask.size() == 4);
22624     break;
22625   default:
22626     return SDValue();
22627   }
22628
22629   // Nuke no-op shuffles that show up after combining.
22630   if (isNoopShuffleMask(Mask))
22631     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22632
22633   // Look for simplifications involving one or two shuffle instructions.
22634   SDValue V = N.getOperand(0);
22635   switch (N.getOpcode()) {
22636   default:
22637     break;
22638   case X86ISD::PSHUFLW:
22639   case X86ISD::PSHUFHW:
22640     assert(VT == MVT::v8i16);
22641     (void)VT;
22642
22643     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22644       return SDValue(); // We combined away this shuffle, so we're done.
22645
22646     // See if this reduces to a PSHUFD which is no more expensive and can
22647     // combine with more operations. Note that it has to at least flip the
22648     // dwords as otherwise it would have been removed as a no-op.
22649     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22650       int DMask[] = {0, 1, 2, 3};
22651       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22652       DMask[DOffset + 0] = DOffset + 1;
22653       DMask[DOffset + 1] = DOffset + 0;
22654       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22655       DCI.AddToWorklist(V.getNode());
22656       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22657                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22658       DCI.AddToWorklist(V.getNode());
22659       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22660     }
22661
22662     // Look for shuffle patterns which can be implemented as a single unpack.
22663     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22664     // only works when we have a PSHUFD followed by two half-shuffles.
22665     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22666         (V.getOpcode() == X86ISD::PSHUFLW ||
22667          V.getOpcode() == X86ISD::PSHUFHW) &&
22668         V.getOpcode() != N.getOpcode() &&
22669         V.hasOneUse()) {
22670       SDValue D = V.getOperand(0);
22671       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22672         D = D.getOperand(0);
22673       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22674         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22675         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22676         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22677         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22678         int WordMask[8];
22679         for (int i = 0; i < 4; ++i) {
22680           WordMask[i + NOffset] = Mask[i] + NOffset;
22681           WordMask[i + VOffset] = VMask[i] + VOffset;
22682         }
22683         // Map the word mask through the DWord mask.
22684         int MappedMask[8];
22685         for (int i = 0; i < 8; ++i)
22686           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22687         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22688         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22689         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22690                        std::begin(UnpackLoMask)) ||
22691             std::equal(std::begin(MappedMask), std::end(MappedMask),
22692                        std::begin(UnpackHiMask))) {
22693           // We can replace all three shuffles with an unpack.
22694           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22695           DCI.AddToWorklist(V.getNode());
22696           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22697                                                 : X86ISD::UNPCKH,
22698                              DL, MVT::v8i16, V, V);
22699         }
22700       }
22701     }
22702
22703     break;
22704
22705   case X86ISD::PSHUFD:
22706     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22707       return NewN;
22708
22709     break;
22710   }
22711
22712   return SDValue();
22713 }
22714
22715 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22716 ///
22717 /// We combine this directly on the abstract vector shuffle nodes so it is
22718 /// easier to generically match. We also insert dummy vector shuffle nodes for
22719 /// the operands which explicitly discard the lanes which are unused by this
22720 /// operation to try to flow through the rest of the combiner the fact that
22721 /// they're unused.
22722 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22723   SDLoc DL(N);
22724   EVT VT = N->getValueType(0);
22725
22726   // We only handle target-independent shuffles.
22727   // FIXME: It would be easy and harmless to use the target shuffle mask
22728   // extraction tool to support more.
22729   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22730     return SDValue();
22731
22732   auto *SVN = cast<ShuffleVectorSDNode>(N);
22733   ArrayRef<int> Mask = SVN->getMask();
22734   SDValue V1 = N->getOperand(0);
22735   SDValue V2 = N->getOperand(1);
22736
22737   // We require the first shuffle operand to be the SUB node, and the second to
22738   // be the ADD node.
22739   // FIXME: We should support the commuted patterns.
22740   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22741     return SDValue();
22742
22743   // If there are other uses of these operations we can't fold them.
22744   if (!V1->hasOneUse() || !V2->hasOneUse())
22745     return SDValue();
22746
22747   // Ensure that both operations have the same operands. Note that we can
22748   // commute the FADD operands.
22749   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22750   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22751       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22752     return SDValue();
22753
22754   // We're looking for blends between FADD and FSUB nodes. We insist on these
22755   // nodes being lined up in a specific expected pattern.
22756   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22757         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22758         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22759     return SDValue();
22760
22761   // Only specific types are legal at this point, assert so we notice if and
22762   // when these change.
22763   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22764           VT == MVT::v4f64) &&
22765          "Unknown vector type encountered!");
22766
22767   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22768 }
22769
22770 /// PerformShuffleCombine - Performs several different shuffle combines.
22771 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22772                                      TargetLowering::DAGCombinerInfo &DCI,
22773                                      const X86Subtarget *Subtarget) {
22774   SDLoc dl(N);
22775   SDValue N0 = N->getOperand(0);
22776   SDValue N1 = N->getOperand(1);
22777   EVT VT = N->getValueType(0);
22778
22779   // Don't create instructions with illegal types after legalize types has run.
22780   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22781   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22782     return SDValue();
22783
22784   // If we have legalized the vector types, look for blends of FADD and FSUB
22785   // nodes that we can fuse into an ADDSUB node.
22786   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22787     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22788       return AddSub;
22789
22790   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22791   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22792       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22793     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22794
22795   // During Type Legalization, when promoting illegal vector types,
22796   // the backend might introduce new shuffle dag nodes and bitcasts.
22797   //
22798   // This code performs the following transformation:
22799   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22800   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22801   //
22802   // We do this only if both the bitcast and the BINOP dag nodes have
22803   // one use. Also, perform this transformation only if the new binary
22804   // operation is legal. This is to avoid introducing dag nodes that
22805   // potentially need to be further expanded (or custom lowered) into a
22806   // less optimal sequence of dag nodes.
22807   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22808       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22809       N0.getOpcode() == ISD::BITCAST) {
22810     SDValue BC0 = N0.getOperand(0);
22811     EVT SVT = BC0.getValueType();
22812     unsigned Opcode = BC0.getOpcode();
22813     unsigned NumElts = VT.getVectorNumElements();
22814
22815     if (BC0.hasOneUse() && SVT.isVector() &&
22816         SVT.getVectorNumElements() * 2 == NumElts &&
22817         TLI.isOperationLegal(Opcode, VT)) {
22818       bool CanFold = false;
22819       switch (Opcode) {
22820       default : break;
22821       case ISD::ADD :
22822       case ISD::FADD :
22823       case ISD::SUB :
22824       case ISD::FSUB :
22825       case ISD::MUL :
22826       case ISD::FMUL :
22827         CanFold = true;
22828       }
22829
22830       unsigned SVTNumElts = SVT.getVectorNumElements();
22831       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22832       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22833         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22834       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22835         CanFold = SVOp->getMaskElt(i) < 0;
22836
22837       if (CanFold) {
22838         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22839         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22840         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22841         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22842       }
22843     }
22844   }
22845
22846   // Only handle 128 wide vector from here on.
22847   if (!VT.is128BitVector())
22848     return SDValue();
22849
22850   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22851   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22852   // consecutive, non-overlapping, and in the right order.
22853   SmallVector<SDValue, 16> Elts;
22854   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22855     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22856
22857   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22858   if (LD.getNode())
22859     return LD;
22860
22861   if (isTargetShuffle(N->getOpcode())) {
22862     SDValue Shuffle =
22863         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22864     if (Shuffle.getNode())
22865       return Shuffle;
22866
22867     // Try recursively combining arbitrary sequences of x86 shuffle
22868     // instructions into higher-order shuffles. We do this after combining
22869     // specific PSHUF instruction sequences into their minimal form so that we
22870     // can evaluate how many specialized shuffle instructions are involved in
22871     // a particular chain.
22872     SmallVector<int, 1> NonceMask; // Just a placeholder.
22873     NonceMask.push_back(0);
22874     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22875                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22876                                       DCI, Subtarget))
22877       return SDValue(); // This routine will use CombineTo to replace N.
22878   }
22879
22880   return SDValue();
22881 }
22882
/// PerformTruncateCombine - Combine hook for truncate nodes.
///
/// The body currently performs no transformation: none of the parameters is
/// inspected and an empty SDValue is always returned.
/// NOTE(review): the previous comment described this as converting a truncate
/// of a 256-bit vector to a 128-bit vector into a sequence of vector shuffles;
/// the code below does not implement that.
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget)  {
  return SDValue();
}
22891
22892 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22893 /// specific shuffle of a load can be folded into a single element load.
22894 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22895 /// shuffles have been custom lowered so we need to handle those here.
22896 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22897                                          TargetLowering::DAGCombinerInfo &DCI) {
22898   if (DCI.isBeforeLegalizeOps())
22899     return SDValue();
22900
22901   SDValue InVec = N->getOperand(0);
22902   SDValue EltNo = N->getOperand(1);
22903
22904   if (!isa<ConstantSDNode>(EltNo))
22905     return SDValue();
22906
22907   EVT OriginalVT = InVec.getValueType();
22908
22909   if (InVec.getOpcode() == ISD::BITCAST) {
22910     // Don't duplicate a load with other uses.
22911     if (!InVec.hasOneUse())
22912       return SDValue();
22913     EVT BCVT = InVec.getOperand(0).getValueType();
22914     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22915       return SDValue();
22916     InVec = InVec.getOperand(0);
22917   }
22918
22919   EVT CurrentVT = InVec.getValueType();
22920
22921   if (!isTargetShuffle(InVec.getOpcode()))
22922     return SDValue();
22923
22924   // Don't duplicate a load with other uses.
22925   if (!InVec.hasOneUse())
22926     return SDValue();
22927
22928   SmallVector<int, 16> ShuffleMask;
22929   bool UnaryShuffle;
22930   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22931                             ShuffleMask, UnaryShuffle))
22932     return SDValue();
22933
22934   // Select the input vector, guarding against out of range extract vector.
22935   unsigned NumElems = CurrentVT.getVectorNumElements();
22936   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22937   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22938   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22939                                          : InVec.getOperand(1);
22940
22941   // If inputs to shuffle are the same for both ops, then allow 2 uses
22942   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22943                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22944
22945   if (LdNode.getOpcode() == ISD::BITCAST) {
22946     // Don't duplicate a load with other uses.
22947     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22948       return SDValue();
22949
22950     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22951     LdNode = LdNode.getOperand(0);
22952   }
22953
22954   if (!ISD::isNormalLoad(LdNode.getNode()))
22955     return SDValue();
22956
22957   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22958
22959   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22960     return SDValue();
22961
22962   EVT EltVT = N->getValueType(0);
22963   // If there's a bitcast before the shuffle, check if the load type and
22964   // alignment is valid.
22965   unsigned Align = LN0->getAlignment();
22966   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22967   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22968       EltVT.getTypeForEVT(*DAG.getContext()));
22969
22970   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22971     return SDValue();
22972
22973   // All checks match so transform back to vector_shuffle so that DAG combiner
22974   // can finish the job
22975   SDLoc dl(N);
22976
22977   // Create shuffle node taking into account the case that its a unary shuffle
22978   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22979                                    : InVec.getOperand(1);
22980   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22981                                  InVec.getOperand(0), Shuffle,
22982                                  &ShuffleMask[0]);
22983   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22984   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22985                      EltNo);
22986 }
22987
22988 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22989 /// generation and convert it from being a bunch of shuffles and extracts
22990 /// into a somewhat faster sequence. For i686, the best sequence is apparently
22991 /// storing the value and loading scalars back, while for x64 we should
22992 /// use 64-bit extracts and shifts.
22993 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22994                                          TargetLowering::DAGCombinerInfo &DCI) {
22995   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22996   if (NewOp.getNode())
22997     return NewOp;
22998
22999   SDValue InputVector = N->getOperand(0);
23000
23001   // Detect mmx to i32 conversion through a v2i32 elt extract.
23002   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23003       N->getValueType(0) == MVT::i32 &&
23004       InputVector.getValueType() == MVT::v2i32) {
23005
23006     // The bitcast source is a direct mmx result.
23007     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23008     if (MMXSrc.getValueType() == MVT::x86mmx)
23009       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23010                          N->getValueType(0),
23011                          InputVector.getNode()->getOperand(0));
23012
23013     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23014     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23015     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23016         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23017         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23018         MMXSrcOp.getValueType() == MVT::v1i64 &&
23019         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23020       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23021                          N->getValueType(0),
23022                          MMXSrcOp.getOperand(0));
23023   }
23024
23025   // Only operate on vectors of 4 elements, where the alternative shuffling
23026   // gets to be more expensive.
23027   if (InputVector.getValueType() != MVT::v4i32)
23028     return SDValue();
23029
23030   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23031   // single use which is a sign-extend or zero-extend, and all elements are
23032   // used.
23033   SmallVector<SDNode *, 4> Uses;
23034   unsigned ExtractedElements = 0;
23035   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23036        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23037     if (UI.getUse().getResNo() != InputVector.getResNo())
23038       return SDValue();
23039
23040     SDNode *Extract = *UI;
23041     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23042       return SDValue();
23043
23044     if (Extract->getValueType(0) != MVT::i32)
23045       return SDValue();
23046     if (!Extract->hasOneUse())
23047       return SDValue();
23048     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23049         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23050       return SDValue();
23051     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23052       return SDValue();
23053
23054     // Record which element was extracted.
23055     ExtractedElements |=
23056       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23057
23058     Uses.push_back(Extract);
23059   }
23060
23061   // If not all the elements were used, this may not be worthwhile.
23062   if (ExtractedElements != 15)
23063     return SDValue();
23064
23065   // Ok, we've now decided to do the transformation.
23066   // If 64-bit shifts are legal, use the extract-shift sequence,
23067   // otherwise bounce the vector off the cache.
23068   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23069   SDValue Vals[4];
23070   SDLoc dl(InputVector);
23071
23072   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23073     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23074     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23075     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23076       DAG.getConstant(0, VecIdxTy));
23077     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23078       DAG.getConstant(1, VecIdxTy));
23079
23080     SDValue ShAmt = DAG.getConstant(32,
23081       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23082     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23083     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23084       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23085     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23086     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23087       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23088   } else {
23089     // Store the value to a temporary stack slot.
23090     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23091     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23092       MachinePointerInfo(), false, false, 0);
23093
23094     EVT ElementType = InputVector.getValueType().getVectorElementType();
23095     unsigned EltSize = ElementType.getSizeInBits() / 8;
23096
23097     // Replace each use (extract) with a load of the appropriate element.
23098     for (unsigned i = 0; i < 4; ++i) {
23099       uint64_t Offset = EltSize * i;
23100       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23101
23102       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23103                                        StackPtr, OffsetVal);
23104
23105       // Load the scalar.
23106       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23107                             ScalarAddr, MachinePointerInfo(),
23108                             false, false, false, 0);
23109
23110     }
23111   }
23112
23113   // Replace the extracts
23114   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23115     UE = Uses.end(); UI != UE; ++UI) {
23116     SDNode *Extract = *UI;
23117
23118     SDValue Idx = Extract->getOperand(1);
23119     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23120     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23121   }
23122
23123   // The replacement was made in place; don't return anything.
23124   return SDValue();
23125 }
23126
23127 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23128 static std::pair<unsigned, bool>
23129 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23130                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23131   if (!VT.isVector())
23132     return std::make_pair(0, false);
23133
23134   bool NeedSplit = false;
23135   switch (VT.getSimpleVT().SimpleTy) {
23136   default: return std::make_pair(0, false);
23137   case MVT::v4i64:
23138   case MVT::v2i64:
23139     if (!Subtarget->hasVLX())
23140       return std::make_pair(0, false);
23141     break;
23142   case MVT::v64i8:
23143   case MVT::v32i16:
23144     if (!Subtarget->hasBWI())
23145       return std::make_pair(0, false);
23146     break;
23147   case MVT::v16i32:
23148   case MVT::v8i64:
23149     if (!Subtarget->hasAVX512())
23150       return std::make_pair(0, false);
23151     break;
23152   case MVT::v32i8:
23153   case MVT::v16i16:
23154   case MVT::v8i32:
23155     if (!Subtarget->hasAVX2())
23156       NeedSplit = true;
23157     if (!Subtarget->hasAVX())
23158       return std::make_pair(0, false);
23159     break;
23160   case MVT::v16i8:
23161   case MVT::v8i16:
23162   case MVT::v4i32:
23163     if (!Subtarget->hasSSE2())
23164       return std::make_pair(0, false);
23165   }
23166
23167   // SSE2 has only a small subset of the operations.
23168   bool hasUnsigned = Subtarget->hasSSE41() ||
23169                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23170   bool hasSigned = Subtarget->hasSSE41() ||
23171                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23172
23173   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23174
23175   unsigned Opc = 0;
23176   // Check for x CC y ? x : y.
23177   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23178       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23179     switch (CC) {
23180     default: break;
23181     case ISD::SETULT:
23182     case ISD::SETULE:
23183       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23184     case ISD::SETUGT:
23185     case ISD::SETUGE:
23186       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23187     case ISD::SETLT:
23188     case ISD::SETLE:
23189       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23190     case ISD::SETGT:
23191     case ISD::SETGE:
23192       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23193     }
23194   // Check for x CC y ? y : x -- a min/max with reversed arms.
23195   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23196              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23197     switch (CC) {
23198     default: break;
23199     case ISD::SETULT:
23200     case ISD::SETULE:
23201       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23202     case ISD::SETUGT:
23203     case ISD::SETUGE:
23204       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23205     case ISD::SETLT:
23206     case ISD::SETLE:
23207       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23208     case ISD::SETGT:
23209     case ISD::SETGE:
23210       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23211     }
23212   }
23213
23214   return std::make_pair(Opc, NeedSplit);
23215 }
23216
23217 static SDValue
23218 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23219                                       const X86Subtarget *Subtarget) {
23220   SDLoc dl(N);
23221   SDValue Cond = N->getOperand(0);
23222   SDValue LHS = N->getOperand(1);
23223   SDValue RHS = N->getOperand(2);
23224
23225   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23226     SDValue CondSrc = Cond->getOperand(0);
23227     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23228       Cond = CondSrc->getOperand(0);
23229   }
23230
23231   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23232     return SDValue();
23233
23234   // A vselect where all conditions and data are constants can be optimized into
23235   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23236   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23237       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23238     return SDValue();
23239
23240   unsigned MaskValue = 0;
23241   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23242     return SDValue();
23243
23244   MVT VT = N->getSimpleValueType(0);
23245   unsigned NumElems = VT.getVectorNumElements();
23246   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23247   for (unsigned i = 0; i < NumElems; ++i) {
23248     // Be sure we emit undef where we can.
23249     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23250       ShuffleMask[i] = -1;
23251     else
23252       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23253   }
23254
23255   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23256   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23257     return SDValue();
23258   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23259 }
23260
23261 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23262 /// nodes.
23263 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23264                                     TargetLowering::DAGCombinerInfo &DCI,
23265                                     const X86Subtarget *Subtarget) {
23266   SDLoc DL(N);
23267   SDValue Cond = N->getOperand(0);
23268   // Get the LHS/RHS of the select.
23269   SDValue LHS = N->getOperand(1);
23270   SDValue RHS = N->getOperand(2);
23271   EVT VT = LHS.getValueType();
23272   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23273
23274   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23275   // instructions match the semantics of the common C idiom x<y?x:y but not
23276   // x<=y?x:y, because of how they handle negative zero (which can be
23277   // ignored in unsafe-math mode).
23278   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23279   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23280       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23281       (Subtarget->hasSSE2() ||
23282        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23283     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23284
23285     unsigned Opcode = 0;
23286     // Check for x CC y ? x : y.
23287     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23288         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23289       switch (CC) {
23290       default: break;
23291       case ISD::SETULT:
23292         // Converting this to a min would handle NaNs incorrectly, and swapping
23293         // the operands would cause it to handle comparisons between positive
23294         // and negative zero incorrectly.
23295         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23296           if (!DAG.getTarget().Options.UnsafeFPMath &&
23297               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23298             break;
23299           std::swap(LHS, RHS);
23300         }
23301         Opcode = X86ISD::FMIN;
23302         break;
23303       case ISD::SETOLE:
23304         // Converting this to a min would handle comparisons between positive
23305         // and negative zero incorrectly.
23306         if (!DAG.getTarget().Options.UnsafeFPMath &&
23307             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23308           break;
23309         Opcode = X86ISD::FMIN;
23310         break;
23311       case ISD::SETULE:
23312         // Converting this to a min would handle both negative zeros and NaNs
23313         // incorrectly, but we can swap the operands to fix both.
23314         std::swap(LHS, RHS);
23315       case ISD::SETOLT:
23316       case ISD::SETLT:
23317       case ISD::SETLE:
23318         Opcode = X86ISD::FMIN;
23319         break;
23320
23321       case ISD::SETOGE:
23322         // Converting this to a max would handle comparisons between positive
23323         // and negative zero incorrectly.
23324         if (!DAG.getTarget().Options.UnsafeFPMath &&
23325             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23326           break;
23327         Opcode = X86ISD::FMAX;
23328         break;
23329       case ISD::SETUGT:
23330         // Converting this to a max would handle NaNs incorrectly, and swapping
23331         // the operands would cause it to handle comparisons between positive
23332         // and negative zero incorrectly.
23333         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23334           if (!DAG.getTarget().Options.UnsafeFPMath &&
23335               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23336             break;
23337           std::swap(LHS, RHS);
23338         }
23339         Opcode = X86ISD::FMAX;
23340         break;
23341       case ISD::SETUGE:
23342         // Converting this to a max would handle both negative zeros and NaNs
23343         // incorrectly, but we can swap the operands to fix both.
23344         std::swap(LHS, RHS);
23345       case ISD::SETOGT:
23346       case ISD::SETGT:
23347       case ISD::SETGE:
23348         Opcode = X86ISD::FMAX;
23349         break;
23350       }
23351     // Check for x CC y ? y : x -- a min/max with reversed arms.
23352     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23353                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23354       switch (CC) {
23355       default: break;
23356       case ISD::SETOGE:
23357         // Converting this to a min would handle comparisons between positive
23358         // and negative zero incorrectly, and swapping the operands would
23359         // cause it to handle NaNs incorrectly.
23360         if (!DAG.getTarget().Options.UnsafeFPMath &&
23361             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23362           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23363             break;
23364           std::swap(LHS, RHS);
23365         }
23366         Opcode = X86ISD::FMIN;
23367         break;
23368       case ISD::SETUGT:
23369         // Converting this to a min would handle NaNs incorrectly.
23370         if (!DAG.getTarget().Options.UnsafeFPMath &&
23371             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23372           break;
23373         Opcode = X86ISD::FMIN;
23374         break;
23375       case ISD::SETUGE:
23376         // Converting this to a min would handle both negative zeros and NaNs
23377         // incorrectly, but we can swap the operands to fix both.
23378         std::swap(LHS, RHS);
23379       case ISD::SETOGT:
23380       case ISD::SETGT:
23381       case ISD::SETGE:
23382         Opcode = X86ISD::FMIN;
23383         break;
23384
23385       case ISD::SETULT:
23386         // Converting this to a max would handle NaNs incorrectly.
23387         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23388           break;
23389         Opcode = X86ISD::FMAX;
23390         break;
23391       case ISD::SETOLE:
23392         // Converting this to a max would handle comparisons between positive
23393         // and negative zero incorrectly, and swapping the operands would
23394         // cause it to handle NaNs incorrectly.
23395         if (!DAG.getTarget().Options.UnsafeFPMath &&
23396             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23397           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23398             break;
23399           std::swap(LHS, RHS);
23400         }
23401         Opcode = X86ISD::FMAX;
23402         break;
23403       case ISD::SETULE:
23404         // Converting this to a max would handle both negative zeros and NaNs
23405         // incorrectly, but we can swap the operands to fix both.
23406         std::swap(LHS, RHS);
23407       case ISD::SETOLT:
23408       case ISD::SETLT:
23409       case ISD::SETLE:
23410         Opcode = X86ISD::FMAX;
23411         break;
23412       }
23413     }
23414
23415     if (Opcode)
23416       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23417   }
23418
23419   EVT CondVT = Cond.getValueType();
23420   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23421       CondVT.getVectorElementType() == MVT::i1) {
23422     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23423     // lowering on KNL. In this case we convert it to
23424     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23425     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23426     // Since SKX these selects have a proper lowering.
23427     EVT OpVT = LHS.getValueType();
23428     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23429         (OpVT.getVectorElementType() == MVT::i8 ||
23430          OpVT.getVectorElementType() == MVT::i16) &&
23431         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23432       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23433       DCI.AddToWorklist(Cond.getNode());
23434       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23435     }
23436   }
23437   // If this is a select between two integer constants, try to do some
23438   // optimizations.
23439   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23440     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23441       // Don't do this for crazy integer types.
23442       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23443         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23444         // so that TrueC (the true value) is larger than FalseC.
23445         bool NeedsCondInvert = false;
23446
23447         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23448             // Efficiently invertible.
23449             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23450              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23451               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23452           NeedsCondInvert = true;
23453           std::swap(TrueC, FalseC);
23454         }
23455
23456         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23457         if (FalseC->getAPIntValue() == 0 &&
23458             TrueC->getAPIntValue().isPowerOf2()) {
23459           if (NeedsCondInvert) // Invert the condition if needed.
23460             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23461                                DAG.getConstant(1, Cond.getValueType()));
23462
23463           // Zero extend the condition if needed.
23464           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23465
23466           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23467           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23468                              DAG.getConstant(ShAmt, MVT::i8));
23469         }
23470
23471         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23472         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23473           if (NeedsCondInvert) // Invert the condition if needed.
23474             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23475                                DAG.getConstant(1, Cond.getValueType()));
23476
23477           // Zero extend the condition if needed.
23478           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23479                              FalseC->getValueType(0), Cond);
23480           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23481                              SDValue(FalseC, 0));
23482         }
23483
23484         // Optimize cases that will turn into an LEA instruction.  This requires
23485         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23486         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23487           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23488           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23489
23490           bool isFastMultiplier = false;
23491           if (Diff < 10) {
23492             switch ((unsigned char)Diff) {
23493               default: break;
23494               case 1:  // result = add base, cond
23495               case 2:  // result = lea base(    , cond*2)
23496               case 3:  // result = lea base(cond, cond*2)
23497               case 4:  // result = lea base(    , cond*4)
23498               case 5:  // result = lea base(cond, cond*4)
23499               case 8:  // result = lea base(    , cond*8)
23500               case 9:  // result = lea base(cond, cond*8)
23501                 isFastMultiplier = true;
23502                 break;
23503             }
23504           }
23505
23506           if (isFastMultiplier) {
23507             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23508             if (NeedsCondInvert) // Invert the condition if needed.
23509               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23510                                  DAG.getConstant(1, Cond.getValueType()));
23511
23512             // Zero extend the condition if needed.
23513             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23514                                Cond);
23515             // Scale the condition by the difference.
23516             if (Diff != 1)
23517               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23518                                  DAG.getConstant(Diff, Cond.getValueType()));
23519
23520             // Add the base if non-zero.
23521             if (FalseC->getAPIntValue() != 0)
23522               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23523                                  SDValue(FalseC, 0));
23524             return Cond;
23525           }
23526         }
23527       }
23528   }
23529
23530   // Canonicalize max and min:
23531   // (x > y) ? x : y -> (x >= y) ? x : y
23532   // (x < y) ? x : y -> (x <= y) ? x : y
23533   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23534   // the need for an extra compare
23535   // against zero. e.g.
23536   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23537   // subl   %esi, %edi
23538   // testl  %edi, %edi
23539   // movl   $0, %eax
23540   // cmovgl %edi, %eax
23541   // =>
23542   // xorl   %eax, %eax
23543   // subl   %esi, $edi
23544   // cmovsl %eax, %edi
23545   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23546       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23547       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23548     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23549     switch (CC) {
23550     default: break;
23551     case ISD::SETLT:
23552     case ISD::SETGT: {
23553       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23554       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23555                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23556       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23557     }
23558     }
23559   }
23560
23561   // Early exit check
23562   if (!TLI.isTypeLegal(VT))
23563     return SDValue();
23564
23565   // Match VSELECTs into subs with unsigned saturation.
23566   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23567       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23568       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23569        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23570     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23571
23572     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23573     // left side invert the predicate to simplify logic below.
23574     SDValue Other;
23575     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23576       Other = RHS;
23577       CC = ISD::getSetCCInverse(CC, true);
23578     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23579       Other = LHS;
23580     }
23581
23582     if (Other.getNode() && Other->getNumOperands() == 2 &&
23583         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23584       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23585       SDValue CondRHS = Cond->getOperand(1);
23586
23587       // Look for a general sub with unsigned saturation first.
23588       // x >= y ? x-y : 0 --> subus x, y
23589       // x >  y ? x-y : 0 --> subus x, y
23590       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23591           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23592         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23593
23594       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23595         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23596           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23597             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23598               // If the RHS is a constant we have to reverse the const
23599               // canonicalization.
23600               // x > C-1 ? x+-C : 0 --> subus x, C
23601               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23602                   CondRHSConst->getAPIntValue() ==
23603                       (-OpRHSConst->getAPIntValue() - 1))
23604                 return DAG.getNode(
23605                     X86ISD::SUBUS, DL, VT, OpLHS,
23606                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23607
23608           // Another special case: If C was a sign bit, the sub has been
23609           // canonicalized into a xor.
23610           // FIXME: Would it be better to use computeKnownBits to determine
23611           //        whether it's safe to decanonicalize the xor?
23612           // x s< 0 ? x^C : 0 --> subus x, C
23613           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23614               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23615               OpRHSConst->getAPIntValue().isSignBit())
23616             // Note that we have to rebuild the RHS constant here to ensure we
23617             // don't rely on particular values of undef lanes.
23618             return DAG.getNode(
23619                 X86ISD::SUBUS, DL, VT, OpLHS,
23620                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23621         }
23622     }
23623   }
23624
23625   // Try to match a min/max vector operation.
23626   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23627     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23628     unsigned Opc = ret.first;
23629     bool NeedSplit = ret.second;
23630
23631     if (Opc && NeedSplit) {
23632       unsigned NumElems = VT.getVectorNumElements();
23633       // Extract the LHS vectors
23634       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23635       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23636
23637       // Extract the RHS vectors
23638       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23639       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23640
23641       // Create min/max for each subvector
23642       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23643       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23644
23645       // Merge the result
23646       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23647     } else if (Opc)
23648       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23649   }
23650
23651   // Simplify vector selection if condition value type matches vselect
23652   // operand type
23653   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23654     assert(Cond.getValueType().isVector() &&
23655            "vector select expects a vector selector!");
23656
23657     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23658     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23659
23660     // Try invert the condition if true value is not all 1s and false value
23661     // is not all 0s.
23662     if (!TValIsAllOnes && !FValIsAllZeros &&
23663         // Check if the selector will be produced by CMPP*/PCMP*
23664         Cond.getOpcode() == ISD::SETCC &&
23665         // Check if SETCC has already been promoted
23666         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23667       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23668       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23669
23670       if (TValIsAllZeros || FValIsAllOnes) {
23671         SDValue CC = Cond.getOperand(2);
23672         ISD::CondCode NewCC =
23673           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23674                                Cond.getOperand(0).getValueType().isInteger());
23675         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23676         std::swap(LHS, RHS);
23677         TValIsAllOnes = FValIsAllOnes;
23678         FValIsAllZeros = TValIsAllZeros;
23679       }
23680     }
23681
23682     if (TValIsAllOnes || FValIsAllZeros) {
23683       SDValue Ret;
23684
23685       if (TValIsAllOnes && FValIsAllZeros)
23686         Ret = Cond;
23687       else if (TValIsAllOnes)
23688         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23689                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23690       else if (FValIsAllZeros)
23691         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23692                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23693
23694       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23695     }
23696   }
23697
23698   // If we know that this node is legal then we know that it is going to be
23699   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23700   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23701   // to simplify previous instructions.
23702   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23703       !DCI.isBeforeLegalize() &&
23704       // We explicitly check against v8i16 and v16i16 because, although
23705       // they're marked as Custom, they might only be legal when Cond is a
23706       // build_vector of constants. This will be taken care in a later
23707       // condition.
23708       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23709        VT != MVT::v8i16) &&
23710       // Don't optimize vector of constants. Those are handled by
23711       // the generic code and all the bits must be properly set for
23712       // the generic optimizer.
23713       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23714     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23715
23716     // Don't optimize vector selects that map to mask-registers.
23717     if (BitWidth == 1)
23718       return SDValue();
23719
23720     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23721     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23722
23723     APInt KnownZero, KnownOne;
23724     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23725                                           DCI.isBeforeLegalizeOps());
23726     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23727         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23728                                  TLO)) {
23729       // If we changed the computation somewhere in the DAG, this change
23730       // will affect all users of Cond.
23731       // Make sure it is fine and update all the nodes so that we do not
23732       // use the generic VSELECT anymore. Otherwise, we may perform
23733       // wrong optimizations as we messed up with the actual expectation
23734       // for the vector boolean values.
23735       if (Cond != TLO.Old) {
23736         // Check all uses of that condition operand to check whether it will be
23737         // consumed by non-BLEND instructions, which may depend on all bits are
23738         // set properly.
23739         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23740              I != E; ++I)
23741           if (I->getOpcode() != ISD::VSELECT)
23742             // TODO: Add other opcodes eventually lowered into BLEND.
23743             return SDValue();
23744
23745         // Update all the users of the condition, before committing the change,
23746         // so that the VSELECT optimizations that expect the correct vector
23747         // boolean value will not be triggered.
23748         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23749              I != E; ++I)
23750           DAG.ReplaceAllUsesOfValueWith(
23751               SDValue(*I, 0),
23752               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23753                           Cond, I->getOperand(1), I->getOperand(2)));
23754         DCI.CommitTargetLoweringOpt(TLO);
23755         return SDValue();
23756       }
23757       // At this point, only Cond is changed. Change the condition
23758       // just for N to keep the opportunity to optimize all other
23759       // users their own way.
23760       DAG.ReplaceAllUsesOfValueWith(
23761           SDValue(N, 0),
23762           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23763                       TLO.New, N->getOperand(1), N->getOperand(2)));
23764       return SDValue();
23765     }
23766   }
23767
23768   // We should generate an X86ISD::BLENDI from a vselect if its argument
23769   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23770   // constants. This specific pattern gets generated when we split a
23771   // selector for a 512 bit vector in a machine without AVX512 (but with
23772   // 256-bit vectors), during legalization:
23773   //
23774   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23775   //
23776   // Iff we find this pattern and the build_vectors are built from
23777   // constants, we translate the vselect into a shuffle_vector that we
23778   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23779   if ((N->getOpcode() == ISD::VSELECT ||
23780        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23781       !DCI.isBeforeLegalize()) {
23782     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23783     if (Shuffle.getNode())
23784       return Shuffle;
23785   }
23786
23787   return SDValue();
23788 }
23789
23790 // Check whether a boolean test is testing a boolean value generated by
23791 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23792 // code.
23793 //
23794 // Simplify the following patterns:
23795 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23796 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23797 // to (Op EFLAGS Cond)
23798 //
23799 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23800 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23801 // to (Op EFLAGS !Cond)
23802 //
23803 // where Op could be BRCOND or CMOV.
23804 //
23805 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23806   // Quit if not CMP and SUB with its value result used.
23807   if (Cmp.getOpcode() != X86ISD::CMP &&
23808       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23809       return SDValue();
23810
23811   // Quit if not used as a boolean value.
23812   if (CC != X86::COND_E && CC != X86::COND_NE)
23813     return SDValue();
23814
23815   // Check CMP operands. One of them should be 0 or 1 and the other should be
23816   // an SetCC or extended from it.
23817   SDValue Op1 = Cmp.getOperand(0);
23818   SDValue Op2 = Cmp.getOperand(1);
23819
23820   SDValue SetCC;
23821   const ConstantSDNode* C = nullptr;
23822   bool needOppositeCond = (CC == X86::COND_E);
23823   bool checkAgainstTrue = false; // Is it a comparison against 1?
23824
23825   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23826     SetCC = Op2;
23827   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23828     SetCC = Op1;
23829   else // Quit if all operands are not constants.
23830     return SDValue();
23831
23832   if (C->getZExtValue() == 1) {
23833     needOppositeCond = !needOppositeCond;
23834     checkAgainstTrue = true;
23835   } else if (C->getZExtValue() != 0)
23836     // Quit if the constant is neither 0 or 1.
23837     return SDValue();
23838
23839   bool truncatedToBoolWithAnd = false;
23840   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23841   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23842          SetCC.getOpcode() == ISD::TRUNCATE ||
23843          SetCC.getOpcode() == ISD::AND) {
23844     if (SetCC.getOpcode() == ISD::AND) {
23845       int OpIdx = -1;
23846       ConstantSDNode *CS;
23847       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23848           CS->getZExtValue() == 1)
23849         OpIdx = 1;
23850       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23851           CS->getZExtValue() == 1)
23852         OpIdx = 0;
23853       if (OpIdx == -1)
23854         break;
23855       SetCC = SetCC.getOperand(OpIdx);
23856       truncatedToBoolWithAnd = true;
23857     } else
23858       SetCC = SetCC.getOperand(0);
23859   }
23860
23861   switch (SetCC.getOpcode()) {
23862   case X86ISD::SETCC_CARRY:
23863     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23864     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23865     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23866     // truncated to i1 using 'and'.
23867     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23868       break;
23869     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23870            "Invalid use of SETCC_CARRY!");
23871     // FALL THROUGH
23872   case X86ISD::SETCC:
23873     // Set the condition code or opposite one if necessary.
23874     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23875     if (needOppositeCond)
23876       CC = X86::GetOppositeBranchCondition(CC);
23877     return SetCC.getOperand(1);
23878   case X86ISD::CMOV: {
23879     // Check whether false/true value has canonical one, i.e. 0 or 1.
23880     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23881     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23882     // Quit if true value is not a constant.
23883     if (!TVal)
23884       return SDValue();
23885     // Quit if false value is not a constant.
23886     if (!FVal) {
23887       SDValue Op = SetCC.getOperand(0);
23888       // Skip 'zext' or 'trunc' node.
23889       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23890           Op.getOpcode() == ISD::TRUNCATE)
23891         Op = Op.getOperand(0);
23892       // A special case for rdrand/rdseed, where 0 is set if false cond is
23893       // found.
23894       if ((Op.getOpcode() != X86ISD::RDRAND &&
23895            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23896         return SDValue();
23897     }
23898     // Quit if false value is not the constant 0 or 1.
23899     bool FValIsFalse = true;
23900     if (FVal && FVal->getZExtValue() != 0) {
23901       if (FVal->getZExtValue() != 1)
23902         return SDValue();
23903       // If FVal is 1, opposite cond is needed.
23904       needOppositeCond = !needOppositeCond;
23905       FValIsFalse = false;
23906     }
23907     // Quit if TVal is not the constant opposite of FVal.
23908     if (FValIsFalse && TVal->getZExtValue() != 1)
23909       return SDValue();
23910     if (!FValIsFalse && TVal->getZExtValue() != 0)
23911       return SDValue();
23912     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23913     if (needOppositeCond)
23914       CC = X86::GetOppositeBranchCondition(CC);
23915     return SetCC.getOperand(3);
23916   }
23917   }
23918
23919   return SDValue();
23920 }
23921
23922 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23923 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23924                                   TargetLowering::DAGCombinerInfo &DCI,
23925                                   const X86Subtarget *Subtarget) {
23926   SDLoc DL(N);
23927
23928   // If the flag operand isn't dead, don't touch this CMOV.
23929   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23930     return SDValue();
23931
23932   SDValue FalseOp = N->getOperand(0);
23933   SDValue TrueOp = N->getOperand(1);
23934   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23935   SDValue Cond = N->getOperand(3);
23936
23937   if (CC == X86::COND_E || CC == X86::COND_NE) {
23938     switch (Cond.getOpcode()) {
23939     default: break;
23940     case X86ISD::BSR:
23941     case X86ISD::BSF:
23942       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23943       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23944         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23945     }
23946   }
23947
23948   SDValue Flags;
23949
23950   Flags = checkBoolTestSetCCCombine(Cond, CC);
23951   if (Flags.getNode() &&
23952       // Extra check as FCMOV only supports a subset of X86 cond.
23953       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23954     SDValue Ops[] = { FalseOp, TrueOp,
23955                       DAG.getConstant(CC, MVT::i8), Flags };
23956     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23957   }
23958
23959   // If this is a select between two integer constants, try to do some
23960   // optimizations.  Note that the operands are ordered the opposite of SELECT
23961   // operands.
23962   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23963     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23964       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23965       // larger than FalseC (the false value).
23966       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23967         CC = X86::GetOppositeBranchCondition(CC);
23968         std::swap(TrueC, FalseC);
23969         std::swap(TrueOp, FalseOp);
23970       }
23971
23972       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23973       // This is efficient for any integer data type (including i8/i16) and
23974       // shift amount.
23975       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23976         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23977                            DAG.getConstant(CC, MVT::i8), Cond);
23978
23979         // Zero extend the condition if needed.
23980         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23981
23982         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23983         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23984                            DAG.getConstant(ShAmt, MVT::i8));
23985         if (N->getNumValues() == 2)  // Dead flag value?
23986           return DCI.CombineTo(N, Cond, SDValue());
23987         return Cond;
23988       }
23989
23990       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
23991       // for any integer data type, including i8/i16.
23992       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23993         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23994                            DAG.getConstant(CC, MVT::i8), Cond);
23995
23996         // Zero extend the condition if needed.
23997         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23998                            FalseC->getValueType(0), Cond);
23999         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24000                            SDValue(FalseC, 0));
24001
24002         if (N->getNumValues() == 2)  // Dead flag value?
24003           return DCI.CombineTo(N, Cond, SDValue());
24004         return Cond;
24005       }
24006
24007       // Optimize cases that will turn into an LEA instruction.  This requires
24008       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24009       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24010         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24011         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24012
24013         bool isFastMultiplier = false;
24014         if (Diff < 10) {
24015           switch ((unsigned char)Diff) {
24016           default: break;
24017           case 1:  // result = add base, cond
24018           case 2:  // result = lea base(    , cond*2)
24019           case 3:  // result = lea base(cond, cond*2)
24020           case 4:  // result = lea base(    , cond*4)
24021           case 5:  // result = lea base(cond, cond*4)
24022           case 8:  // result = lea base(    , cond*8)
24023           case 9:  // result = lea base(cond, cond*8)
24024             isFastMultiplier = true;
24025             break;
24026           }
24027         }
24028
24029         if (isFastMultiplier) {
24030           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24031           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24032                              DAG.getConstant(CC, MVT::i8), Cond);
24033           // Zero extend the condition if needed.
24034           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24035                              Cond);
24036           // Scale the condition by the difference.
24037           if (Diff != 1)
24038             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24039                                DAG.getConstant(Diff, Cond.getValueType()));
24040
24041           // Add the base if non-zero.
24042           if (FalseC->getAPIntValue() != 0)
24043             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24044                                SDValue(FalseC, 0));
24045           if (N->getNumValues() == 2)  // Dead flag value?
24046             return DCI.CombineTo(N, Cond, SDValue());
24047           return Cond;
24048         }
24049       }
24050     }
24051   }
24052
24053   // Handle these cases:
24054   //   (select (x != c), e, c) -> select (x != c), e, x),
24055   //   (select (x == c), c, e) -> select (x == c), x, e)
24056   // where the c is an integer constant, and the "select" is the combination
24057   // of CMOV and CMP.
24058   //
24059   // The rationale for this change is that the conditional-move from a constant
24060   // needs two instructions, however, conditional-move from a register needs
24061   // only one instruction.
24062   //
24063   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24064   //  some instruction-combining opportunities. This opt needs to be
24065   //  postponed as late as possible.
24066   //
24067   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24068     // the DCI.xxxx conditions are provided to postpone the optimization as
24069     // late as possible.
24070
24071     ConstantSDNode *CmpAgainst = nullptr;
24072     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24073         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24074         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24075
24076       if (CC == X86::COND_NE &&
24077           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24078         CC = X86::GetOppositeBranchCondition(CC);
24079         std::swap(TrueOp, FalseOp);
24080       }
24081
24082       if (CC == X86::COND_E &&
24083           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24084         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24085                           DAG.getConstant(CC, MVT::i8), Cond };
24086         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24087       }
24088     }
24089   }
24090
24091   return SDValue();
24092 }
24093
24094 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24095                                                 const X86Subtarget *Subtarget) {
24096   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24097   switch (IntNo) {
24098   default: return SDValue();
24099   // SSE/AVX/AVX2 blend intrinsics.
24100   case Intrinsic::x86_avx2_pblendvb:
24101   case Intrinsic::x86_avx2_pblendw:
24102   case Intrinsic::x86_avx2_pblendd_128:
24103   case Intrinsic::x86_avx2_pblendd_256:
24104     // Don't try to simplify this intrinsic if we don't have AVX2.
24105     if (!Subtarget->hasAVX2())
24106       return SDValue();
24107     // FALL-THROUGH
24108   case Intrinsic::x86_avx_blend_pd_256:
24109   case Intrinsic::x86_avx_blend_ps_256:
24110   case Intrinsic::x86_avx_blendv_pd_256:
24111   case Intrinsic::x86_avx_blendv_ps_256:
24112     // Don't try to simplify this intrinsic if we don't have AVX.
24113     if (!Subtarget->hasAVX())
24114       return SDValue();
24115     // FALL-THROUGH
24116   case Intrinsic::x86_sse41_pblendw:
24117   case Intrinsic::x86_sse41_blendpd:
24118   case Intrinsic::x86_sse41_blendps:
24119   case Intrinsic::x86_sse41_blendvps:
24120   case Intrinsic::x86_sse41_blendvpd:
24121   case Intrinsic::x86_sse41_pblendvb: {
24122     SDValue Op0 = N->getOperand(1);
24123     SDValue Op1 = N->getOperand(2);
24124     SDValue Mask = N->getOperand(3);
24125
24126     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24127     if (!Subtarget->hasSSE41())
24128       return SDValue();
24129
24130     // fold (blend A, A, Mask) -> A
24131     if (Op0 == Op1)
24132       return Op0;
24133     // fold (blend A, B, allZeros) -> A
24134     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24135       return Op0;
24136     // fold (blend A, B, allOnes) -> B
24137     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24138       return Op1;
24139
24140     // Simplify the case where the mask is a constant i32 value.
24141     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24142       if (C->isNullValue())
24143         return Op0;
24144       if (C->isAllOnesValue())
24145         return Op1;
24146     }
24147
24148     return SDValue();
24149   }
24150
24151   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24152   case Intrinsic::x86_sse2_psrai_w:
24153   case Intrinsic::x86_sse2_psrai_d:
24154   case Intrinsic::x86_avx2_psrai_w:
24155   case Intrinsic::x86_avx2_psrai_d:
24156   case Intrinsic::x86_sse2_psra_w:
24157   case Intrinsic::x86_sse2_psra_d:
24158   case Intrinsic::x86_avx2_psra_w:
24159   case Intrinsic::x86_avx2_psra_d: {
24160     SDValue Op0 = N->getOperand(1);
24161     SDValue Op1 = N->getOperand(2);
24162     EVT VT = Op0.getValueType();
24163     assert(VT.isVector() && "Expected a vector type!");
24164
24165     if (isa<BuildVectorSDNode>(Op1))
24166       Op1 = Op1.getOperand(0);
24167
24168     if (!isa<ConstantSDNode>(Op1))
24169       return SDValue();
24170
24171     EVT SVT = VT.getVectorElementType();
24172     unsigned SVTBits = SVT.getSizeInBits();
24173
24174     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24175     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24176     uint64_t ShAmt = C.getZExtValue();
24177
24178     // Don't try to convert this shift into a ISD::SRA if the shift
24179     // count is bigger than or equal to the element size.
24180     if (ShAmt >= SVTBits)
24181       return SDValue();
24182
24183     // Trivial case: if the shift count is zero, then fold this
24184     // into the first operand.
24185     if (ShAmt == 0)
24186       return Op0;
24187
24188     // Replace this packed shift intrinsic with a target independent
24189     // shift dag node.
24190     SDValue Splat = DAG.getConstant(C, VT);
24191     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24192   }
24193   }
24194 }
24195
24196 /// PerformMulCombine - Optimize a single multiply with constant into two
24197 /// in order to implement it with two cheaper instructions, e.g.
24198 /// LEA + SHL, LEA + LEA.
24199 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24200                                  TargetLowering::DAGCombinerInfo &DCI) {
24201   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24202     return SDValue();
24203
24204   EVT VT = N->getValueType(0);
24205   if (VT != MVT::i64 && VT != MVT::i32)
24206     return SDValue();
24207
24208   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24209   if (!C)
24210     return SDValue();
24211   uint64_t MulAmt = C->getZExtValue();
24212   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24213     return SDValue();
24214
24215   uint64_t MulAmt1 = 0;
24216   uint64_t MulAmt2 = 0;
24217   if ((MulAmt % 9) == 0) {
24218     MulAmt1 = 9;
24219     MulAmt2 = MulAmt / 9;
24220   } else if ((MulAmt % 5) == 0) {
24221     MulAmt1 = 5;
24222     MulAmt2 = MulAmt / 5;
24223   } else if ((MulAmt % 3) == 0) {
24224     MulAmt1 = 3;
24225     MulAmt2 = MulAmt / 3;
24226   }
24227   if (MulAmt2 &&
24228       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24229     SDLoc DL(N);
24230
24231     if (isPowerOf2_64(MulAmt2) &&
24232         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24233       // If second multiplifer is pow2, issue it first. We want the multiply by
24234       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24235       // is an add.
24236       std::swap(MulAmt1, MulAmt2);
24237
24238     SDValue NewMul;
24239     if (isPowerOf2_64(MulAmt1))
24240       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24241                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24242     else
24243       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24244                            DAG.getConstant(MulAmt1, VT));
24245
24246     if (isPowerOf2_64(MulAmt2))
24247       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24248                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24249     else
24250       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24251                            DAG.getConstant(MulAmt2, VT));
24252
24253     // Do not add new nodes to DAG combiner worklist.
24254     DCI.CombineTo(N, NewMul, false);
24255   }
24256   return SDValue();
24257 }
24258
24259 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24260   SDValue N0 = N->getOperand(0);
24261   SDValue N1 = N->getOperand(1);
24262   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24263   EVT VT = N0.getValueType();
24264
24265   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24266   // since the result of setcc_c is all zero's or all ones.
24267   if (VT.isInteger() && !VT.isVector() &&
24268       N1C && N0.getOpcode() == ISD::AND &&
24269       N0.getOperand(1).getOpcode() == ISD::Constant) {
24270     SDValue N00 = N0.getOperand(0);
24271     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24272         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24273           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24274          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24275       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24276       APInt ShAmt = N1C->getAPIntValue();
24277       Mask = Mask.shl(ShAmt);
24278       if (Mask != 0)
24279         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24280                            N00, DAG.getConstant(Mask, VT));
24281     }
24282   }
24283
24284   // Hardware support for vector shifts is sparse which makes us scalarize the
24285   // vector operations in many cases. Also, on sandybridge ADD is faster than
24286   // shl.
24287   // (shl V, 1) -> add V,V
24288   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24289     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24290       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24291       // We shift all of the values by one. In many cases we do not have
24292       // hardware support for this operation. This is better expressed as an ADD
24293       // of two values.
24294       if (N1SplatC->getZExtValue() == 1)
24295         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24296     }
24297
24298   return SDValue();
24299 }
24300
24301 /// \brief Returns a vector of 0s if the node in input is a vector logical
24302 /// shift by a constant amount which is known to be bigger than or equal
24303 /// to the vector element size in bits.
24304 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24305                                       const X86Subtarget *Subtarget) {
24306   EVT VT = N->getValueType(0);
24307
24308   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24309       (!Subtarget->hasInt256() ||
24310        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24311     return SDValue();
24312
24313   SDValue Amt = N->getOperand(1);
24314   SDLoc DL(N);
24315   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24316     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24317       APInt ShiftAmt = AmtSplat->getAPIntValue();
24318       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24319
24320       // SSE2/AVX2 logical shifts always return a vector of 0s
24321       // if the shift amount is bigger than or equal to
24322       // the element size. The constant shift amount will be
24323       // encoded as a 8-bit immediate.
24324       if (ShiftAmt.trunc(8).uge(MaxAmount))
24325         return getZeroVector(VT, Subtarget, DAG, DL);
24326     }
24327
24328   return SDValue();
24329 }
24330
24331 /// PerformShiftCombine - Combine shifts.
24332 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24333                                    TargetLowering::DAGCombinerInfo &DCI,
24334                                    const X86Subtarget *Subtarget) {
24335   if (N->getOpcode() == ISD::SHL) {
24336     SDValue V = PerformSHLCombine(N, DAG);
24337     if (V.getNode()) return V;
24338   }
24339
24340   if (N->getOpcode() != ISD::SRA) {
24341     // Try to fold this logical shift into a zero vector.
24342     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24343     if (V.getNode()) return V;
24344   }
24345
24346   return SDValue();
24347 }
24348
24349 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24350 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24351 // and friends.  Likewise for OR -> CMPNEQSS.
24352 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24353                             TargetLowering::DAGCombinerInfo &DCI,
24354                             const X86Subtarget *Subtarget) {
24355   unsigned opcode;
24356
24357   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24358   // we're requiring SSE2 for both.
24359   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24360     SDValue N0 = N->getOperand(0);
24361     SDValue N1 = N->getOperand(1);
24362     SDValue CMP0 = N0->getOperand(1);
24363     SDValue CMP1 = N1->getOperand(1);
24364     SDLoc DL(N);
24365
24366     // The SETCCs should both refer to the same CMP.
24367     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24368       return SDValue();
24369
24370     SDValue CMP00 = CMP0->getOperand(0);
24371     SDValue CMP01 = CMP0->getOperand(1);
24372     EVT     VT    = CMP00.getValueType();
24373
24374     if (VT == MVT::f32 || VT == MVT::f64) {
24375       bool ExpectingFlags = false;
24376       // Check for any users that want flags:
24377       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24378            !ExpectingFlags && UI != UE; ++UI)
24379         switch (UI->getOpcode()) {
24380         default:
24381         case ISD::BR_CC:
24382         case ISD::BRCOND:
24383         case ISD::SELECT:
24384           ExpectingFlags = true;
24385           break;
24386         case ISD::CopyToReg:
24387         case ISD::SIGN_EXTEND:
24388         case ISD::ZERO_EXTEND:
24389         case ISD::ANY_EXTEND:
24390           break;
24391         }
24392
24393       if (!ExpectingFlags) {
24394         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24395         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24396
24397         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24398           X86::CondCode tmp = cc0;
24399           cc0 = cc1;
24400           cc1 = tmp;
24401         }
24402
24403         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24404             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24405           // FIXME: need symbolic constants for these magic numbers.
24406           // See X86ATTInstPrinter.cpp:printSSECC().
24407           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24408           if (Subtarget->hasAVX512()) {
24409             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24410                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24411             if (N->getValueType(0) != MVT::i1)
24412               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24413                                  FSetCC);
24414             return FSetCC;
24415           }
24416           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24417                                               CMP00.getValueType(), CMP00, CMP01,
24418                                               DAG.getConstant(x86cc, MVT::i8));
24419
24420           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24421           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24422
24423           if (is64BitFP && !Subtarget->is64Bit()) {
24424             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24425             // 64-bit integer, since that's not a legal type. Since
24426             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24427             // bits, but can do this little dance to extract the lowest 32 bits
24428             // and work with those going forward.
24429             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24430                                            OnesOrZeroesF);
24431             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24432                                            Vector64);
24433             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24434                                         Vector32, DAG.getIntPtrConstant(0));
24435             IntVT = MVT::i32;
24436           }
24437
24438           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24439           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24440                                       DAG.getConstant(1, IntVT));
24441           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24442           return OneBitOfTruth;
24443         }
24444       }
24445     }
24446   }
24447   return SDValue();
24448 }
24449
24450 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24451 /// so it can be folded inside ANDNP.
24452 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24453   EVT VT = N->getValueType(0);
24454
24455   // Match direct AllOnes for 128 and 256-bit vectors
24456   if (ISD::isBuildVectorAllOnes(N))
24457     return true;
24458
24459   // Look through a bit convert.
24460   if (N->getOpcode() == ISD::BITCAST)
24461     N = N->getOperand(0).getNode();
24462
24463   // Sometimes the operand may come from a insert_subvector building a 256-bit
24464   // allones vector
24465   if (VT.is256BitVector() &&
24466       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24467     SDValue V1 = N->getOperand(0);
24468     SDValue V2 = N->getOperand(1);
24469
24470     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24471         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24472         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24473         ISD::isBuildVectorAllOnes(V2.getNode()))
24474       return true;
24475   }
24476
24477   return false;
24478 }
24479
24480 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24481 // register. In most cases we actually compare or select YMM-sized registers
24482 // and mixing the two types creates horrible code. This method optimizes
24483 // some of the transition sequences.
24484 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24485                                  TargetLowering::DAGCombinerInfo &DCI,
24486                                  const X86Subtarget *Subtarget) {
24487   EVT VT = N->getValueType(0);
24488   if (!VT.is256BitVector())
24489     return SDValue();
24490
24491   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24492           N->getOpcode() == ISD::ZERO_EXTEND ||
24493           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24494
24495   SDValue Narrow = N->getOperand(0);
24496   EVT NarrowVT = Narrow->getValueType(0);
24497   if (!NarrowVT.is128BitVector())
24498     return SDValue();
24499
24500   if (Narrow->getOpcode() != ISD::XOR &&
24501       Narrow->getOpcode() != ISD::AND &&
24502       Narrow->getOpcode() != ISD::OR)
24503     return SDValue();
24504
24505   SDValue N0  = Narrow->getOperand(0);
24506   SDValue N1  = Narrow->getOperand(1);
24507   SDLoc DL(Narrow);
24508
24509   // The Left side has to be a trunc.
24510   if (N0.getOpcode() != ISD::TRUNCATE)
24511     return SDValue();
24512
24513   // The type of the truncated inputs.
24514   EVT WideVT = N0->getOperand(0)->getValueType(0);
24515   if (WideVT != VT)
24516     return SDValue();
24517
24518   // The right side has to be a 'trunc' or a constant vector.
24519   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24520   ConstantSDNode *RHSConstSplat = nullptr;
24521   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24522     RHSConstSplat = RHSBV->getConstantSplatNode();
24523   if (!RHSTrunc && !RHSConstSplat)
24524     return SDValue();
24525
24526   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24527
24528   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24529     return SDValue();
24530
24531   // Set N0 and N1 to hold the inputs to the new wide operation.
24532   N0 = N0->getOperand(0);
24533   if (RHSConstSplat) {
24534     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24535                      SDValue(RHSConstSplat, 0));
24536     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24537     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24538   } else if (RHSTrunc) {
24539     N1 = N1->getOperand(0);
24540   }
24541
24542   // Generate the wide operation.
24543   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24544   unsigned Opcode = N->getOpcode();
24545   switch (Opcode) {
24546   case ISD::ANY_EXTEND:
24547     return Op;
24548   case ISD::ZERO_EXTEND: {
24549     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24550     APInt Mask = APInt::getAllOnesValue(InBits);
24551     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24552     return DAG.getNode(ISD::AND, DL, VT,
24553                        Op, DAG.getConstant(Mask, VT));
24554   }
24555   case ISD::SIGN_EXTEND:
24556     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24557                        Op, DAG.getValueType(NarrowVT));
24558   default:
24559     llvm_unreachable("Unexpected opcode");
24560   }
24561 }
24562
24563 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24564                                  TargetLowering::DAGCombinerInfo &DCI,
24565                                  const X86Subtarget *Subtarget) {
24566   EVT VT = N->getValueType(0);
24567   if (DCI.isBeforeLegalizeOps())
24568     return SDValue();
24569
24570   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24571   if (R.getNode())
24572     return R;
24573
24574   // Create BEXTR instructions
24575   // BEXTR is ((X >> imm) & (2**size-1))
24576   if (VT == MVT::i32 || VT == MVT::i64) {
24577     SDValue N0 = N->getOperand(0);
24578     SDValue N1 = N->getOperand(1);
24579     SDLoc DL(N);
24580
24581     // Check for BEXTR.
24582     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24583         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24584       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24585       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24586       if (MaskNode && ShiftNode) {
24587         uint64_t Mask = MaskNode->getZExtValue();
24588         uint64_t Shift = ShiftNode->getZExtValue();
24589         if (isMask_64(Mask)) {
24590           uint64_t MaskSize = CountPopulation_64(Mask);
24591           if (Shift + MaskSize <= VT.getSizeInBits())
24592             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24593                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24594         }
24595       }
24596     } // BEXTR
24597
24598     return SDValue();
24599   }
24600
24601   // Want to form ANDNP nodes:
24602   // 1) In the hopes of then easily combining them with OR and AND nodes
24603   //    to form PBLEND/PSIGN.
24604   // 2) To match ANDN packed intrinsics
24605   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24606     return SDValue();
24607
24608   SDValue N0 = N->getOperand(0);
24609   SDValue N1 = N->getOperand(1);
24610   SDLoc DL(N);
24611
24612   // Check LHS for vnot
24613   if (N0.getOpcode() == ISD::XOR &&
24614       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24615       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24616     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24617
24618   // Check RHS for vnot
24619   if (N1.getOpcode() == ISD::XOR &&
24620       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24621       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24622     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24623
24624   return SDValue();
24625 }
24626
/// PerformOrCombine - Target-specific DAG combines for OR nodes. Only runs
/// once operations have been legalized. In order it tries:
///   * CMPEQCombine;
///   * for v2i64/v4i64: or(and(m, y), andnp(m, x)), with m an arithmetic
///     shift right by (element bits - 1), into PSIGN or a byte VSELECT;
///   * for i16/i32/i64: or(shl, srl) with complementary shift amounts into
///     SHLD/SHRD.
/// Returns an empty SDValue when nothing applies.
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // look for psign/blend
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    // Requires SSSE3, and additionally Int256 for the 256-bit type.
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
      return SDValue();

    // Canonicalize pandn to RHS
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X    = N1.getOperand(1);
      SDValue Y;
      // Y is whichever AND operand is not the mask.
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check that the mask appeared in both the AND and the ANDNP; if it
      // did not, Y was never set and there is nothing to combine.
      if (!Y.getNode())
        return SDValue();

      // Look through a bitcast on each of Mask, X and Y, if present.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      // ~0 means "no shift amount found"; (~0 + 1) wraps to 0 and therefore
      // never equals EltBits in the check below.
      unsigned SraAmt = ~0;
      if (Mask.getOpcode() == ISD::SRA) {
        // Generic SRA: the amount must be a constant splat build_vector.
        if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
          if (auto *AmtConst = AmtBV->getConstantSplatNode())
            SraAmt = AmtConst->getZExtValue();
      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
        // X86 immediate shift: the amount is operand 1 as a scalar constant.
        SDValue SraC = Mask.getOperand(1);
        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
      }
      // The shift must be by (element bits - 1).
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      SDLoc DL(N);

      // Now we know we at least have a pblendvb with the mask val.  See if
      // we can form a psignb/w/d.
      // psign = x.type == y.type == mask.type && y = sub(0, x);
      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
               "Unsupported VT for PSIGN");
        // PSIGN takes the un-shifted source of the mask (the SRA's operand).
        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
      }
      // PBLENDVB only available on SSE 4.1
      if (!Subtarget->hasSSE41())
        return SDValue();

      // Do the blend on bytes: v32i8 for the 256-bit type, v16i8 otherwise.
      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
    }
  }

  // The remaining combine applies to 16/32/64-bit scalars only.
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget->isSHLDSlow())
    return SDValue();

  // Canonicalize so that N0 is the SHL and N1 is the SRL.
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  // Both shifts must have this OR as their only user.
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  // Both shift amounts must be i8; look through a truncate on each.
  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  // If the SHL amount is the subtraction, switch to SHRD and swap the
  // operands and amounts so that ShAmt1 is the SUB in the check below.
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    // Variable amount: ShAmt1 must be (Bits - ShAmt0), with an optional
    // truncate around the subtrahend.
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    // Constant amounts: the two constants must add up to the bit width.
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
  }

  return SDValue();
}
24787
24788 // Generate NEG and CMOV for integer abs.
24789 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24790   EVT VT = N->getValueType(0);
24791
24792   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24793   // 8-bit integer abs to NEG and CMOV.
24794   if (VT.isInteger() && VT.getSizeInBits() == 8)
24795     return SDValue();
24796
24797   SDValue N0 = N->getOperand(0);
24798   SDValue N1 = N->getOperand(1);
24799   SDLoc DL(N);
24800
24801   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24802   // and change it to SUB and CMOV.
24803   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24804       N0.getOpcode() == ISD::ADD &&
24805       N0.getOperand(1) == N1 &&
24806       N1.getOpcode() == ISD::SRA &&
24807       N1.getOperand(0) == N0.getOperand(0))
24808     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24809       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24810         // Generate SUB & CMOV.
24811         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24812                                   DAG.getConstant(0, VT), N0.getOperand(0));
24813
24814         SDValue Ops[] = { N0.getOperand(0), Neg,
24815                           DAG.getConstant(X86::COND_GE, MVT::i8),
24816                           SDValue(Neg.getNode(), 1) };
24817         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24818       }
24819   return SDValue();
24820 }
24821
24822 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24823 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24824                                  TargetLowering::DAGCombinerInfo &DCI,
24825                                  const X86Subtarget *Subtarget) {
24826   if (DCI.isBeforeLegalizeOps())
24827     return SDValue();
24828
24829   if (Subtarget->hasCMov()) {
24830     SDValue RV = performIntegerAbsCombine(N, DAG);
24831     if (RV.getNode())
24832       return RV;
24833   }
24834
24835   return SDValue();
24836 }
24837
24838 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24839 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24840                                   TargetLowering::DAGCombinerInfo &DCI,
24841                                   const X86Subtarget *Subtarget) {
24842   LoadSDNode *Ld = cast<LoadSDNode>(N);
24843   EVT RegVT = Ld->getValueType(0);
24844   EVT MemVT = Ld->getMemoryVT();
24845   SDLoc dl(Ld);
24846   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24847
24848   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24849   // into two 16-byte operations.
24850   ISD::LoadExtType Ext = Ld->getExtensionType();
24851   unsigned Alignment = Ld->getAlignment();
24852   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24853   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24854       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24855     unsigned NumElems = RegVT.getVectorNumElements();
24856     if (NumElems < 2)
24857       return SDValue();
24858
24859     SDValue Ptr = Ld->getBasePtr();
24860     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24861
24862     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24863                                   NumElems/2);
24864     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24865                                 Ld->getPointerInfo(), Ld->isVolatile(),
24866                                 Ld->isNonTemporal(), Ld->isInvariant(),
24867                                 Alignment);
24868     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24869     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24870                                 Ld->getPointerInfo(), Ld->isVolatile(),
24871                                 Ld->isNonTemporal(), Ld->isInvariant(),
24872                                 std::min(16U, Alignment));
24873     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24874                              Load1.getValue(1),
24875                              Load2.getValue(1));
24876
24877     SDValue NewVec = DAG.getUNDEF(RegVT);
24878     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24879     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24880     return DCI.CombineTo(N, NewVec, TF, true);
24881   }
24882
24883   return SDValue();
24884 }
24885
24886 /// PerformMLOADCombine - Resolve extending loads
24887 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24888                                    TargetLowering::DAGCombinerInfo &DCI,
24889                                    const X86Subtarget *Subtarget) {
24890   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24891   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24892     return SDValue();
24893
24894   EVT VT = Mld->getValueType(0);
24895   unsigned NumElems = VT.getVectorNumElements();
24896   EVT LdVT = Mld->getMemoryVT();
24897   SDLoc dl(Mld);
24898
24899   assert(LdVT != VT && "Cannot extend to the same type");
24900   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24901   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24902   // From, To sizes and ElemCount must be pow of two
24903   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24904     "Unexpected size for extending masked load");
24905
24906   unsigned SizeRatio  = ToSz / FromSz;
24907   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24908
24909   // Create a type on which we perform the shuffle
24910   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24911           LdVT.getScalarType(), NumElems*SizeRatio);
24912   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24913
24914   // Convert Src0 value
24915   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24916   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24917     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24918     for (unsigned i = 0; i != NumElems; ++i)
24919       ShuffleVec[i] = i * SizeRatio;
24920
24921     // Can't shuffle using an illegal type.
24922     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24923             && "WideVecVT should be legal");
24924     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24925                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24926   }
24927   // Prepare the new mask
24928   SDValue NewMask;
24929   SDValue Mask = Mld->getMask();
24930   if (Mask.getValueType() == VT) {
24931     // Mask and original value have the same type
24932     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24933     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24934     for (unsigned i = 0; i != NumElems; ++i)
24935       ShuffleVec[i] = i * SizeRatio;
24936     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24937       ShuffleVec[i] = NumElems*SizeRatio;
24938     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24939                                    DAG.getConstant(0, WideVecVT),
24940                                    &ShuffleVec[0]);
24941   }
24942   else {
24943     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24944     unsigned WidenNumElts = NumElems*SizeRatio;
24945     unsigned MaskNumElts = VT.getVectorNumElements();
24946     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24947                                      WidenNumElts);
24948
24949     unsigned NumConcat = WidenNumElts / MaskNumElts;
24950     SmallVector<SDValue, 16> Ops(NumConcat);
24951     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24952     Ops[0] = Mask;
24953     for (unsigned i = 1; i != NumConcat; ++i)
24954       Ops[i] = ZeroVal;
24955
24956     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24957   }
24958
24959   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24960                                      Mld->getBasePtr(), NewMask, WideSrc0,
24961                                      Mld->getMemoryVT(), Mld->getMemOperand(),
24962                                      ISD::NON_EXTLOAD);
24963   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24964   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24965
24966 }
24967 /// PerformMSTORECombine - Resolve truncating stores
24968 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24969                                     const X86Subtarget *Subtarget) {
24970   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24971   if (!Mst->isTruncatingStore())
24972     return SDValue();
24973
24974   EVT VT = Mst->getValue().getValueType();
24975   unsigned NumElems = VT.getVectorNumElements();
24976   EVT StVT = Mst->getMemoryVT();
24977   SDLoc dl(Mst);
24978
24979   assert(StVT != VT && "Cannot truncate to the same type");
24980   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24981   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24982
24983   // From, To sizes and ElemCount must be pow of two
24984   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24985     "Unexpected size for truncating masked store");
24986   // We are going to use the original vector elt for storing.
24987   // Accumulated smaller vector elements must be a multiple of the store size.
24988   assert (((NumElems * FromSz) % ToSz) == 0 &&
24989           "Unexpected ratio for truncating masked store");
24990
24991   unsigned SizeRatio  = FromSz / ToSz;
24992   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24993
24994   // Create a type on which we perform the shuffle
24995   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24996           StVT.getScalarType(), NumElems*SizeRatio);
24997
24998   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24999
25000   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25001   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25002   for (unsigned i = 0; i != NumElems; ++i)
25003     ShuffleVec[i] = i * SizeRatio;
25004
25005   // Can't shuffle using an illegal type.
25006   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25007           && "WideVecVT should be legal");
25008
25009   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25010                                         DAG.getUNDEF(WideVecVT),
25011                                         &ShuffleVec[0]);
25012
25013   SDValue NewMask;
25014   SDValue Mask = Mst->getMask();
25015   if (Mask.getValueType() == VT) {
25016     // Mask and original value have the same type
25017     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25018     for (unsigned i = 0; i != NumElems; ++i)
25019       ShuffleVec[i] = i * SizeRatio;
25020     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25021       ShuffleVec[i] = NumElems*SizeRatio;
25022     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25023                                    DAG.getConstant(0, WideVecVT),
25024                                    &ShuffleVec[0]);
25025   }
25026   else {
25027     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25028     unsigned WidenNumElts = NumElems*SizeRatio;
25029     unsigned MaskNumElts = VT.getVectorNumElements();
25030     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25031                                      WidenNumElts);
25032
25033     unsigned NumConcat = WidenNumElts / MaskNumElts;
25034     SmallVector<SDValue, 16> Ops(NumConcat);
25035     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25036     Ops[0] = Mask;
25037     for (unsigned i = 1; i != NumConcat; ++i)
25038       Ops[i] = ZeroVal;
25039
25040     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25041   }
25042
25043   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25044                             NewMask, StVT, Mst->getMemOperand(), false);
25045 }
25046 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25047 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25048                                    const X86Subtarget *Subtarget) {
25049   StoreSDNode *St = cast<StoreSDNode>(N);
25050   EVT VT = St->getValue().getValueType();
25051   EVT StVT = St->getMemoryVT();
25052   SDLoc dl(St);
25053   SDValue StoredVal = St->getOperand(1);
25054   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25055
25056   // If we are saving a concatenation of two XMM registers and 32-byte stores
25057   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25058   unsigned Alignment = St->getAlignment();
25059   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25060   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25061       StVT == VT && !IsAligned) {
25062     unsigned NumElems = VT.getVectorNumElements();
25063     if (NumElems < 2)
25064       return SDValue();
25065
25066     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25067     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25068
25069     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25070     SDValue Ptr0 = St->getBasePtr();
25071     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25072
25073     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25074                                 St->getPointerInfo(), St->isVolatile(),
25075                                 St->isNonTemporal(), Alignment);
25076     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25077                                 St->getPointerInfo(), St->isVolatile(),
25078                                 St->isNonTemporal(),
25079                                 std::min(16U, Alignment));
25080     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25081   }
25082
25083   // Optimize trunc store (of multiple scalars) to shuffle and store.
25084   // First, pack all of the elements in one place. Next, store to memory
25085   // in fewer chunks.
25086   if (St->isTruncatingStore() && VT.isVector()) {
25087     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25088     unsigned NumElems = VT.getVectorNumElements();
25089     assert(StVT != VT && "Cannot truncate to the same type");
25090     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25091     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25092
25093     // From, To sizes and ElemCount must be pow of two
25094     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25095     // We are going to use the original vector elt for storing.
25096     // Accumulated smaller vector elements must be a multiple of the store size.
25097     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25098
25099     unsigned SizeRatio  = FromSz / ToSz;
25100
25101     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25102
25103     // Create a type on which we perform the shuffle
25104     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25105             StVT.getScalarType(), NumElems*SizeRatio);
25106
25107     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25108
25109     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25110     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25111     for (unsigned i = 0; i != NumElems; ++i)
25112       ShuffleVec[i] = i * SizeRatio;
25113
25114     // Can't shuffle using an illegal type.
25115     if (!TLI.isTypeLegal(WideVecVT))
25116       return SDValue();
25117
25118     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25119                                          DAG.getUNDEF(WideVecVT),
25120                                          &ShuffleVec[0]);
25121     // At this point all of the data is stored at the bottom of the
25122     // register. We now need to save it to mem.
25123
25124     // Find the largest store unit
25125     MVT StoreType = MVT::i8;
25126     for (MVT Tp : MVT::integer_valuetypes()) {
25127       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25128         StoreType = Tp;
25129     }
25130
25131     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25132     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25133         (64 <= NumElems * ToSz))
25134       StoreType = MVT::f64;
25135
25136     // Bitcast the original vector into a vector of store-size units
25137     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25138             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25139     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25140     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25141     SmallVector<SDValue, 8> Chains;
25142     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25143                                         TLI.getPointerTy());
25144     SDValue Ptr = St->getBasePtr();
25145
25146     // Perform one or more big stores into memory.
25147     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25148       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25149                                    StoreType, ShuffWide,
25150                                    DAG.getIntPtrConstant(i));
25151       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25152                                 St->getPointerInfo(), St->isVolatile(),
25153                                 St->isNonTemporal(), St->getAlignment());
25154       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25155       Chains.push_back(Ch);
25156     }
25157
25158     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25159   }
25160
25161   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25162   // the FP state in cases where an emms may be missing.
25163   // A preferable solution to the general problem is to figure out the right
25164   // places to insert EMMS.  This qualifies as a quick hack.
25165
25166   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25167   if (VT.getSizeInBits() != 64)
25168     return SDValue();
25169
25170   const Function *F = DAG.getMachineFunction().getFunction();
25171   bool NoImplicitFloatOps = F->getAttributes().
25172     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25173   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25174                      && Subtarget->hasSSE2();
25175   if ((VT.isVector() ||
25176        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25177       isa<LoadSDNode>(St->getValue()) &&
25178       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25179       St->getChain().hasOneUse() && !St->isVolatile()) {
25180     SDNode* LdVal = St->getValue().getNode();
25181     LoadSDNode *Ld = nullptr;
25182     int TokenFactorIndex = -1;
25183     SmallVector<SDValue, 8> Ops;
25184     SDNode* ChainVal = St->getChain().getNode();
25185     // Must be a store of a load.  We currently handle two cases:  the load
25186     // is a direct child, and it's under an intervening TokenFactor.  It is
25187     // possible to dig deeper under nested TokenFactors.
25188     if (ChainVal == LdVal)
25189       Ld = cast<LoadSDNode>(St->getChain());
25190     else if (St->getValue().hasOneUse() &&
25191              ChainVal->getOpcode() == ISD::TokenFactor) {
25192       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25193         if (ChainVal->getOperand(i).getNode() == LdVal) {
25194           TokenFactorIndex = i;
25195           Ld = cast<LoadSDNode>(St->getValue());
25196         } else
25197           Ops.push_back(ChainVal->getOperand(i));
25198       }
25199     }
25200
25201     if (!Ld || !ISD::isNormalLoad(Ld))
25202       return SDValue();
25203
25204     // If this is not the MMX case, i.e. we are just turning i64 load/store
25205     // into f64 load/store, avoid the transformation if there are multiple
25206     // uses of the loaded value.
25207     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25208       return SDValue();
25209
25210     SDLoc LdDL(Ld);
25211     SDLoc StDL(N);
25212     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25213     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25214     // pair instead.
25215     if (Subtarget->is64Bit() || F64IsLegal) {
25216       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25217       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25218                                   Ld->getPointerInfo(), Ld->isVolatile(),
25219                                   Ld->isNonTemporal(), Ld->isInvariant(),
25220                                   Ld->getAlignment());
25221       SDValue NewChain = NewLd.getValue(1);
25222       if (TokenFactorIndex != -1) {
25223         Ops.push_back(NewChain);
25224         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25225       }
25226       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25227                           St->getPointerInfo(),
25228                           St->isVolatile(), St->isNonTemporal(),
25229                           St->getAlignment());
25230     }
25231
25232     // Otherwise, lower to two pairs of 32-bit loads / stores.
25233     SDValue LoAddr = Ld->getBasePtr();
25234     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25235                                  DAG.getConstant(4, MVT::i32));
25236
25237     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25238                                Ld->getPointerInfo(),
25239                                Ld->isVolatile(), Ld->isNonTemporal(),
25240                                Ld->isInvariant(), Ld->getAlignment());
25241     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25242                                Ld->getPointerInfo().getWithOffset(4),
25243                                Ld->isVolatile(), Ld->isNonTemporal(),
25244                                Ld->isInvariant(),
25245                                MinAlign(Ld->getAlignment(), 4));
25246
25247     SDValue NewChain = LoLd.getValue(1);
25248     if (TokenFactorIndex != -1) {
25249       Ops.push_back(LoLd);
25250       Ops.push_back(HiLd);
25251       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25252     }
25253
25254     LoAddr = St->getBasePtr();
25255     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25256                          DAG.getConstant(4, MVT::i32));
25257
25258     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25259                                 St->getPointerInfo(),
25260                                 St->isVolatile(), St->isNonTemporal(),
25261                                 St->getAlignment());
25262     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25263                                 St->getPointerInfo().getWithOffset(4),
25264                                 St->isVolatile(),
25265                                 St->isNonTemporal(),
25266                                 MinAlign(St->getAlignment(), 4));
25267     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25268   }
25269   return SDValue();
25270 }
25271
25272 /// Return 'true' if this vector operation is "horizontal"
25273 /// and return the operands for the horizontal operation in LHS and RHS.  A
25274 /// horizontal operation performs the binary operation on successive elements
25275 /// of its first operand, then on successive elements of its second operand,
25276 /// returning the resulting values in a vector.  For example, if
25277 ///   A = < float a0, float a1, float a2, float a3 >
25278 /// and
25279 ///   B = < float b0, float b1, float b2, float b3 >
25280 /// then the result of doing a horizontal operation on A and B is
25281 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25282 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25283 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25284 /// set to A, RHS to B, and the routine returns 'true'.
25285 /// Note that the binary operation should have the property that if one of the
25286 /// operands is UNDEF then the result is UNDEF.
25287 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25288   // Look for the following pattern: if
25289   //   A = < float a0, float a1, float a2, float a3 >
25290   //   B = < float b0, float b1, float b2, float b3 >
25291   // and
25292   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25293   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25294   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25295   // which is A horizontal-op B.
25296
25297   // At least one of the operands should be a vector shuffle.
25298   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25299       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25300     return false;
25301
25302   MVT VT = LHS.getSimpleValueType();
25303
25304   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25305          "Unsupported vector type for horizontal add/sub");
25306
25307   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25308   // operate independently on 128-bit lanes.
25309   unsigned NumElts = VT.getVectorNumElements();
25310   unsigned NumLanes = VT.getSizeInBits()/128;
25311   unsigned NumLaneElts = NumElts / NumLanes;
25312   assert((NumLaneElts % 2 == 0) &&
25313          "Vector type should have an even number of elements in each lane");
25314   unsigned HalfLaneElts = NumLaneElts/2;
25315
25316   // View LHS in the form
25317   //   LHS = VECTOR_SHUFFLE A, B, LMask
25318   // If LHS is not a shuffle then pretend it is the shuffle
25319   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25320   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25321   // type VT.
25322   SDValue A, B;
25323   SmallVector<int, 16> LMask(NumElts);
25324   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25325     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25326       A = LHS.getOperand(0);
25327     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25328       B = LHS.getOperand(1);
25329     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25330     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25331   } else {
25332     if (LHS.getOpcode() != ISD::UNDEF)
25333       A = LHS;
25334     for (unsigned i = 0; i != NumElts; ++i)
25335       LMask[i] = i;
25336   }
25337
25338   // Likewise, view RHS in the form
25339   //   RHS = VECTOR_SHUFFLE C, D, RMask
25340   SDValue C, D;
25341   SmallVector<int, 16> RMask(NumElts);
25342   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25343     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25344       C = RHS.getOperand(0);
25345     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25346       D = RHS.getOperand(1);
25347     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25348     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25349   } else {
25350     if (RHS.getOpcode() != ISD::UNDEF)
25351       C = RHS;
25352     for (unsigned i = 0; i != NumElts; ++i)
25353       RMask[i] = i;
25354   }
25355
25356   // Check that the shuffles are both shuffling the same vectors.
25357   if (!(A == C && B == D) && !(A == D && B == C))
25358     return false;
25359
25360   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25361   if (!A.getNode() && !B.getNode())
25362     return false;
25363
25364   // If A and B occur in reverse order in RHS, then "swap" them (which means
25365   // rewriting the mask).
25366   if (A != C)
25367     CommuteVectorShuffleMask(RMask, NumElts);
25368
25369   // At this point LHS and RHS are equivalent to
25370   //   LHS = VECTOR_SHUFFLE A, B, LMask
25371   //   RHS = VECTOR_SHUFFLE A, B, RMask
25372   // Check that the masks correspond to performing a horizontal operation.
25373   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25374     for (unsigned i = 0; i != NumLaneElts; ++i) {
25375       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25376
25377       // Ignore any UNDEF components.
25378       if (LIdx < 0 || RIdx < 0 ||
25379           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25380           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25381         continue;
25382
25383       // Check that successive elements are being operated on.  If not, this is
25384       // not a horizontal operation.
25385       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25386       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25387       if (!(LIdx == Index && RIdx == Index + 1) &&
25388           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25389         return false;
25390     }
25391   }
25392
25393   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25394   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25395   return true;
25396 }
25397
25398 /// Do target-specific dag combines on floating point adds.
25399 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25400                                   const X86Subtarget *Subtarget) {
25401   EVT VT = N->getValueType(0);
25402   SDValue LHS = N->getOperand(0);
25403   SDValue RHS = N->getOperand(1);
25404
25405   // Try to synthesize horizontal adds from adds of shuffles.
25406   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25407        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25408       isHorizontalBinOp(LHS, RHS, true))
25409     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25410   return SDValue();
25411 }
25412
25413 /// Do target-specific dag combines on floating point subs.
25414 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25415                                   const X86Subtarget *Subtarget) {
25416   EVT VT = N->getValueType(0);
25417   SDValue LHS = N->getOperand(0);
25418   SDValue RHS = N->getOperand(1);
25419
25420   // Try to synthesize horizontal subs from subs of shuffles.
25421   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25422        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25423       isHorizontalBinOp(LHS, RHS, false))
25424     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25425   return SDValue();
25426 }
25427
25428 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25429 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25430   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25431   // F[X]OR(0.0, x) -> x
25432   // F[X]OR(x, 0.0) -> x
25433   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25434     if (C->getValueAPF().isPosZero())
25435       return N->getOperand(1);
25436   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25437     if (C->getValueAPF().isPosZero())
25438       return N->getOperand(0);
25439   return SDValue();
25440 }
25441
25442 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25443 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25444   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25445
25446   // Only perform optimizations if UnsafeMath is used.
25447   if (!DAG.getTarget().Options.UnsafeFPMath)
25448     return SDValue();
25449
25450   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25451   // into FMINC and FMAXC, which are Commutative operations.
25452   unsigned NewOp = 0;
25453   switch (N->getOpcode()) {
25454     default: llvm_unreachable("unknown opcode");
25455     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25456     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25457   }
25458
25459   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25460                      N->getOperand(0), N->getOperand(1));
25461 }
25462
25463 /// Do target-specific dag combines on X86ISD::FAND nodes.
25464 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25465   // FAND(0.0, x) -> 0.0
25466   // FAND(x, 0.0) -> 0.0
25467   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25468     if (C->getValueAPF().isPosZero())
25469       return N->getOperand(0);
25470   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25471     if (C->getValueAPF().isPosZero())
25472       return N->getOperand(1);
25473   return SDValue();
25474 }
25475
25476 /// Do target-specific dag combines on X86ISD::FANDN nodes
25477 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25478   // FANDN(x, 0.0) -> 0.0
25479   // FANDN(0.0, x) -> x
25480   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25481     if (C->getValueAPF().isPosZero())
25482       return N->getOperand(1);
25483   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25484     if (C->getValueAPF().isPosZero())
25485       return N->getOperand(1);
25486   return SDValue();
25487 }
25488
25489 static SDValue PerformBTCombine(SDNode *N,
25490                                 SelectionDAG &DAG,
25491                                 TargetLowering::DAGCombinerInfo &DCI) {
25492   // BT ignores high bits in the bit index operand.
25493   SDValue Op1 = N->getOperand(1);
25494   if (Op1.hasOneUse()) {
25495     unsigned BitWidth = Op1.getValueSizeInBits();
25496     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25497     APInt KnownZero, KnownOne;
25498     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25499                                           !DCI.isBeforeLegalizeOps());
25500     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25501     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25502         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25503       DCI.CommitTargetLoweringOpt(TLO);
25504   }
25505   return SDValue();
25506 }
25507
25508 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25509   SDValue Op = N->getOperand(0);
25510   if (Op.getOpcode() == ISD::BITCAST)
25511     Op = Op.getOperand(0);
25512   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25513   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25514       VT.getVectorElementType().getSizeInBits() ==
25515       OpVT.getVectorElementType().getSizeInBits()) {
25516     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25517   }
25518   return SDValue();
25519 }
25520
25521 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25522                                                const X86Subtarget *Subtarget) {
25523   EVT VT = N->getValueType(0);
25524   if (!VT.isVector())
25525     return SDValue();
25526
25527   SDValue N0 = N->getOperand(0);
25528   SDValue N1 = N->getOperand(1);
25529   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25530   SDLoc dl(N);
25531
25532   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25533   // both SSE and AVX2 since there is no sign-extended shift right
25534   // operation on a vector with 64-bit elements.
25535   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25536   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25537   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25538       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25539     SDValue N00 = N0.getOperand(0);
25540
25541     // EXTLOAD has a better solution on AVX2,
25542     // it may be replaced with X86ISD::VSEXT node.
25543     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25544       if (!ISD::isNormalLoad(N00.getNode()))
25545         return SDValue();
25546
25547     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25548         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25549                                   N00, N1);
25550       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25551     }
25552   }
25553   return SDValue();
25554 }
25555
25556 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25557                                   TargetLowering::DAGCombinerInfo &DCI,
25558                                   const X86Subtarget *Subtarget) {
25559   SDValue N0 = N->getOperand(0);
25560   EVT VT = N->getValueType(0);
25561
25562   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25563   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25564   // This exposes the sext to the sdivrem lowering, so that it directly extends
25565   // from AH (which we otherwise need to do contortions to access).
25566   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25567       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25568     SDLoc dl(N);
25569     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25570     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25571                             N0.getOperand(0), N0.getOperand(1));
25572     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25573     return R.getValue(1);
25574   }
25575
25576   if (!DCI.isBeforeLegalizeOps())
25577     return SDValue();
25578
25579   if (!Subtarget->hasFp256())
25580     return SDValue();
25581
25582   if (VT.isVector() && VT.getSizeInBits() == 256) {
25583     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25584     if (R.getNode())
25585       return R;
25586   }
25587
25588   return SDValue();
25589 }
25590
25591 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25592                                  const X86Subtarget* Subtarget) {
25593   SDLoc dl(N);
25594   EVT VT = N->getValueType(0);
25595
25596   // Let legalize expand this if it isn't a legal type yet.
25597   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25598     return SDValue();
25599
25600   EVT ScalarVT = VT.getScalarType();
25601   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25602       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25603     return SDValue();
25604
25605   SDValue A = N->getOperand(0);
25606   SDValue B = N->getOperand(1);
25607   SDValue C = N->getOperand(2);
25608
25609   bool NegA = (A.getOpcode() == ISD::FNEG);
25610   bool NegB = (B.getOpcode() == ISD::FNEG);
25611   bool NegC = (C.getOpcode() == ISD::FNEG);
25612
25613   // Negative multiplication when NegA xor NegB
25614   bool NegMul = (NegA != NegB);
25615   if (NegA)
25616     A = A.getOperand(0);
25617   if (NegB)
25618     B = B.getOperand(0);
25619   if (NegC)
25620     C = C.getOperand(0);
25621
25622   unsigned Opcode;
25623   if (!NegMul)
25624     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25625   else
25626     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25627
25628   return DAG.getNode(Opcode, dl, VT, A, B, C);
25629 }
25630
25631 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25632                                   TargetLowering::DAGCombinerInfo &DCI,
25633                                   const X86Subtarget *Subtarget) {
25634   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25635   //           (and (i32 x86isd::setcc_carry), 1)
25636   // This eliminates the zext. This transformation is necessary because
25637   // ISD::SETCC is always legalized to i8.
25638   SDLoc dl(N);
25639   SDValue N0 = N->getOperand(0);
25640   EVT VT = N->getValueType(0);
25641
25642   if (N0.getOpcode() == ISD::AND &&
25643       N0.hasOneUse() &&
25644       N0.getOperand(0).hasOneUse()) {
25645     SDValue N00 = N0.getOperand(0);
25646     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25647       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25648       if (!C || C->getZExtValue() != 1)
25649         return SDValue();
25650       return DAG.getNode(ISD::AND, dl, VT,
25651                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25652                                      N00.getOperand(0), N00.getOperand(1)),
25653                          DAG.getConstant(1, VT));
25654     }
25655   }
25656
25657   if (N0.getOpcode() == ISD::TRUNCATE &&
25658       N0.hasOneUse() &&
25659       N0.getOperand(0).hasOneUse()) {
25660     SDValue N00 = N0.getOperand(0);
25661     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25662       return DAG.getNode(ISD::AND, dl, VT,
25663                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25664                                      N00.getOperand(0), N00.getOperand(1)),
25665                          DAG.getConstant(1, VT));
25666     }
25667   }
25668   if (VT.is256BitVector()) {
25669     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25670     if (R.getNode())
25671       return R;
25672   }
25673
25674   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25675   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25676   // This exposes the zext to the udivrem lowering, so that it directly extends
25677   // from AH (which we otherwise need to do contortions to access).
25678   if (N0.getOpcode() == ISD::UDIVREM &&
25679       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25680       (VT == MVT::i32 || VT == MVT::i64)) {
25681     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25682     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25683                             N0.getOperand(0), N0.getOperand(1));
25684     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25685     return R.getValue(1);
25686   }
25687
25688   return SDValue();
25689 }
25690
25691 // Optimize x == -y --> x+y == 0
25692 //          x != -y --> x+y != 0
25693 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25694                                       const X86Subtarget* Subtarget) {
25695   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25696   SDValue LHS = N->getOperand(0);
25697   SDValue RHS = N->getOperand(1);
25698   EVT VT = N->getValueType(0);
25699   SDLoc DL(N);
25700
25701   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25702     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25703       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25704         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25705                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25706         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25707                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25708       }
25709   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25710     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25711       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25712         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25713                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25714         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25715                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25716       }
25717
25718   if (VT.getScalarType() == MVT::i1) {
25719     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25720       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25721     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25722     if (!IsSEXT0 && !IsVZero0)
25723       return SDValue();
25724     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25725       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25726     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25727
25728     if (!IsSEXT1 && !IsVZero1)
25729       return SDValue();
25730
25731     if (IsSEXT0 && IsVZero1) {
25732       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25733       if (CC == ISD::SETEQ)
25734         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25735       return LHS.getOperand(0);
25736     }
25737     if (IsSEXT1 && IsVZero0) {
25738       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25739       if (CC == ISD::SETEQ)
25740         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25741       return RHS.getOperand(0);
25742     }
25743   }
25744
25745   return SDValue();
25746 }
25747
25748 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25749                                       const X86Subtarget *Subtarget) {
25750   SDLoc dl(N);
25751   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25752   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25753          "X86insertps is only defined for v4x32");
25754
25755   SDValue Ld = N->getOperand(1);
25756   if (MayFoldLoad(Ld)) {
25757     // Extract the countS bits from the immediate so we can get the proper
25758     // address when narrowing the vector load to a specific element.
25759     // When the second source op is a memory address, interps doesn't use
25760     // countS and just gets an f32 from that address.
25761     unsigned DestIndex =
25762         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25763     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25764   } else
25765     return SDValue();
25766
25767   // Create this as a scalar to vector to match the instruction pattern.
25768   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25769   // countS bits are ignored when loading from memory on insertps, which
25770   // means we don't need to explicitly set them to 0.
25771   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25772                      LoadScalarToVector, N->getOperand(2));
25773 }
25774
25775 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25776 // as "sbb reg,reg", since it can be extended without zext and produces
25777 // an all-ones bit which is more useful than 0/1 in some cases.
25778 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25779                                MVT VT) {
25780   if (VT == MVT::i8)
25781     return DAG.getNode(ISD::AND, DL, VT,
25782                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25783                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25784                        DAG.getConstant(1, VT));
25785   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25786   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25787                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25788                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25789 }
25790
25791 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25792 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25793                                    TargetLowering::DAGCombinerInfo &DCI,
25794                                    const X86Subtarget *Subtarget) {
25795   SDLoc DL(N);
25796   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25797   SDValue EFLAGS = N->getOperand(1);
25798
25799   if (CC == X86::COND_A) {
25800     // Try to convert COND_A into COND_B in an attempt to facilitate
25801     // materializing "setb reg".
25802     //
25803     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25804     // cannot take an immediate as its first operand.
25805     //
25806     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25807         EFLAGS.getValueType().isInteger() &&
25808         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25809       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25810                                    EFLAGS.getNode()->getVTList(),
25811                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25812       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25813       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25814     }
25815   }
25816
25817   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25818   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25819   // cases.
25820   if (CC == X86::COND_B)
25821     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25822
25823   SDValue Flags;
25824
25825   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25826   if (Flags.getNode()) {
25827     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25828     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25829   }
25830
25831   return SDValue();
25832 }
25833
25834 // Optimize branch condition evaluation.
25835 //
25836 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25837                                     TargetLowering::DAGCombinerInfo &DCI,
25838                                     const X86Subtarget *Subtarget) {
25839   SDLoc DL(N);
25840   SDValue Chain = N->getOperand(0);
25841   SDValue Dest = N->getOperand(1);
25842   SDValue EFLAGS = N->getOperand(3);
25843   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25844
25845   SDValue Flags;
25846
25847   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25848   if (Flags.getNode()) {
25849     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25850     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25851                        Flags);
25852   }
25853
25854   return SDValue();
25855 }
25856
25857 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25858                                                          SelectionDAG &DAG) {
25859   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25860   // optimize away operation when it's from a constant.
25861   //
25862   // The general transformation is:
25863   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25864   //       AND(VECTOR_CMP(x,y), constant2)
25865   //    constant2 = UNARYOP(constant)
25866
25867   // Early exit if this isn't a vector operation, the operand of the
25868   // unary operation isn't a bitwise AND, or if the sizes of the operations
25869   // aren't the same.
25870   EVT VT = N->getValueType(0);
25871   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25872       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25873       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25874     return SDValue();
25875
25876   // Now check that the other operand of the AND is a constant. We could
25877   // make the transformation for non-constant splats as well, but it's unclear
25878   // that would be a benefit as it would not eliminate any operations, just
25879   // perform one more step in scalar code before moving to the vector unit.
25880   if (BuildVectorSDNode *BV =
25881           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25882     // Bail out if the vector isn't a constant.
25883     if (!BV->isConstant())
25884       return SDValue();
25885
25886     // Everything checks out. Build up the new and improved node.
25887     SDLoc DL(N);
25888     EVT IntVT = BV->getValueType(0);
25889     // Create a new constant of the appropriate type for the transformed
25890     // DAG.
25891     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25892     // The AND node needs bitcasts to/from an integer vector type around it.
25893     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25894     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25895                                  N->getOperand(0)->getOperand(0), MaskConst);
25896     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25897     return Res;
25898   }
25899
25900   return SDValue();
25901 }
25902
25903 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25904                                         const X86Subtarget *Subtarget) {
25905   // First try to optimize away the conversion entirely when it's
25906   // conditionally from a constant. Vectors only.
25907   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25908   if (Res != SDValue())
25909     return Res;
25910
25911   // Now move on to more general possibilities.
25912   SDValue Op0 = N->getOperand(0);
25913   EVT InVT = Op0->getValueType(0);
25914
25915   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25916   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25917     SDLoc dl(N);
25918     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25919     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25920     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25921   }
25922
25923   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25924   // a 32-bit target where SSE doesn't support i64->FP operations.
25925   if (Op0.getOpcode() == ISD::LOAD) {
25926     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25927     EVT VT = Ld->getValueType(0);
25928     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25929         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25930         !Subtarget->is64Bit() && VT == MVT::i64) {
25931       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25932           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25933       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25934       return FILDChain;
25935     }
25936   }
25937   return SDValue();
25938 }
25939
25940 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25941 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25942                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25943   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25944   // the result is either zero or one (depending on the input carry bit).
25945   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25946   if (X86::isZeroNode(N->getOperand(0)) &&
25947       X86::isZeroNode(N->getOperand(1)) &&
25948       // We don't have a good way to replace an EFLAGS use, so only do this when
25949       // dead right now.
25950       SDValue(N, 1).use_empty()) {
25951     SDLoc DL(N);
25952     EVT VT = N->getValueType(0);
25953     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25954     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25955                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25956                                            DAG.getConstant(X86::COND_B,MVT::i8),
25957                                            N->getOperand(2)),
25958                                DAG.getConstant(1, VT));
25959     return DCI.CombineTo(N, Res1, CarryOut);
25960   }
25961
25962   return SDValue();
25963 }
25964
25965 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25966 //      (add Y, (setne X, 0)) -> sbb -1, Y
25967 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25968 //      (sub (setne X, 0), Y) -> adc -1, Y
25969 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25970   SDLoc DL(N);
25971
25972   // Look through ZExts.
25973   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25974   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25975     return SDValue();
25976
25977   SDValue SetCC = Ext.getOperand(0);
25978   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25979     return SDValue();
25980
25981   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25982   if (CC != X86::COND_E && CC != X86::COND_NE)
25983     return SDValue();
25984
25985   SDValue Cmp = SetCC.getOperand(1);
25986   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25987       !X86::isZeroNode(Cmp.getOperand(1)) ||
25988       !Cmp.getOperand(0).getValueType().isInteger())
25989     return SDValue();
25990
25991   SDValue CmpOp0 = Cmp.getOperand(0);
25992   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25993                                DAG.getConstant(1, CmpOp0.getValueType()));
25994
25995   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25996   if (CC == X86::COND_NE)
25997     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25998                        DL, OtherVal.getValueType(), OtherVal,
25999                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26000   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26001                      DL, OtherVal.getValueType(), OtherVal,
26002                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26003 }
26004
26005 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26006 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26007                                  const X86Subtarget *Subtarget) {
26008   EVT VT = N->getValueType(0);
26009   SDValue Op0 = N->getOperand(0);
26010   SDValue Op1 = N->getOperand(1);
26011
26012   // Try to synthesize horizontal adds from adds of shuffles.
26013   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26014        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26015       isHorizontalBinOp(Op0, Op1, true))
26016     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26017
26018   return OptimizeConditionalInDecrement(N, DAG);
26019 }
26020
26021 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26022                                  const X86Subtarget *Subtarget) {
26023   SDValue Op0 = N->getOperand(0);
26024   SDValue Op1 = N->getOperand(1);
26025
26026   // X86 can't encode an immediate LHS of a sub. See if we can push the
26027   // negation into a preceding instruction.
26028   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26029     // If the RHS of the sub is a XOR with one use and a constant, invert the
26030     // immediate. Then add one to the LHS of the sub so we can turn
26031     // X-Y -> X+~Y+1, saving one register.
26032     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26033         isa<ConstantSDNode>(Op1.getOperand(1))) {
26034       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26035       EVT VT = Op0.getValueType();
26036       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26037                                    Op1.getOperand(0),
26038                                    DAG.getConstant(~XorC, VT));
26039       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26040                          DAG.getConstant(C->getAPIntValue()+1, VT));
26041     }
26042   }
26043
26044   // Try to synthesize horizontal adds from adds of shuffles.
26045   EVT VT = N->getValueType(0);
26046   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26047        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26048       isHorizontalBinOp(Op0, Op1, true))
26049     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26050
26051   return OptimizeConditionalInDecrement(N, DAG);
26052 }
26053
26054 /// performVZEXTCombine - Performs build vector combines
26055 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26056                                    TargetLowering::DAGCombinerInfo &DCI,
26057                                    const X86Subtarget *Subtarget) {
26058   SDLoc DL(N);
26059   MVT VT = N->getSimpleValueType(0);
26060   SDValue Op = N->getOperand(0);
26061   MVT OpVT = Op.getSimpleValueType();
26062   MVT OpEltVT = OpVT.getVectorElementType();
26063   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26064
26065   // (vzext (bitcast (vzext (x)) -> (vzext x)
26066   SDValue V = Op;
26067   while (V.getOpcode() == ISD::BITCAST)
26068     V = V.getOperand(0);
26069
26070   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26071     MVT InnerVT = V.getSimpleValueType();
26072     MVT InnerEltVT = InnerVT.getVectorElementType();
26073
26074     // If the element sizes match exactly, we can just do one larger vzext. This
26075     // is always an exact type match as vzext operates on integer types.
26076     if (OpEltVT == InnerEltVT) {
26077       assert(OpVT == InnerVT && "Types must match for vzext!");
26078       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26079     }
26080
26081     // The only other way we can combine them is if only a single element of the
26082     // inner vzext is used in the input to the outer vzext.
26083     if (InnerEltVT.getSizeInBits() < InputBits)
26084       return SDValue();
26085
26086     // In this case, the inner vzext is completely dead because we're going to
26087     // only look at bits inside of the low element. Just do the outer vzext on
26088     // a bitcast of the input to the inner.
26089     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26090                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26091   }
26092
26093   // Check if we can bypass extracting and re-inserting an element of an input
26094   // vector. Essentialy:
26095   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26096   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26097       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26098       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26099     SDValue ExtractedV = V.getOperand(0);
26100     SDValue OrigV = ExtractedV.getOperand(0);
26101     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26102       if (ExtractIdx->getZExtValue() == 0) {
26103         MVT OrigVT = OrigV.getSimpleValueType();
26104         // Extract a subvector if necessary...
26105         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26106           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26107           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26108                                     OrigVT.getVectorNumElements() / Ratio);
26109           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26110                               DAG.getIntPtrConstant(0));
26111         }
26112         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26113         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26114       }
26115   }
26116
26117   return SDValue();
26118 }
26119
26120 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26121                                              DAGCombinerInfo &DCI) const {
26122   SelectionDAG &DAG = DCI.DAG;
26123   switch (N->getOpcode()) {
26124   default: break;
26125   case ISD::EXTRACT_VECTOR_ELT:
26126     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26127   case ISD::VSELECT:
26128   case ISD::SELECT:
26129   case X86ISD::SHRUNKBLEND:
26130     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26131   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26132   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26133   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26134   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26135   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26136   case ISD::SHL:
26137   case ISD::SRA:
26138   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26139   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26140   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26141   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26142   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26143   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26144   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26145   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26146   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26147   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26148   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26149   case X86ISD::FXOR:
26150   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26151   case X86ISD::FMIN:
26152   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26153   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26154   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26155   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26156   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26157   case ISD::ANY_EXTEND:
26158   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26159   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26160   case ISD::SIGN_EXTEND_INREG:
26161     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26162   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26163   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26164   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26165   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26166   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26167   case X86ISD::SHUFP:       // Handle all target specific shuffles
26168   case X86ISD::PALIGNR:
26169   case X86ISD::UNPCKH:
26170   case X86ISD::UNPCKL:
26171   case X86ISD::MOVHLPS:
26172   case X86ISD::MOVLHPS:
26173   case X86ISD::PSHUFB:
26174   case X86ISD::PSHUFD:
26175   case X86ISD::PSHUFHW:
26176   case X86ISD::PSHUFLW:
26177   case X86ISD::MOVSS:
26178   case X86ISD::MOVSD:
26179   case X86ISD::VPERMILPI:
26180   case X86ISD::VPERM2X128:
26181   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26182   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26183   case ISD::INTRINSIC_WO_CHAIN:
26184     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26185   case X86ISD::INSERTPS: {
26186     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26187       return PerformINSERTPSCombine(N, DAG, Subtarget);
26188     break;
26189   }
26190   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26191   }
26192
26193   return SDValue();
26194 }
26195
26196 /// isTypeDesirableForOp - Return true if the target has native support for
26197 /// the specified value type and it is 'desirable' to use the type for the
26198 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26199 /// instruction encodings are longer and some i16 instructions are slow.
26200 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26201   if (!isTypeLegal(VT))
26202     return false;
26203   if (VT != MVT::i16)
26204     return true;
26205
26206   switch (Opc) {
26207   default:
26208     return true;
26209   case ISD::LOAD:
26210   case ISD::SIGN_EXTEND:
26211   case ISD::ZERO_EXTEND:
26212   case ISD::ANY_EXTEND:
26213   case ISD::SHL:
26214   case ISD::SRL:
26215   case ISD::SUB:
26216   case ISD::ADD:
26217   case ISD::MUL:
26218   case ISD::AND:
26219   case ISD::OR:
26220   case ISD::XOR:
26221     return false;
26222   }
26223 }
26224
26225 /// IsDesirableToPromoteOp - This method query the target whether it is
26226 /// beneficial for dag combiner to promote the specified node. If true, it
26227 /// should return the desired promotion type by reference.
26228 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26229   EVT VT = Op.getValueType();
26230   if (VT != MVT::i16)
26231     return false;
26232
26233   bool Promote = false;
26234   bool Commute = false;
26235   switch (Op.getOpcode()) {
26236   default: break;
26237   case ISD::LOAD: {
26238     LoadSDNode *LD = cast<LoadSDNode>(Op);
26239     // If the non-extending load has a single use and it's not live out, then it
26240     // might be folded.
26241     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26242                                                      Op.hasOneUse()*/) {
26243       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26244              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26245         // The only case where we'd want to promote LOAD (rather then it being
26246         // promoted as an operand is when it's only use is liveout.
26247         if (UI->getOpcode() != ISD::CopyToReg)
26248           return false;
26249       }
26250     }
26251     Promote = true;
26252     break;
26253   }
26254   case ISD::SIGN_EXTEND:
26255   case ISD::ZERO_EXTEND:
26256   case ISD::ANY_EXTEND:
26257     Promote = true;
26258     break;
26259   case ISD::SHL:
26260   case ISD::SRL: {
26261     SDValue N0 = Op.getOperand(0);
26262     // Look out for (store (shl (load), x)).
26263     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26264       return false;
26265     Promote = true;
26266     break;
26267   }
26268   case ISD::ADD:
26269   case ISD::MUL:
26270   case ISD::AND:
26271   case ISD::OR:
26272   case ISD::XOR:
26273     Commute = true;
26274     // fallthrough
26275   case ISD::SUB: {
26276     SDValue N0 = Op.getOperand(0);
26277     SDValue N1 = Op.getOperand(1);
26278     if (!Commute && MayFoldLoad(N1))
26279       return false;
26280     // Avoid disabling potential load folding opportunities.
26281     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26282       return false;
26283     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26284       return false;
26285     Promote = true;
26286   }
26287   }
26288
26289   PVT = MVT::i32;
26290   return Promote;
26291 }
26292
26293 //===----------------------------------------------------------------------===//
26294 //                           X86 Inline Assembly Support
26295 //===----------------------------------------------------------------------===//
26296
26297 namespace {
26298   // Helper to match a string separated by whitespace.
26299   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26300     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26301
26302     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26303       StringRef piece(*args[i]);
26304       if (!s.startswith(piece)) // Check if the piece matches.
26305         return false;
26306
26307       s = s.substr(piece.size());
26308       StringRef::size_type pos = s.find_first_not_of(" \t");
26309       if (pos == 0) // We matched a prefix.
26310         return false;
26311
26312       s = s.substr(pos);
26313     }
26314
26315     return s.empty();
26316   }
26317   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26318 }
26319
26320 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26321
26322   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26323     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26324         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26325         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26326
26327       if (AsmPieces.size() == 3)
26328         return true;
26329       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26330         return true;
26331     }
26332   }
26333   return false;
26334 }
26335
26336 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26337   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26338
26339   std::string AsmStr = IA->getAsmString();
26340
26341   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26342   if (!Ty || Ty->getBitWidth() % 16 != 0)
26343     return false;
26344
26345   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26346   SmallVector<StringRef, 4> AsmPieces;
26347   SplitString(AsmStr, AsmPieces, ";\n");
26348
26349   switch (AsmPieces.size()) {
26350   default: return false;
26351   case 1:
26352     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26353     // we will turn this bswap into something that will be lowered to logical
26354     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26355     // lower so don't worry about this.
26356     // bswap $0
26357     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26358         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26359         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26360         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26361         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26362         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26363       // No need to check constraints, nothing other than the equivalent of
26364       // "=r,0" would be valid here.
26365       return IntrinsicLowering::LowerToByteSwap(CI);
26366     }
26367
26368     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26369     if (CI->getType()->isIntegerTy(16) &&
26370         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26371         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26372          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26373       AsmPieces.clear();
26374       const std::string &ConstraintsStr = IA->getConstraintString();
26375       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26376       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26377       if (clobbersFlagRegisters(AsmPieces))
26378         return IntrinsicLowering::LowerToByteSwap(CI);
26379     }
26380     break;
26381   case 3:
26382     if (CI->getType()->isIntegerTy(32) &&
26383         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26384         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26385         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26386         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26387       AsmPieces.clear();
26388       const std::string &ConstraintsStr = IA->getConstraintString();
26389       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26390       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26391       if (clobbersFlagRegisters(AsmPieces))
26392         return IntrinsicLowering::LowerToByteSwap(CI);
26393     }
26394
26395     if (CI->getType()->isIntegerTy(64)) {
26396       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26397       if (Constraints.size() >= 2 &&
26398           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26399           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26400         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26401         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26402             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26403             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26404           return IntrinsicLowering::LowerToByteSwap(CI);
26405       }
26406     }
26407     break;
26408   }
26409   return false;
26410 }
26411
26412 /// getConstraintType - Given a constraint letter, return the type of
26413 /// constraint it is for this target.
26414 X86TargetLowering::ConstraintType
26415 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26416   if (Constraint.size() == 1) {
26417     switch (Constraint[0]) {
26418     case 'R':
26419     case 'q':
26420     case 'Q':
26421     case 'f':
26422     case 't':
26423     case 'u':
26424     case 'y':
26425     case 'x':
26426     case 'Y':
26427     case 'l':
26428       return C_RegisterClass;
26429     case 'a':
26430     case 'b':
26431     case 'c':
26432     case 'd':
26433     case 'S':
26434     case 'D':
26435     case 'A':
26436       return C_Register;
26437     case 'I':
26438     case 'J':
26439     case 'K':
26440     case 'L':
26441     case 'M':
26442     case 'N':
26443     case 'G':
26444     case 'C':
26445     case 'e':
26446     case 'Z':
26447       return C_Other;
26448     default:
26449       break;
26450     }
26451   }
26452   return TargetLowering::getConstraintType(Constraint);
26453 }
26454
26455 /// Examine constraint type and operand type and determine a weight value.
26456 /// This object must already have been set up with the operand type
26457 /// and the current alternative constraint selected.
26458 TargetLowering::ConstraintWeight
26459   X86TargetLowering::getSingleConstraintMatchWeight(
26460     AsmOperandInfo &info, const char *constraint) const {
26461   ConstraintWeight weight = CW_Invalid;
26462   Value *CallOperandVal = info.CallOperandVal;
26463     // If we don't have a value, we can't do a match,
26464     // but allow it at the lowest weight.
26465   if (!CallOperandVal)
26466     return CW_Default;
26467   Type *type = CallOperandVal->getType();
26468   // Look at the constraint type.
26469   switch (*constraint) {
26470   default:
26471     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26472   case 'R':
26473   case 'q':
26474   case 'Q':
26475   case 'a':
26476   case 'b':
26477   case 'c':
26478   case 'd':
26479   case 'S':
26480   case 'D':
26481   case 'A':
26482     if (CallOperandVal->getType()->isIntegerTy())
26483       weight = CW_SpecificReg;
26484     break;
26485   case 'f':
26486   case 't':
26487   case 'u':
26488     if (type->isFloatingPointTy())
26489       weight = CW_SpecificReg;
26490     break;
26491   case 'y':
26492     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26493       weight = CW_SpecificReg;
26494     break;
26495   case 'x':
26496   case 'Y':
26497     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26498         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26499       weight = CW_Register;
26500     break;
26501   case 'I':
26502     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26503       if (C->getZExtValue() <= 31)
26504         weight = CW_Constant;
26505     }
26506     break;
26507   case 'J':
26508     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26509       if (C->getZExtValue() <= 63)
26510         weight = CW_Constant;
26511     }
26512     break;
26513   case 'K':
26514     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26515       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26516         weight = CW_Constant;
26517     }
26518     break;
26519   case 'L':
26520     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26521       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26522         weight = CW_Constant;
26523     }
26524     break;
26525   case 'M':
26526     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26527       if (C->getZExtValue() <= 3)
26528         weight = CW_Constant;
26529     }
26530     break;
26531   case 'N':
26532     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26533       if (C->getZExtValue() <= 0xff)
26534         weight = CW_Constant;
26535     }
26536     break;
26537   case 'G':
26538   case 'C':
26539     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26540       weight = CW_Constant;
26541     }
26542     break;
26543   case 'e':
26544     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26545       if ((C->getSExtValue() >= -0x80000000LL) &&
26546           (C->getSExtValue() <= 0x7fffffffLL))
26547         weight = CW_Constant;
26548     }
26549     break;
26550   case 'Z':
26551     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26552       if (C->getZExtValue() <= 0xffffffff)
26553         weight = CW_Constant;
26554     }
26555     break;
26556   }
26557   return weight;
26558 }
26559
26560 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26561 /// with another that has more specific requirements based on the type of the
26562 /// corresponding operand.
26563 const char *X86TargetLowering::
26564 LowerXConstraint(EVT ConstraintVT) const {
26565   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26566   // 'f' like normal targets.
26567   if (ConstraintVT.isFloatingPoint()) {
26568     if (Subtarget->hasSSE2())
26569       return "Y";
26570     if (Subtarget->hasSSE1())
26571       return "x";
26572   }
26573
26574   return TargetLowering::LowerXConstraint(ConstraintVT);
26575 }
26576
26577 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26578 /// vector.  If it is invalid, don't add anything to Ops.
26579 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26580                                                      std::string &Constraint,
26581                                                      std::vector<SDValue>&Ops,
26582                                                      SelectionDAG &DAG) const {
26583   SDValue Result;
26584
26585   // Only support length 1 constraints for now.
26586   if (Constraint.length() > 1) return;
26587
26588   char ConstraintLetter = Constraint[0];
26589   switch (ConstraintLetter) {
26590   default: break;
26591   case 'I':
26592     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26593       if (C->getZExtValue() <= 31) {
26594         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26595         break;
26596       }
26597     }
26598     return;
26599   case 'J':
26600     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26601       if (C->getZExtValue() <= 63) {
26602         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26603         break;
26604       }
26605     }
26606     return;
26607   case 'K':
26608     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26609       if (isInt<8>(C->getSExtValue())) {
26610         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26611         break;
26612       }
26613     }
26614     return;
26615   case 'L':
26616     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26617       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26618           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26619         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26620         break;
26621       }
26622     }
26623     return;
26624   case 'M':
26625     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26626       if (C->getZExtValue() <= 3) {
26627         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26628         break;
26629       }
26630     }
26631     return;
26632   case 'N':
26633     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26634       if (C->getZExtValue() <= 255) {
26635         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26636         break;
26637       }
26638     }
26639     return;
26640   case 'O':
26641     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26642       if (C->getZExtValue() <= 127) {
26643         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26644         break;
26645       }
26646     }
26647     return;
26648   case 'e': {
26649     // 32-bit signed value
26650     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26651       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26652                                            C->getSExtValue())) {
26653         // Widen to 64 bits here to get it sign extended.
26654         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26655         break;
26656       }
26657     // FIXME gcc accepts some relocatable values here too, but only in certain
26658     // memory models; it's complicated.
26659     }
26660     return;
26661   }
26662   case 'Z': {
26663     // 32-bit unsigned value
26664     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26665       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26666                                            C->getZExtValue())) {
26667         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26668         break;
26669       }
26670     }
26671     // FIXME gcc accepts some relocatable values here too, but only in certain
26672     // memory models; it's complicated.
26673     return;
26674   }
26675   case 'i': {
26676     // Literal immediates are always ok.
26677     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26678       // Widen to 64 bits here to get it sign extended.
26679       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26680       break;
26681     }
26682
26683     // In any sort of PIC mode addresses need to be computed at runtime by
26684     // adding in a register or some sort of table lookup.  These can't
26685     // be used as immediates.
26686     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26687       return;
26688
26689     // If we are in non-pic codegen mode, we allow the address of a global (with
26690     // an optional displacement) to be used with 'i'.
26691     GlobalAddressSDNode *GA = nullptr;
26692     int64_t Offset = 0;
26693
26694     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26695     while (1) {
26696       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26697         Offset += GA->getOffset();
26698         break;
26699       } else if (Op.getOpcode() == ISD::ADD) {
26700         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26701           Offset += C->getZExtValue();
26702           Op = Op.getOperand(0);
26703           continue;
26704         }
26705       } else if (Op.getOpcode() == ISD::SUB) {
26706         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26707           Offset += -C->getZExtValue();
26708           Op = Op.getOperand(0);
26709           continue;
26710         }
26711       }
26712
26713       // Otherwise, this isn't something we can handle, reject it.
26714       return;
26715     }
26716
26717     const GlobalValue *GV = GA->getGlobal();
26718     // If we require an extra load to get this address, as in PIC mode, we
26719     // can't accept it.
26720     if (isGlobalStubReference(
26721             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26722       return;
26723
26724     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26725                                         GA->getValueType(0), Offset);
26726     break;
26727   }
26728   }
26729
26730   if (Result.getNode()) {
26731     Ops.push_back(Result);
26732     return;
26733   }
26734   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26735 }
26736
26737 std::pair<unsigned, const TargetRegisterClass*>
26738 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26739                                                 MVT VT) const {
26740   // First, see if this is a constraint that directly corresponds to an LLVM
26741   // register class.
26742   if (Constraint.size() == 1) {
26743     // GCC Constraint Letters
26744     switch (Constraint[0]) {
26745     default: break;
26746       // TODO: Slight differences here in allocation order and leaving
26747       // RIP in the class. Do they matter any more here than they do
26748       // in the normal allocation?
26749     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26750       if (Subtarget->is64Bit()) {
26751         if (VT == MVT::i32 || VT == MVT::f32)
26752           return std::make_pair(0U, &X86::GR32RegClass);
26753         if (VT == MVT::i16)
26754           return std::make_pair(0U, &X86::GR16RegClass);
26755         if (VT == MVT::i8 || VT == MVT::i1)
26756           return std::make_pair(0U, &X86::GR8RegClass);
26757         if (VT == MVT::i64 || VT == MVT::f64)
26758           return std::make_pair(0U, &X86::GR64RegClass);
26759         break;
26760       }
26761       // 32-bit fallthrough
26762     case 'Q':   // Q_REGS
26763       if (VT == MVT::i32 || VT == MVT::f32)
26764         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26765       if (VT == MVT::i16)
26766         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26767       if (VT == MVT::i8 || VT == MVT::i1)
26768         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26769       if (VT == MVT::i64)
26770         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26771       break;
26772     case 'r':   // GENERAL_REGS
26773     case 'l':   // INDEX_REGS
26774       if (VT == MVT::i8 || VT == MVT::i1)
26775         return std::make_pair(0U, &X86::GR8RegClass);
26776       if (VT == MVT::i16)
26777         return std::make_pair(0U, &X86::GR16RegClass);
26778       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26779         return std::make_pair(0U, &X86::GR32RegClass);
26780       return std::make_pair(0U, &X86::GR64RegClass);
26781     case 'R':   // LEGACY_REGS
26782       if (VT == MVT::i8 || VT == MVT::i1)
26783         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26784       if (VT == MVT::i16)
26785         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26786       if (VT == MVT::i32 || !Subtarget->is64Bit())
26787         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26788       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26789     case 'f':  // FP Stack registers.
26790       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26791       // value to the correct fpstack register class.
26792       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26793         return std::make_pair(0U, &X86::RFP32RegClass);
26794       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26795         return std::make_pair(0U, &X86::RFP64RegClass);
26796       return std::make_pair(0U, &X86::RFP80RegClass);
26797     case 'y':   // MMX_REGS if MMX allowed.
26798       if (!Subtarget->hasMMX()) break;
26799       return std::make_pair(0U, &X86::VR64RegClass);
26800     case 'Y':   // SSE_REGS if SSE2 allowed
26801       if (!Subtarget->hasSSE2()) break;
26802       // FALL THROUGH.
26803     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26804       if (!Subtarget->hasSSE1()) break;
26805
26806       switch (VT.SimpleTy) {
26807       default: break;
26808       // Scalar SSE types.
26809       case MVT::f32:
26810       case MVT::i32:
26811         return std::make_pair(0U, &X86::FR32RegClass);
26812       case MVT::f64:
26813       case MVT::i64:
26814         return std::make_pair(0U, &X86::FR64RegClass);
26815       // Vector types.
26816       case MVT::v16i8:
26817       case MVT::v8i16:
26818       case MVT::v4i32:
26819       case MVT::v2i64:
26820       case MVT::v4f32:
26821       case MVT::v2f64:
26822         return std::make_pair(0U, &X86::VR128RegClass);
26823       // AVX types.
26824       case MVT::v32i8:
26825       case MVT::v16i16:
26826       case MVT::v8i32:
26827       case MVT::v4i64:
26828       case MVT::v8f32:
26829       case MVT::v4f64:
26830         return std::make_pair(0U, &X86::VR256RegClass);
26831       case MVT::v8f64:
26832       case MVT::v16f32:
26833       case MVT::v16i32:
26834       case MVT::v8i64:
26835         return std::make_pair(0U, &X86::VR512RegClass);
26836       }
26837       break;
26838     }
26839   }
26840
26841   // Use the default implementation in TargetLowering to convert the register
26842   // constraint into a member of a register class.
26843   std::pair<unsigned, const TargetRegisterClass*> Res;
26844   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26845
26846   // Not found as a standard register?
26847   if (!Res.second) {
26848     // Map st(0) -> st(7) -> ST0
26849     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26850         tolower(Constraint[1]) == 's' &&
26851         tolower(Constraint[2]) == 't' &&
26852         Constraint[3] == '(' &&
26853         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26854         Constraint[5] == ')' &&
26855         Constraint[6] == '}') {
26856
26857       Res.first = X86::FP0+Constraint[4]-'0';
26858       Res.second = &X86::RFP80RegClass;
26859       return Res;
26860     }
26861
26862     // GCC allows "st(0)" to be called just plain "st".
26863     if (StringRef("{st}").equals_lower(Constraint)) {
26864       Res.first = X86::FP0;
26865       Res.second = &X86::RFP80RegClass;
26866       return Res;
26867     }
26868
26869     // flags -> EFLAGS
26870     if (StringRef("{flags}").equals_lower(Constraint)) {
26871       Res.first = X86::EFLAGS;
26872       Res.second = &X86::CCRRegClass;
26873       return Res;
26874     }
26875
26876     // 'A' means EAX + EDX.
26877     if (Constraint == "A") {
26878       Res.first = X86::EAX;
26879       Res.second = &X86::GR32_ADRegClass;
26880       return Res;
26881     }
26882     return Res;
26883   }
26884
26885   // Otherwise, check to see if this is a register class of the wrong value
26886   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26887   // turn into {ax},{dx}.
26888   if (Res.second->hasType(VT))
26889     return Res;   // Correct type already, nothing to do.
26890
26891   // All of the single-register GCC register classes map their values onto
26892   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26893   // really want an 8-bit or 32-bit register, map to the appropriate register
26894   // class and return the appropriate register.
26895   if (Res.second == &X86::GR16RegClass) {
26896     if (VT == MVT::i8 || VT == MVT::i1) {
26897       unsigned DestReg = 0;
26898       switch (Res.first) {
26899       default: break;
26900       case X86::AX: DestReg = X86::AL; break;
26901       case X86::DX: DestReg = X86::DL; break;
26902       case X86::CX: DestReg = X86::CL; break;
26903       case X86::BX: DestReg = X86::BL; break;
26904       }
26905       if (DestReg) {
26906         Res.first = DestReg;
26907         Res.second = &X86::GR8RegClass;
26908       }
26909     } else if (VT == MVT::i32 || VT == MVT::f32) {
26910       unsigned DestReg = 0;
26911       switch (Res.first) {
26912       default: break;
26913       case X86::AX: DestReg = X86::EAX; break;
26914       case X86::DX: DestReg = X86::EDX; break;
26915       case X86::CX: DestReg = X86::ECX; break;
26916       case X86::BX: DestReg = X86::EBX; break;
26917       case X86::SI: DestReg = X86::ESI; break;
26918       case X86::DI: DestReg = X86::EDI; break;
26919       case X86::BP: DestReg = X86::EBP; break;
26920       case X86::SP: DestReg = X86::ESP; break;
26921       }
26922       if (DestReg) {
26923         Res.first = DestReg;
26924         Res.second = &X86::GR32RegClass;
26925       }
26926     } else if (VT == MVT::i64 || VT == MVT::f64) {
26927       unsigned DestReg = 0;
26928       switch (Res.first) {
26929       default: break;
26930       case X86::AX: DestReg = X86::RAX; break;
26931       case X86::DX: DestReg = X86::RDX; break;
26932       case X86::CX: DestReg = X86::RCX; break;
26933       case X86::BX: DestReg = X86::RBX; break;
26934       case X86::SI: DestReg = X86::RSI; break;
26935       case X86::DI: DestReg = X86::RDI; break;
26936       case X86::BP: DestReg = X86::RBP; break;
26937       case X86::SP: DestReg = X86::RSP; break;
26938       }
26939       if (DestReg) {
26940         Res.first = DestReg;
26941         Res.second = &X86::GR64RegClass;
26942       }
26943     }
26944   } else if (Res.second == &X86::FR32RegClass ||
26945              Res.second == &X86::FR64RegClass ||
26946              Res.second == &X86::VR128RegClass ||
26947              Res.second == &X86::VR256RegClass ||
26948              Res.second == &X86::FR32XRegClass ||
26949              Res.second == &X86::FR64XRegClass ||
26950              Res.second == &X86::VR128XRegClass ||
26951              Res.second == &X86::VR256XRegClass ||
26952              Res.second == &X86::VR512RegClass) {
26953     // Handle references to XMM physical registers that got mapped into the
26954     // wrong class.  This can happen with constraints like {xmm0} where the
26955     // target independent register mapper will just pick the first match it can
26956     // find, ignoring the required type.
26957
26958     if (VT == MVT::f32 || VT == MVT::i32)
26959       Res.second = &X86::FR32RegClass;
26960     else if (VT == MVT::f64 || VT == MVT::i64)
26961       Res.second = &X86::FR64RegClass;
26962     else if (X86::VR128RegClass.hasType(VT))
26963       Res.second = &X86::VR128RegClass;
26964     else if (X86::VR256RegClass.hasType(VT))
26965       Res.second = &X86::VR256RegClass;
26966     else if (X86::VR512RegClass.hasType(VT))
26967       Res.second = &X86::VR512RegClass;
26968   }
26969
26970   return Res;
26971 }
26972
26973 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26974                                             Type *Ty) const {
26975   // Scaling factors are not free at all.
26976   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26977   // will take 2 allocations in the out of order engine instead of 1
26978   // for plain addressing mode, i.e. inst (reg1).
26979   // E.g.,
26980   // vaddps (%rsi,%drx), %ymm0, %ymm1
26981   // Requires two allocations (one for the load, one for the computation)
26982   // whereas:
26983   // vaddps (%rsi), %ymm0, %ymm1
26984   // Requires just 1 allocation, i.e., freeing allocations for other operations
26985   // and having less micro operations to execute.
26986   //
26987   // For some X86 architectures, this is even worse because for instance for
26988   // stores, the complex addressing mode forces the instruction to use the
26989   // "load" ports instead of the dedicated "store" port.
26990   // E.g., on Haswell:
26991   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26992   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26993   if (isLegalAddressingMode(AM, Ty))
26994     // Scale represents reg2 * scale, thus account for 1
26995     // as soon as we use a second register.
26996     return AM.Scale != 0;
26997   return -1;
26998 }
26999
27000 bool X86TargetLowering::isTargetFTOL() const {
27001   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27002 }