lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
 574   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types; we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and the
1134     // source memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730 }
1731
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734   if (ExperimentalVectorWideningLegalization &&
1735       VT.getVectorNumElements() != 1 &&
1736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737     return TypeWidenVector;
1738
1739   return TargetLoweringBase::getPreferredVectorAction(VT);
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (Subtarget->is64Bit()) {
1816     // Max of 8 and alignment of type.
1817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818     if (TyAlign > 8)
1819       return TyAlign;
1820     return 8;
1821   }
1822
1823   unsigned Align = 4;
1824   if (Subtarget->hasSSE1())
1825     getMaxByValAlign(Ty, Align);
1826   return Align;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849                                        Attribute::NoImplicitFloat)) {
1850     if (Size >= 16 &&
1851         (Subtarget->isUnalignedMemAccessFast() ||
1852          ((DstAlign == 0 || DstAlign >= 16) &&
1853           (SrcAlign == 0 || SrcAlign >= 16)))) {
1854       if (Size >= 32) {
1855         if (Subtarget->hasInt256())
1856           return MVT::v8i32;
1857         if (Subtarget->hasFp256())
1858           return MVT::v8f32;
1859       }
1860       if (Subtarget->hasSSE2())
1861         return MVT::v4i32;
1862       if (Subtarget->hasSSE1())
1863         return MVT::v4f32;
1864     } else if (!MemcpyStrSrc && Size >= 8 &&
1865                !Subtarget->is64Bit() &&
1866                Subtarget->hasSSE2()) {
1867       // Do not use f64 to lower memcpy if source is string constant. It's
1868       // better to use i32 to avoid the loads.
1869       return MVT::f64;
1870     }
1871   }
1872   if (Subtarget->is64Bit() && Size >= 8)
1873     return MVT::i64;
1874   return MVT::i32;
1875 }
1876
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878   if (VT == MVT::f32)
1879     return X86ScalarSSEf32;
1880   else if (VT == MVT::f64)
1881     return X86ScalarSSEf64;
1882   return true;
1883 }
1884
1885 bool
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887                                                   unsigned,
1888                                                   unsigned,
1889                                                   bool *Fast) const {
1890   if (Fast)
1891     *Fast = Subtarget->isUnalignedMemAccessFast();
1892   return true;
1893 }
1894
1895 /// Return the entry encoding for a jump table in the
1896 /// current function.  The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900   // symbol.
1901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902       Subtarget->isPICStyleGOT())
1903     return MachineJumpTableInfo::EK_Custom32;
1904
1905   // Otherwise, use the normal jump table encoding heuristics.
1906   return TargetLowering::getJumpTableEncoding();
1907 }
1908
1909 const MCExpr *
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911                                              const MachineBasicBlock *MBB,
1912                                              unsigned uid,MCContext &Ctx) const{
1913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914          Subtarget->isPICStyleGOT());
1915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916   // entries.
1917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919 }
1920
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923                                                     SelectionDAG &DAG) const {
1924   if (!Subtarget->is64Bit())
1925     // This doesn't have SDLoc associated with it, but is not really the
1926     // same as a Register.
1927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928   return Table;
1929 }
1930
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935                              MCContext &Ctx) const {
1936   // X86-64 uses RIP relative addressing based on the jump table label.
1937   if (Subtarget->isPICStyleRIPRel())
1938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940   // Otherwise, the reference is relative to the PIC base.
1941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942 }
1943
1944 // FIXME: Why this routine is here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947   const TargetRegisterClass *RRC = nullptr;
1948   uint8_t Cost = 1;
1949   switch (VT.SimpleTy) {
1950   default:
1951     return TargetLowering::findRepresentativeClass(VT);
1952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954     break;
1955   case MVT::x86mmx:
1956     RRC = &X86::VR64RegClass;
1957     break;
1958   case MVT::f32: case MVT::f64:
1959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960   case MVT::v4f32: case MVT::v2f64:
1961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962   case MVT::v4f64:
1963     RRC = &X86::VR128RegClass;
1964     break;
1965   }
1966   return std::make_pair(RRC, Cost);
1967 }
1968
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970                                                unsigned &Offset) const {
1971   if (!Subtarget->isTargetLinux())
1972     return false;
1973
1974   if (Subtarget->is64Bit()) {
1975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976     Offset = 0x28;
1977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978       AddressSpace = 256;
1979     else
1980       AddressSpace = 257;
1981   } else {
1982     // %gs:0x14 on i386
1983     Offset = 0x14;
1984     AddressSpace = 256;
1985   }
1986   return true;
1987 }
1988
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990                                             unsigned DestAS) const {
1991   assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993   return SrcAS < 256 && DestAS < 256;
1994 }
1995
1996 //===----------------------------------------------------------------------===//
1997 //               Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
1999
2000 #include "X86GenCallingConv.inc"
2001
2002 bool
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004                                   MachineFunction &MF, bool isVarArg,
2005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2006                         LLVMContext &Context) const {
2007   SmallVector<CCValAssign, 16> RVLocs;
2008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009   return CCInfo.CheckReturn(Outs, RetCC_X86);
2010 }
2011
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014   return ScratchRegs;
2015 }
2016
2017 SDValue
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019                                CallingConv::ID CallConv, bool isVarArg,
2020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2021                                const SmallVectorImpl<SDValue> &OutVals,
2022                                SDLoc dl, SelectionDAG &DAG) const {
2023   MachineFunction &MF = DAG.getMachineFunction();
2024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030   SDValue Flag;
2031   SmallVector<SDValue, 6> RetOps;
2032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033   // Operand #1 = Bytes To Pop
2034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035                    MVT::i16));
2036
2037   // Copy the result values into the output registers.
2038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039     CCValAssign &VA = RVLocs[i];
2040     assert(VA.isRegLoc() && "Can only return in registers!");
2041     SDValue ValToCopy = OutVals[i];
2042     EVT ValVT = ValToCopy.getValueType();
2043
2044     // Promote values to the appropriate types.
2045     if (VA.getLocInfo() == CCValAssign::SExt)
2046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047     else if (VA.getLocInfo() == CCValAssign::ZExt)
2048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049     else if (VA.getLocInfo() == CCValAssign::AExt)
2050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051     else if (VA.getLocInfo() == CCValAssign::BCvt)
2052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055            "Unexpected FP-extend for return value.");
2056
2057     // If this is x86-64, and we disabled SSE, we can't return FP values,
2058     // or SSE or MMX vectors.
2059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062       report_fatal_error("SSE register return with SSE disabled");
2063     }
2064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2065     // llvm-gcc has never done it right and no one has noticed, so this
2066     // should be OK for now.
2067     if (ValVT == MVT::f64 &&
2068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069       report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072     // the RET instruction and handled by the FP Stackifier.
2073     if (VA.getLocReg() == X86::FP0 ||
2074         VA.getLocReg() == X86::FP1) {
2075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076       // change the value to the FP stack register class.
2077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079       RetOps.push_back(ValToCopy);
2080       // Don't emit a copytoreg.
2081       continue;
2082     }
2083
2084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085     // which is returned in RAX / RDX.
2086     if (Subtarget->is64Bit()) {
2087       if (ValVT == MVT::x86mmx) {
2088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091                                   ValToCopy);
2092           // If we don't have SSE2 available, convert to v4f32 so the generated
2093           // register is legal.
2094           if (!Subtarget->hasSSE2())
2095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096         }
2097       }
2098     }
2099
2100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101     Flag = Chain.getValue(1);
2102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103   }
2104
2105   // The x86-64 ABIs require that for returning structs by value we copy
2106   // the sret argument into %rax/%eax (depending on ABI) for the return.
2107   // Win32 requires us to put the sret argument to %eax as well.
2108   // We saved the argument into a virtual register in the entry block,
2109   // so now we copy the value out and into %rax/%eax.
2110   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2111       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2112     MachineFunction &MF = DAG.getMachineFunction();
2113     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2114     unsigned Reg = FuncInfo->getSRetReturnReg();
2115     assert(Reg &&
2116            "SRetReturnReg should have been set in LowerFormalArguments().");
2117     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2118
2119     unsigned RetValReg
2120         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2121           X86::RAX : X86::EAX;
2122     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2123     Flag = Chain.getValue(1);
2124
2125     // RAX/EAX now acts like a return value.
2126     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2127   }
2128
2129   RetOps[0] = Chain;  // Update chain.
2130
2131   // Add the flag if we have it.
2132   if (Flag.getNode())
2133     RetOps.push_back(Flag);
2134
2135   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2136 }
2137
2138 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2139   if (N->getNumValues() != 1)
2140     return false;
2141   if (!N->hasNUsesOfValue(1, 0))
2142     return false;
2143
2144   SDValue TCChain = Chain;
2145   SDNode *Copy = *N->use_begin();
2146   if (Copy->getOpcode() == ISD::CopyToReg) {
2147     // If the copy has a glue operand, we conservatively assume it isn't safe to
2148     // perform a tail call.
2149     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2150       return false;
2151     TCChain = Copy->getOperand(0);
2152   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2153     return false;
2154
2155   bool HasRet = false;
2156   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2157        UI != UE; ++UI) {
2158     if (UI->getOpcode() != X86ISD::RET_FLAG)
2159       return false;
2160     // If we are returning more than one value, we can definitely
2161     // not make a tail call see PR19530
2162     if (UI->getNumOperands() > 4)
2163       return false;
2164     if (UI->getNumOperands() == 4 &&
2165         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2166       return false;
2167     HasRet = true;
2168   }
2169
2170   if (!HasRet)
2171     return false;
2172
2173   Chain = TCChain;
2174   return true;
2175 }
2176
2177 EVT
2178 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2179                                             ISD::NodeType ExtendKind) const {
2180   MVT ReturnMVT;
2181   // TODO: Is this also valid on 32-bit?
2182   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2183     ReturnMVT = MVT::i8;
2184   else
2185     ReturnMVT = MVT::i32;
2186
2187   EVT MinVT = getRegisterType(Context, ReturnMVT);
2188   return VT.bitsLT(MinVT) ? MinVT : VT;
2189 }
2190
2191 /// Lower the result values of a call into the
2192 /// appropriate copies out of appropriate physical registers.
2193 ///
2194 SDValue
2195 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2196                                    CallingConv::ID CallConv, bool isVarArg,
2197                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2198                                    SDLoc dl, SelectionDAG &DAG,
2199                                    SmallVectorImpl<SDValue> &InVals) const {
2200
2201   // Assign locations to each value returned by this call.
2202   SmallVector<CCValAssign, 16> RVLocs;
2203   bool Is64Bit = Subtarget->is64Bit();
2204   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2205                  *DAG.getContext());
2206   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2207
2208   // Copy all of the result registers out of their specified physreg.
2209   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2210     CCValAssign &VA = RVLocs[i];
2211     EVT CopyVT = VA.getValVT();
2212
2213     // If this is x86-64, and we disabled SSE, we can't return FP values
2214     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2215         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2216       report_fatal_error("SSE register return with SSE disabled");
2217     }
2218
2219     // If we prefer to use the value in xmm registers, copy it out as f80 and
2220     // use a truncate to move it from fp stack reg to xmm reg.
2221     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2222         isScalarFPTypeInSSEReg(VA.getValVT()))
2223       CopyVT = MVT::f80;
2224
2225     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2226                                CopyVT, InFlag).getValue(1);
2227     SDValue Val = Chain.getValue(0);
2228
2229     if (CopyVT != VA.getValVT())
2230       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2231                         // This truncation won't change the value.
2232                         DAG.getIntPtrConstant(1));
2233
2234     InFlag = Chain.getValue(2);
2235     InVals.push_back(Val);
2236   }
2237
2238   return Chain;
2239 }
2240
2241 //===----------------------------------------------------------------------===//
2242 //                C & StdCall & Fast Calling Convention implementation
2243 //===----------------------------------------------------------------------===//
2244 //  StdCall calling convention seems to be standard for many Windows' API
2245 //  routines and around. It differs from C calling convention just a little:
2246 //  callee should clean up the stack, not caller. Symbols should be also
2247 //  decorated in some fancy way :) It doesn't support any vector arguments.
2248 //  For info on fast calling convention see Fast Calling Convention (tail call)
2249 //  implementation LowerX86_32FastCCCallTo.
2250
2251 /// CallIsStructReturn - Determines whether a call uses struct return
2252 /// semantics.
2253 enum StructReturnType {
2254   NotStructReturn,
2255   RegStructReturn,
2256   StackStructReturn
2257 };
2258 static StructReturnType
2259 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2260   if (Outs.empty())
2261     return NotStructReturn;
2262
2263   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2264   if (!Flags.isSRet())
2265     return NotStructReturn;
2266   if (Flags.isInReg())
2267     return RegStructReturn;
2268   return StackStructReturn;
2269 }
2270
2271 /// Determines whether a function uses struct return semantics.
2272 static StructReturnType
2273 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2274   if (Ins.empty())
2275     return NotStructReturn;
2276
2277   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2278   if (!Flags.isSRet())
2279     return NotStructReturn;
2280   if (Flags.isInReg())
2281     return RegStructReturn;
2282   return StackStructReturn;
2283 }
2284
2285 /// Make a copy of an aggregate at address specified by "Src" to address
2286 /// "Dst" with size and alignment information specified by the specific
2287 /// parameter attribute. The copy will be passed as a byval function parameter.
2288 static SDValue
2289 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2290                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2291                           SDLoc dl) {
2292   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2293
2294   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2295                        /*isVolatile*/false, /*AlwaysInline=*/true,
2296                        MachinePointerInfo(), MachinePointerInfo());
2297 }
2298
2299 /// Return true if the calling convention is one that
2300 /// supports tail call optimization.
2301 static bool IsTailCallConvention(CallingConv::ID CC) {
2302   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2303           CC == CallingConv::HiPE);
2304 }
2305
2306 /// \brief Return true if the calling convention is a C calling convention.
2307 static bool IsCCallConvention(CallingConv::ID CC) {
2308   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2309           CC == CallingConv::X86_64_SysV);
2310 }
2311
2312 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2313   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2314     return false;
2315
2316   CallSite CS(CI);
2317   CallingConv::ID CalleeCC = CS.getCallingConv();
2318   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2319     return false;
2320
2321   return true;
2322 }
2323
2324 /// Return true if the function is being made into
2325 /// a tailcall target by changing its ABI.
2326 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2327                                    bool GuaranteedTailCallOpt) {
2328   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2329 }
2330
2331 SDValue
2332 X86TargetLowering::LowerMemArgument(SDValue Chain,
2333                                     CallingConv::ID CallConv,
2334                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2335                                     SDLoc dl, SelectionDAG &DAG,
2336                                     const CCValAssign &VA,
2337                                     MachineFrameInfo *MFI,
2338                                     unsigned i) const {
2339   // Create the nodes corresponding to a load from this parameter slot.
2340   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2341   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2342       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2343   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2344   EVT ValVT;
2345
2346   // If value is passed by pointer we have address passed instead of the value
2347   // itself.
2348   if (VA.getLocInfo() == CCValAssign::Indirect)
2349     ValVT = VA.getLocVT();
2350   else
2351     ValVT = VA.getValVT();
2352
2353   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2354   // changed with more analysis.
2355   // In case of tail call optimization mark all arguments mutable. Since they
2356   // could be overwritten by lowering of arguments in case of a tail call.
2357   if (Flags.isByVal()) {
2358     unsigned Bytes = Flags.getByValSize();
2359     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2360     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2361     return DAG.getFrameIndex(FI, getPointerTy());
2362   } else {
2363     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2364                                     VA.getLocMemOffset(), isImmutable);
2365     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2366     return DAG.getLoad(ValVT, dl, Chain, FIN,
2367                        MachinePointerInfo::getFixedStack(FI),
2368                        false, false, false, 0);
2369   }
2370 }
2371
2372 // FIXME: Get this from tablegen.
2373 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2374                                                 const X86Subtarget *Subtarget) {
2375   assert(Subtarget->is64Bit());
2376
2377   if (Subtarget->isCallingConvWin64(CallConv)) {
2378     static const MCPhysReg GPR64ArgRegsWin64[] = {
2379       X86::RCX, X86::RDX, X86::R8,  X86::R9
2380     };
2381     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2382   }
2383
2384   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2385     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2386   };
2387   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2388 }
2389
2390 // FIXME: Get this from tablegen.
2391 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2392                                                 CallingConv::ID CallConv,
2393                                                 const X86Subtarget *Subtarget) {
2394   assert(Subtarget->is64Bit());
2395   if (Subtarget->isCallingConvWin64(CallConv)) {
2396     // The XMM registers which might contain var arg parameters are shadowed
2397     // in their paired GPR.  So we only need to save the GPR to their home
2398     // slots.
2399     // TODO: __vectorcall will change this.
2400     return None;
2401   }
2402
2403   const Function *Fn = MF.getFunction();
2404   bool NoImplicitFloatOps = Fn->getAttributes().
2405       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2406   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2407          "SSE register cannot be used when SSE is disabled!");
2408   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2409       !Subtarget->hasSSE1())
2410     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2411     // registers.
2412     return None;
2413
2414   static const MCPhysReg XMMArgRegs64Bit[] = {
2415     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2416     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2417   };
2418   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2419 }
2420
2421 SDValue
2422 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2423                                         CallingConv::ID CallConv,
2424                                         bool isVarArg,
2425                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2426                                         SDLoc dl,
2427                                         SelectionDAG &DAG,
2428                                         SmallVectorImpl<SDValue> &InVals)
2429                                           const {
2430   MachineFunction &MF = DAG.getMachineFunction();
2431   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2432
2433   const Function* Fn = MF.getFunction();
2434   if (Fn->hasExternalLinkage() &&
2435       Subtarget->isTargetCygMing() &&
2436       Fn->getName() == "main")
2437     FuncInfo->setForceFramePointer(true);
2438
2439   MachineFrameInfo *MFI = MF.getFrameInfo();
2440   bool Is64Bit = Subtarget->is64Bit();
2441   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2442
2443   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2444          "Var args not supported with calling convention fastcc, ghc or hipe");
2445
2446   // Assign locations to all of the incoming arguments.
2447   SmallVector<CCValAssign, 16> ArgLocs;
2448   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2449
2450   // Allocate shadow area for Win64
2451   if (IsWin64)
2452     CCInfo.AllocateStack(32, 8);
2453
2454   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2455
2456   unsigned LastVal = ~0U;
2457   SDValue ArgValue;
2458   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2459     CCValAssign &VA = ArgLocs[i];
2460     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2461     // places.
2462     assert(VA.getValNo() != LastVal &&
2463            "Don't support value assigned to multiple locs yet");
2464     (void)LastVal;
2465     LastVal = VA.getValNo();
2466
2467     if (VA.isRegLoc()) {
2468       EVT RegVT = VA.getLocVT();
2469       const TargetRegisterClass *RC;
2470       if (RegVT == MVT::i32)
2471         RC = &X86::GR32RegClass;
2472       else if (Is64Bit && RegVT == MVT::i64)
2473         RC = &X86::GR64RegClass;
2474       else if (RegVT == MVT::f32)
2475         RC = &X86::FR32RegClass;
2476       else if (RegVT == MVT::f64)
2477         RC = &X86::FR64RegClass;
2478       else if (RegVT.is512BitVector())
2479         RC = &X86::VR512RegClass;
2480       else if (RegVT.is256BitVector())
2481         RC = &X86::VR256RegClass;
2482       else if (RegVT.is128BitVector())
2483         RC = &X86::VR128RegClass;
2484       else if (RegVT == MVT::x86mmx)
2485         RC = &X86::VR64RegClass;
2486       else if (RegVT == MVT::i1)
2487         RC = &X86::VK1RegClass;
2488       else if (RegVT == MVT::v8i1)
2489         RC = &X86::VK8RegClass;
2490       else if (RegVT == MVT::v16i1)
2491         RC = &X86::VK16RegClass;
2492       else if (RegVT == MVT::v32i1)
2493         RC = &X86::VK32RegClass;
2494       else if (RegVT == MVT::v64i1)
2495         RC = &X86::VK64RegClass;
2496       else
2497         llvm_unreachable("Unknown argument type!");
2498
2499       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2500       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2501
2502       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2503       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2504       // right size.
2505       if (VA.getLocInfo() == CCValAssign::SExt)
2506         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2507                                DAG.getValueType(VA.getValVT()));
2508       else if (VA.getLocInfo() == CCValAssign::ZExt)
2509         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2510                                DAG.getValueType(VA.getValVT()));
2511       else if (VA.getLocInfo() == CCValAssign::BCvt)
2512         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2513
2514       if (VA.isExtInLoc()) {
2515         // Handle MMX values passed in XMM regs.
2516         if (RegVT.isVector())
2517           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2518         else
2519           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2520       }
2521     } else {
2522       assert(VA.isMemLoc());
2523       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2524     }
2525
2526     // If value is passed via pointer - do a load.
2527     if (VA.getLocInfo() == CCValAssign::Indirect)
2528       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2529                              MachinePointerInfo(), false, false, false, 0);
2530
2531     InVals.push_back(ArgValue);
2532   }
2533
2534   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2535     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2536       // The x86-64 ABIs require that for returning structs by value we copy
2537       // the sret argument into %rax/%eax (depending on ABI) for the return.
2538       // Win32 requires us to put the sret argument to %eax as well.
2539       // Save the argument into a virtual register so that we can access it
2540       // from the return points.
2541       if (Ins[i].Flags.isSRet()) {
2542         unsigned Reg = FuncInfo->getSRetReturnReg();
2543         if (!Reg) {
2544           MVT PtrTy = getPointerTy();
2545           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2546           FuncInfo->setSRetReturnReg(Reg);
2547         }
2548         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2549         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2550         break;
2551       }
2552     }
2553   }
2554
2555   unsigned StackSize = CCInfo.getNextStackOffset();
2556   // Align stack specially for tail calls.
2557   if (FuncIsMadeTailCallSafe(CallConv,
2558                              MF.getTarget().Options.GuaranteedTailCallOpt))
2559     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2560
2561   // If the function takes variable number of arguments, make a frame index for
2562   // the start of the first vararg value... for expansion of llvm.va_start. We
2563   // can skip this if there are no va_start calls.
2564   if (MFI->hasVAStart() &&
2565       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2566                    CallConv != CallingConv::X86_ThisCall))) {
2567     FuncInfo->setVarArgsFrameIndex(
2568         MFI->CreateFixedObject(1, StackSize, true));
2569   }
2570
2571   // Figure out if XMM registers are in use.
2572   assert(!(MF.getTarget().Options.UseSoftFloat &&
2573            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2574                                             Attribute::NoImplicitFloat)) &&
2575          "SSE register cannot be used when SSE is disabled!");
2576
2577   // 64-bit calling conventions support varargs and register parameters, so we
2578   // have to do extra work to spill them in the prologue.
2579   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2580     // Find the first unallocated argument registers.
2581     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2582     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2583     unsigned NumIntRegs =
2584         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2585     unsigned NumXMMRegs =
2586         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2587     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2588            "SSE register cannot be used when SSE is disabled!");
2589
2590     // Gather all the live in physical registers.
2591     SmallVector<SDValue, 6> LiveGPRs;
2592     SmallVector<SDValue, 8> LiveXMMRegs;
2593     SDValue ALVal;
2594     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2595       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2596       LiveGPRs.push_back(
2597           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2598     }
2599     if (!ArgXMMs.empty()) {
2600       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2601       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2602       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2603         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2604         LiveXMMRegs.push_back(
2605             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2606       }
2607     }
2608
2609     if (IsWin64) {
2610       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2611       // Get to the caller-allocated home save location.  Add 8 to account
2612       // for the return address.
2613       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2614       FuncInfo->setRegSaveFrameIndex(
2615           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2616       // Fixup to set vararg frame on shadow area (4 x i64).
2617       if (NumIntRegs < 4)
2618         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2619     } else {
2620       // For X86-64, if there are vararg parameters that are passed via
2621       // registers, then we must store them to their spots on the stack so
2622       // they may be loaded by deferencing the result of va_next.
2623       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2624       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2625       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2626           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2627     }
2628
2629     // Store the integer parameter registers.
2630     SmallVector<SDValue, 8> MemOps;
2631     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2632                                       getPointerTy());
2633     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2634     for (SDValue Val : LiveGPRs) {
2635       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2636                                 DAG.getIntPtrConstant(Offset));
2637       SDValue Store =
2638         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2639                      MachinePointerInfo::getFixedStack(
2640                        FuncInfo->getRegSaveFrameIndex(), Offset),
2641                      false, false, 0);
2642       MemOps.push_back(Store);
2643       Offset += 8;
2644     }
2645
2646     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2647       // Now store the XMM (fp + vector) parameter registers.
2648       SmallVector<SDValue, 12> SaveXMMOps;
2649       SaveXMMOps.push_back(Chain);
2650       SaveXMMOps.push_back(ALVal);
2651       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2652                              FuncInfo->getRegSaveFrameIndex()));
2653       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2654                              FuncInfo->getVarArgsFPOffset()));
2655       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2656                         LiveXMMRegs.end());
2657       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2658                                    MVT::Other, SaveXMMOps));
2659     }
2660
2661     if (!MemOps.empty())
2662       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2663   }
2664
2665   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2666     // Find the largest legal vector type.
2667     MVT VecVT = MVT::Other;
2668     // FIXME: Only some x86_32 calling conventions support AVX512.
2669     if (Subtarget->hasAVX512() &&
2670         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2671                      CallConv == CallingConv::Intel_OCL_BI)))
2672       VecVT = MVT::v16f32;
2673     else if (Subtarget->hasAVX())
2674       VecVT = MVT::v8f32;
2675     else if (Subtarget->hasSSE2())
2676       VecVT = MVT::v4f32;
2677
2678     // We forward some GPRs and some vector types.
2679     SmallVector<MVT, 2> RegParmTypes;
2680     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2681     RegParmTypes.push_back(IntVT);
2682     if (VecVT != MVT::Other)
2683       RegParmTypes.push_back(VecVT);
2684
2685     // Compute the set of forwarded registers. The rest are scratch.
2686     SmallVectorImpl<ForwardedRegister> &Forwards =
2687         FuncInfo->getForwardedMustTailRegParms();
2688     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2689
2690     // Conservatively forward AL on x86_64, since it might be used for varargs.
2691     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2692       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2693       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2694     }
2695
2696     // Copy all forwards from physical to virtual registers.
2697     for (ForwardedRegister &F : Forwards) {
2698       // FIXME: Can we use a less constrained schedule?
2699       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2700       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2701       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2702     }
2703   }
2704
2705   // Some CCs need callee pop.
2706   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2707                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2708     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2709   } else {
2710     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2711     // If this is an sret function, the return should pop the hidden pointer.
2712     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2713         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2714         argsAreStructReturn(Ins) == StackStructReturn)
2715       FuncInfo->setBytesToPopOnReturn(4);
2716   }
2717
2718   if (!Is64Bit) {
2719     // RegSaveFrameIndex is X86-64 only.
2720     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2721     if (CallConv == CallingConv::X86_FastCall ||
2722         CallConv == CallingConv::X86_ThisCall)
2723       // fastcc functions can't have varargs.
2724       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2725   }
2726
2727   FuncInfo->setArgumentStackSize(StackSize);
2728
2729   return Chain;
2730 }
2731
2732 SDValue
2733 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2734                                     SDValue StackPtr, SDValue Arg,
2735                                     SDLoc dl, SelectionDAG &DAG,
2736                                     const CCValAssign &VA,
2737                                     ISD::ArgFlagsTy Flags) const {
2738   unsigned LocMemOffset = VA.getLocMemOffset();
2739   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2740   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2741   if (Flags.isByVal())
2742     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2743
2744   return DAG.getStore(Chain, dl, Arg, PtrOff,
2745                       MachinePointerInfo::getStack(LocMemOffset),
2746                       false, false, 0);
2747 }
2748
2749 /// Emit a load of return address if tail call
2750 /// optimization is performed and it is required.
2751 SDValue
2752 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2753                                            SDValue &OutRetAddr, SDValue Chain,
2754                                            bool IsTailCall, bool Is64Bit,
2755                                            int FPDiff, SDLoc dl) const {
2756   // Adjust the Return address stack slot.
2757   EVT VT = getPointerTy();
2758   OutRetAddr = getReturnAddressFrameIndex(DAG);
2759
2760   // Load the "old" Return address.
2761   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2762                            false, false, false, 0);
2763   return SDValue(OutRetAddr.getNode(), 1);
2764 }
2765
2766 /// Emit a store of the return address if tail call
2767 /// optimization is performed and it is required (FPDiff!=0).
2768 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2769                                         SDValue Chain, SDValue RetAddrFrIdx,
2770                                         EVT PtrVT, unsigned SlotSize,
2771                                         int FPDiff, SDLoc dl) {
2772   // Store the return address to the appropriate stack slot.
2773   if (!FPDiff) return Chain;
2774   // Calculate the new stack slot for the return address.
2775   int NewReturnAddrFI =
2776     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2777                                          false);
2778   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2779   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2780                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2781                        false, false, 0);
2782   return Chain;
2783 }
2784
2785 SDValue
2786 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2787                              SmallVectorImpl<SDValue> &InVals) const {
       // Lower the outgoing call described by CLI into SelectionDAG nodes.
       //
       // In order, this function: decides whether the call is a tail call (and, if
       // so, whether it is a sibcall), assigns argument locations with CC_X86,
       // opens the call sequence, materializes register and stack arguments,
       // rewrites the callee operand, and finally emits either X86ISD::TC_RETURN
       // (tail calls) or X86ISD::CALL followed by CALLSEQ_END and LowerCallResult.
       //
       // CLI.IsTailCall is bound by reference and may be updated here. For tail
       // calls the TC_RETURN node is returned; otherwise the result is whatever
       // LowerCallResult returns.
2788   SelectionDAG &DAG                     = CLI.DAG;
2789   SDLoc &dl                             = CLI.DL;
2790   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2791   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2792   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2793   SDValue Chain                         = CLI.Chain;
2794   SDValue Callee                        = CLI.Callee;
2795   CallingConv::ID CallConv              = CLI.CallConv;
2796   bool &isTailCall                      = CLI.IsTailCall;
2797   bool isVarArg                         = CLI.IsVarArg;
2798
2799   MachineFunction &MF = DAG.getMachineFunction();
2800   bool Is64Bit        = Subtarget->is64Bit();
2801   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2802   StructReturnType SR = callIsStructReturn(Outs);
2803   bool IsSibcall      = false;
2804   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2805
2806   if (MF.getTarget().Options.DisableTailCalls)
2807     isTailCall = false;
2808
       // Note: musttail overrides the DisableTailCalls option handled just above.
2809   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2810   if (IsMustTail) {
2811     // Force this to be a tail call.  The verifier rules are enough to ensure
2812     // that we can lower this successfully without moving the return address
2813     // around.
2814     isTailCall = true;
2815   } else if (isTailCall) {
2816     // Check if it's really possible to do a tail call.
2817     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2818                     isVarArg, SR != NotStructReturn,
2819                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2820                     Outs, OutVals, Ins, DAG);
2821
2822     // Sibcalls are automatically detected tailcalls which do not require
2823     // ABI changes.
2824     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2825       IsSibcall = true;
2826
2827     if (isTailCall)
2828       ++NumTailCalls;
2829   }
2830
2831   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2832          "Var args not supported with calling convention fastcc, ghc or hipe");
2833
2834   // Analyze operands of the call, assigning locations to each operand.
2835   SmallVector<CCValAssign, 16> ArgLocs;
2836   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2837
2838   // Allocate shadow area for Win64
2839   if (IsWin64)
2840     CCInfo.AllocateStack(32, 8);
2841
2842   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2843
2844   // Get a count of how many bytes are to be pushed on the stack.
2845   unsigned NumBytes = CCInfo.getNextStackOffset();
2846   if (IsSibcall)
2847     // This is a sibcall. The memory operands are available in caller's
2848     // own caller's stack.
2849     NumBytes = 0;
2850   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2851            IsTailCallConvention(CallConv))
2852     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2853
       // FPDiff stays 0 for normal calls, sibcalls and musttail calls; only
       // the remaining kind of tail call computes a value for it below.
2854   int FPDiff = 0;
2855   if (isTailCall && !IsSibcall && !IsMustTail) {
2856     // Lower arguments at fp - stackoffset + fpdiff.
2857     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2858
2859     FPDiff = NumBytesCallerPushed - NumBytes;
2860
2861     // Set the delta of movement of the returnaddr stackslot, but only when
2862     // FPDiff is smaller than the delta recorded so far.
2863     if (FPDiff < X86Info->getTCReturnAddrDelta())
2864       X86Info->setTCReturnAddrDelta(FPDiff);
2865   }
2866
       // NumBytesToPush feeds CALLSEQ_START and NumBytesToPop feeds CALLSEQ_END;
       // they only differ when an inalloca argument is present (see below).
2867   unsigned NumBytesToPush = NumBytes;
2868   unsigned NumBytesToPop = NumBytes;
2869
2870   // If we have an inalloca argument, all stack space has already been allocated
2871   // for us and will be right at the top of the stack.  We don't support
2872   // multiple arguments passed in memory when using inalloca.
2873   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2874     NumBytesToPush = 0;
2875     if (!ArgLocs.back().isMemLoc())
2876       report_fatal_error("cannot use inalloca attribute on a register "
2877                          "parameter");
2878     if (ArgLocs.back().getLocMemOffset() != 0)
2879       report_fatal_error("any parameter with the inalloca attribute must be "
2880                          "the only memory argument");
2881   }
2882
       // Sibcalls do not open a call sequence (and do not close one either).
2883   if (!IsSibcall)
2884     Chain = DAG.getCALLSEQ_START(
2885         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2886
2887   SDValue RetAddrFrIdx;
2888   // Load return address for tail calls.
2889   if (isTailCall && FPDiff)
2890     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2891                                     Is64Bit, FPDiff, dl);
2892
2893   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2894   SmallVector<SDValue, 8> MemOpChains;
2895   SDValue StackPtr;
2896
2897   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2898   // of tail call optimization arguments are handled later.
2899   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2900   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2901     // Skip inalloca arguments, they have already been written.
2902     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2903     if (Flags.isInAlloca())
2904       continue;
2905
2906     CCValAssign &VA = ArgLocs[i];
2907     EVT RegVT = VA.getLocVT();
2908     SDValue Arg = OutVals[i];
2909     bool isByVal = Flags.isByVal();
2910
2911     // Promote the value if needed.
2912     switch (VA.getLocInfo()) {
2913     default: llvm_unreachable("Unknown loc info!");
2914     case CCValAssign::Full: break;
2915     case CCValAssign::SExt:
2916       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2917       break;
2918     case CCValAssign::ZExt:
2919       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2920       break;
2921     case CCValAssign::AExt:
2922       if (RegVT.is128BitVector()) {
2923         // Special case: passing MMX values in XMM registers.
2924         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2925         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2926         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2927       } else
2928         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2929       break;
2930     case CCValAssign::BCvt:
2931       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2932       break;
2933     case CCValAssign::Indirect: {
2934       // Store the argument.
             // The value is spilled to a stack temporary and the temporary's
             // address is what gets passed from here on.
2935       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2936       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2937       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2938                            MachinePointerInfo::getFixedStack(FI),
2939                            false, false, 0);
2940       Arg = SpillSlot;
2941       break;
2942     }
2943     }
2944
2945     if (VA.isRegLoc()) {
2946       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2947       if (isVarArg && IsWin64) {
2948         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2949         // shadow reg if callee is a varargs function.
2950         unsigned ShadowReg = 0;
2951         switch (VA.getLocReg()) {
2952         case X86::XMM0: ShadowReg = X86::RCX; break;
2953         case X86::XMM1: ShadowReg = X86::RDX; break;
2954         case X86::XMM2: ShadowReg = X86::R8; break;
2955         case X86::XMM3: ShadowReg = X86::R9; break;
2956         }
2957         if (ShadowReg)
2958           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2959       }
2960     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Memory arguments of normal calls are written here.  For non-sibling
           // tail calls only byval arguments are written here; the tail call
           // loop further down handles every memory argument of such calls.
2961       assert(VA.isMemLoc());
2962       if (!StackPtr.getNode())
2963         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2964                                       getPointerTy());
2965       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2966                                              dl, DAG, VA, Flags));
2967     }
2968   }
2969
2970   if (!MemOpChains.empty())
2971     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2972
2973   if (Subtarget->isPICStyleGOT()) {
2974     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2975     // GOT pointer.
2976     if (!isTailCall) {
2977       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2978                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2979     } else {
2980       // If we are tail calling and generating PIC/GOT style code load the
2981       // address of the callee into ECX. The value in ecx is used as target of
2982       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2983       // for tail calls on PIC/GOT architectures. Normally we would just put the
2984       // address of GOT into ebx and then call target@PLT. But for tail calls
2985       // ebx would be restored (since ebx is callee saved) before jumping to the
2986       // target@PLT.
2987
2988       // Note: The actual moving to ECX is done further down.
2989       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2990       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2991           !G->getGlobal()->hasProtectedVisibility())
2992         Callee = LowerGlobalAddress(Callee, DAG);
2993       else if (isa<ExternalSymbolSDNode>(Callee))
2994         Callee = LowerExternalSymbol(Callee, DAG);
2995     }
2996   }
2997
2998   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2999     // From AMD64 ABI document:
3000     // For calls that may call functions that use varargs or stdargs
3001     // (prototype-less calls or calls to functions containing ellipsis (...) in
3002     // the declaration) %al is used as hidden argument to specify the number
3003     // of SSE registers used. The contents of %al do not need to match exactly
3004     // the number of registers, but must be an upper bound on the number of
3005     // SSE registers used and is in the range 0 - 8 inclusive.
3006
3007     // Count the number of XMM registers allocated.
3008     static const MCPhysReg XMMArgRegs[] = {
3009       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3010       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3011     };
3012     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3013     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3014            && "SSE registers cannot be used when SSE is disabled");
3015
3016     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3017                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3018   }
3019
       // Vararg musttail calls re-pass every register recorded in the function
       // info's forwarded-register list, copying each saved virtual register
       // back into its physical register.
3020   if (isVarArg && IsMustTail) {
3021     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3022     for (const auto &F : Forwards) {
3023       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3024       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3025     }
3026   }
3027
3028   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3029   // don't need this because the eligibility check rejects calls that require
3030   // shuffling arguments passed in memory.
3031   if (!IsSibcall && isTailCall) {
3032     // Force all the incoming stack arguments to be loaded from the stack
3033     // before any new outgoing arguments are stored to the stack, because the
3034     // outgoing stack slots may alias the incoming argument stack slots, and
3035     // the alias isn't otherwise explicit. This is slightly more conservative
3036     // than necessary, because it means that each store effectively depends
3037     // on every argument instead of just those arguments it would clobber.
3038     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3039
3040     SmallVector<SDValue, 8> MemOpChains2;
3041     SDValue FIN;
3042     int FI = 0;
3043     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3044       CCValAssign &VA = ArgLocs[i];
3045       if (VA.isRegLoc())
3046         continue;
3047       assert(VA.isMemLoc());
3048       SDValue Arg = OutVals[i];
3049       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3050       // Skip inalloca arguments.  They don't require any work.
3051       if (Flags.isInAlloca())
3052         continue;
3053       // Create frame index.
             // The slot is a fixed object at the argument's offset shifted by
             // FPDiff, sized to the location type rounded up to whole bytes.
3054       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3055       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3056       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3057       FIN = DAG.getFrameIndex(FI, getPointerTy());
3058
3059       if (Flags.isByVal()) {
3060         // Copy relative to framepointer.
               // The copy source is StackPtr plus the argument's memory offset,
               // i.e. the location the first argument loop wrote the byval to.
3061         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3062         if (!StackPtr.getNode())
3063           StackPtr = DAG.getCopyFromReg(Chain, dl,
3064                                         RegInfo->getStackRegister(),
3065                                         getPointerTy());
3066         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3067
3068         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3069                                                          ArgChain,
3070                                                          Flags, DAG, dl));
3071       } else {
3072         // Store relative to framepointer.
3073         MemOpChains2.push_back(
3074           DAG.getStore(ArgChain, dl, Arg, FIN,
3075                        MachinePointerInfo::getFixedStack(FI),
3076                        false, false, 0));
3077       }
3078     }
3079
3080     if (!MemOpChains2.empty())
3081       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3082
3083     // Store the return address to the appropriate stack slot.
         // (EmitTailCallStoreRetAddr returns Chain unchanged when FPDiff is 0.)
3084     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3085                                      getPointerTy(), RegInfo->getSlotSize(),
3086                                      FPDiff, dl);
3087   }
3088
3089   // Build a sequence of copy-to-reg nodes chained together with token chain
3090   // and flag operands which copy the outgoing args into registers.
3091   SDValue InFlag;
3092   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3093     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3094                              RegsToPass[i].second, InFlag);
3095     InFlag = Chain.getValue(1);
3096   }
3097
       // Rewrite the callee operand.  In the large code model the callee is
       // deliberately left untouched by this chain of branches.
3098   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3099     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3100     // In the 64-bit large code model, we have to make all calls
3101     // through a register, since the call instruction's 32-bit
3102     // pc-relative offset may not be large enough to hold the whole
3103     // address.
3104   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3105     // If the callee is a GlobalAddress node (quite common, every direct call
3106     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3107     // it.
3108     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3109
3110     // We should use extra load for direct calls to dllimported functions in
3111     // non-JIT mode.
         // dllimport callees are therefore left as plain GlobalAddress nodes.
3112     const GlobalValue *GV = G->getGlobal();
3113     if (!GV->hasDLLImportStorageClass()) {
3114       unsigned char OpFlags = 0;
3115       bool ExtraLoad = false;
3116       unsigned WrapperKind = ISD::DELETED_NODE;
3117
3118       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3119       // external symbols must go through the PLT in PIC mode.  If the symbol
3120       // has hidden or protected visibility, or if it is static or local, then
3121       // we don't need to use the PLT - we can directly call it.
3122       if (Subtarget->isTargetELF() &&
3123           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3124           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3125         OpFlags = X86II::MO_PLT;
3126       } else if (Subtarget->isPICStyleStubAny() &&
3127                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3128                  (!Subtarget->getTargetTriple().isMacOSX() ||
3129                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3130         // PC-relative references to external symbols should go through $stub,
3131         // unless we're building with the leopard linker or later, which
3132         // automatically synthesizes these stubs.
3133         OpFlags = X86II::MO_DARWIN_STUB;
3134       } else if (Subtarget->isPICStyleRIPRel() &&
3135                  isa<Function>(GV) &&
3136                  cast<Function>(GV)->getAttributes().
3137                    hasAttribute(AttributeSet::FunctionIndex,
3138                                 Attribute::NonLazyBind)) {
3139         // If the function is marked as non-lazy, generate an indirect call
3140         // which loads from the GOT directly. This avoids runtime overhead
3141         // at the cost of eager binding (and one extra byte of encoding).
3142         OpFlags = X86II::MO_GOTPCREL;
3143         WrapperKind = X86ISD::WrapperRIP;
3144         ExtraLoad = true;
3145       }
3146
3147       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3148                                           G->getOffset(), OpFlags);
3149
3150       // Add a wrapper if needed.
           // WrapperKind is only ever set to X86ISD::WrapperRIP above, which is
           // the opcode hard-coded in the node created here.
3151       if (WrapperKind != ISD::DELETED_NODE)
3152         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3153       // Add extra indirection if needed.
3154       if (ExtraLoad)
3155         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3156                              MachinePointerInfo::getGOT(),
3157                              false, false, false, 0);
3158     }
3159   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3160     unsigned char OpFlags = 0;
3161
3162     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3163     // external symbols should go through the PLT.
3164     if (Subtarget->isTargetELF() &&
3165         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3166       OpFlags = X86II::MO_PLT;
3167     } else if (Subtarget->isPICStyleStubAny() &&
3168                (!Subtarget->getTargetTriple().isMacOSX() ||
3169                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3170       // PC-relative references to external symbols should go through $stub,
3171       // unless we're building with the leopard linker or later, which
3172       // automatically synthesizes these stubs.
3173       OpFlags = X86II::MO_DARWIN_STUB;
3174     }
3175
3176     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3177                                          OpFlags);
3178   } else if (Subtarget->isTarget64BitILP32() &&
3179              Callee->getValueType(0) == MVT::i32) {
3180     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3181     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3182   }
3183
3184   // Returns a chain & a flag for retval copy to use.
3185   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3186   SmallVector<SDValue, 8> Ops;
3187
       // Non-sibling tail calls close their call sequence here, before the
       // TC_RETURN node is built; the callee-pop operand is 0 in this case.
3188   if (!IsSibcall && isTailCall) {
3189     Chain = DAG.getCALLSEQ_END(Chain,
3190                                DAG.getIntPtrConstant(NumBytesToPop, true),
3191                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3192     InFlag = Chain.getValue(1);
3193   }
3194
       // Operand order: chain, callee, [FPDiff for tail calls], argument
       // registers, register mask, [glue].
3195   Ops.push_back(Chain);
3196   Ops.push_back(Callee);
3197
3198   if (isTailCall)
3199     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3200
3201   // Add argument registers to the end of the list so that they are known live
3202   // into the call.
3203   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3204     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3205                                   RegsToPass[i].second.getValueType()));
3206
3207   // Add a register mask operand representing the call-preserved registers.
3208   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3209   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3210   assert(Mask && "Missing call preserved mask for calling convention");
3211   Ops.push_back(DAG.getRegisterMask(Mask));
3212
3213   if (InFlag.getNode())
3214     Ops.push_back(InFlag);
3215
3216   if (isTailCall) {
3217     // We used to do:
3218     //// If this is the first return lowered for this function, add the regs
3219     //// to the liveout set for the function.
3220     // This isn't right, although it's probably harmless on x86; liveouts
3221     // should be computed from returns not tail calls.  Consider a void
3222     // function making a tail call to a function returning int.
3223     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3224   }
3225
3226   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3227   InFlag = Chain.getValue(1);
3228
3229   // Create the CALLSEQ_END node.
3230   unsigned NumBytesForCalleeToPop;
3231   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3232                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3233     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3234   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3235            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3236            SR == StackStructReturn)
3237     // If this is a call to a struct-return function, the callee
3238     // pops the hidden struct pointer, so we have to push it back.
3239     // This is common for Darwin/X86, Linux & Mingw32 targets.
3240     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3241     NumBytesForCalleeToPop = 4;
3242   else
3243     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3244
3245   // Returns a flag for retval copy to use.
3246   if (!IsSibcall) {
3247     Chain = DAG.getCALLSEQ_END(Chain,
3248                                DAG.getIntPtrConstant(NumBytesToPop, true),
3249                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3250                                                      true),
3251                                InFlag, dl);
3252     InFlag = Chain.getValue(1);
3253   }
3254
3255   // Handle result values, copying them out of physregs into vregs that we
3256   // return.
3257   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3258                          Ins, dl, DAG, InVals);
3259 }
3260
3261 //===----------------------------------------------------------------------===//
3262 //                Fast Calling Convention (tail call) implementation
3263 //===----------------------------------------------------------------------===//
3264
3265 //  Like stdcall, the callee cleans up the arguments, except that ECX is
3266 //  reserved for storing the tail called function address. Only 2 registers are
3267 //  free for argument passing (inreg). Tail call optimization is performed
3268 //  provided:
3269 //                * tailcallopt is enabled
3270 //                * caller/callee are fastcc
3271 //  On X86_64 architecture with GOT-style position independent code only local
3272 //  (within module) calls are supported at the moment.
3273 //  To keep the stack aligned according to platform abi the function
3274 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3275 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3276 //  If a tail called function callee has more arguments than the caller the
3277 //  caller needs to make sure that there is room to move the RETADDR to. This is
3278 //  achieved by reserving an area the size of the argument delta right after the
3279 //  original RETADDR, but before the saved framepointer or the spilled registers
3280 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3281 //  stack layout:
3282 //    arg1
3283 //    arg2
3284 //    RETADDR
3285 //    [ new RETADDR
3286 //      move area ]
3287 //    (possible EBP)
3288 //    ESI
3289 //    EDI
3290 //    local1 ..
3291
3292 /// GetAlignedArgumentStackSize - Round StackSize up to the nearest value of the
3293 /// form StackAlignment * n + (StackAlignment - SlotSize), e.g. 16n + 12.
3294 unsigned
3295 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3296                                                SelectionDAG& DAG) const {
3297   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3298   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3299   unsigned StackAlignment = TFI.getStackAlignment();
3300   uint64_t AlignMask = StackAlignment - 1;  // Presumes a power-of-two alignment.
3301   int64_t Offset = StackSize;
3302   unsigned SlotSize = RegInfo->getSlotSize();
3303   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3304     // Low bits do not exceed (StackAlignment - SlotSize): add the difference.
3305     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3306   } else {
3307     // Mask out the low bits, add StackAlignment once plus the target remainder.
3308     Offset = ((~AlignMask) & Offset) + StackAlignment +
3309       (StackAlignment-SlotSize);
3310   }
3311   return Offset;
3312 }
3313
3314 /// MatchingStackOffset - Return true if outgoing call argument Arg already
3315 /// resides in one of the caller's fixed stack objects whose offset equals
3316 /// Offset and whose size equals the argument size (the byval size for byval).
3317 static
3318 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3319                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3320                          const X86InstrInfo *TII) {
3321   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3322   int FI = INT_MAX;  // Sentinel: no frame index identified yet.
3323   if (Arg.getOpcode() == ISD::CopyFromReg) {
3324     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3325     if (!TargetRegisterInfo::isVirtualRegister(VR))
3326       return false;
3327     MachineInstr *Def = MRI->getVRegDef(VR);
3328     if (!Def)
3329       return false;
3330     if (!Flags.isByVal()) {
3331       if (!TII->isLoadFromStackSlot(Def, FI))  // Sets FI on success.
3332         return false;
3333     } else {
3334       unsigned Opcode = Def->getOpcode();
3335       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3336            Opcode == X86::LEA64_32r) &&
3337           Def->getOperand(1).isFI()) {
3338         FI = Def->getOperand(1).getIndex();
3339         Bytes = Flags.getByValSize();
3340       } else
3341         return false;
3342     }
3343   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3344     if (Flags.isByVal())
3345       // The ByVal argument was passed in as a pointer but is now being
3346       // dereferenced, e.g.
3347       // define @foo(%struct.X* %A) {
3348       //   tail call @bar(%struct.X* byval %A)
3349       // }
3350       return false;
3351     SDValue Ptr = Ld->getBasePtr();
3352     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3353     if (!FINode)
3354       return false;
3355     FI = FINode->getIndex();
3356   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3357     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3358     FI = FINode->getIndex();
3359     Bytes = Flags.getByValSize();
3360   } else
3361     return false;
3362
3363   assert(FI != INT_MAX);  // Every path reaching here must have set FI.
3364   if (!MFI->isFixedObjectIndex(FI))
3365     return false;
3366   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3367 }
3368
3369 /// IsEligibleForTailCallOptimization - Return true if the call to Callee can be
3370 /// lowered as a tail call: a guaranteed one when GuaranteedTailCallOpt is set,
3371 /// or otherwise a sibcall that requires no ABI changes.
3372 bool
3373 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3374                                                      CallingConv::ID CalleeCC,
3375                                                      bool isVarArg,
3376                                                      bool isCalleeStructRet,
3377                                                      bool isCallerStructRet,
3378                                                      Type *RetTy,
3379                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
3380                                     const SmallVectorImpl<SDValue> &OutVals,
3381                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3382                                                      SelectionDAG &DAG) const {
3383   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3384     return false;
3385
3386   // Fetch the caller; its return type and calling convention are checked below.
3387   const MachineFunction &MF = DAG.getMachineFunction();
3388   const Function *CallerF = MF.getFunction();
3389
3390   // If the function return type is x86_fp80 and the callee return type is not,
3391   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3392   // perform a tailcall optimization here.
3393   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3394     return false;
3395
3396   CallingConv::ID CallerCC = CallerF->getCallingConv();
3397   bool CCMatch = CallerCC == CalleeCC;
3398   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3399   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3400
3401   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {  // -tailcallopt mode.
3402     if (IsTailCallConvention(CalleeCC) && CCMatch)  // Matching tail-call CCs only.
3403       return true;
3404     return false;
3405   }
3406
3407   // Look for obvious safe cases to perform tail call optimization that do not
3408   // require ABI changes. This is what gcc calls sibcall.
3409
3410   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3411   // emit a special epilogue.
3412   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3413   if (RegInfo->needsStackRealignment(MF))
3414     return false;
3415
3416   // Also avoid sibcall optimization if either caller or callee uses struct
3417   // return semantics.
3418   if (isCalleeStructRet || isCallerStructRet)
3419     return false;
3420
3421   // An stdcall/thiscall caller is expected to clean up its arguments; the
3422   // callee isn't going to do that.
3423   // FIXME: this is more restrictive than needed. We could produce a tailcall
3424   // when the stack adjustment matches. For example, with a thiscall that takes
3425   // only one argument.
3426   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3427                    CallerCC == CallingConv::X86_ThisCall))
3428     return false;
3429
3430   // Do not sibcall optimize vararg calls unless all arguments are passed via
3431   // registers.
3432   if (isVarArg && !Outs.empty()) {
3433
3434     // Optimizing for varargs on Win64 is unlikely to be safe without
3435     // additional testing.
3436     if (IsCalleeWin64 || IsCallerWin64)
3437       return false;
3438
3439     SmallVector<CCValAssign, 16> ArgLocs;
3440     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3441                    *DAG.getContext());
3442
3443     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3444     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3445       if (!ArgLocs[i].isRegLoc())
3446         return false;
3447   }
3448
3449   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3450   // stack.  Therefore, if the result is not used it is not safe to optimize
3451   // this into a sibcall.
3452   bool Unused = false;
3453   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3454     if (!Ins[i].Used) {
3455       Unused = true;
3456       break;
3457     }
3458   }
3459   if (Unused) {
3460     SmallVector<CCValAssign, 16> RVLocs;
3461     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3462                    *DAG.getContext());
3463     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3464     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3465       CCValAssign &VA = RVLocs[i];
3466       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3467         return false;
3468     }
3469   }
3470
3471   // If the calling conventions do not match, then we'd better make sure the
3472   // results are returned in the same way as what the caller expects.
3473   if (!CCMatch) {
3474     SmallVector<CCValAssign, 16> RVLocs1;  // Result locations under CalleeCC.
3475     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3476                     *DAG.getContext());
3477     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3478
3479     SmallVector<CCValAssign, 16> RVLocs2;  // Result locations under CallerCC.
3480     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3481                     *DAG.getContext());
3482     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3483
3484     if (RVLocs1.size() != RVLocs2.size())
3485       return false;
3486     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3487       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3488         return false;
3489       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3490         return false;
3491       if (RVLocs1[i].isRegLoc()) {
3492         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3493           return false;
3494       } else {
3495         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3496           return false;
3497       }
3498     }
3499   }
3500
3501   // The argument checks below are only needed when the callee takes
3502   // arguments; otherwise the call is eligible at this point.
3503   if (!Outs.empty()) {
3504     // Check if stack adjustment is needed. For now, do not do this if any
3505     // argument is passed on the stack.
3506     SmallVector<CCValAssign, 16> ArgLocs;
3507     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3508                    *DAG.getContext());
3509
3510     // Allocate shadow area for Win64
3511     if (IsCalleeWin64)
3512       CCInfo.AllocateStack(32, 8);
3513
3514     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3515     if (CCInfo.getNextStackOffset()) {
3516       MachineFunction &MF = DAG.getMachineFunction();
3517       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3518         return false;
3519
3520       // Check if the arguments are already laid out in the right way as
3521       // the caller's fixed stack objects.
3522       MachineFrameInfo *MFI = MF.getFrameInfo();
3523       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3524       const X86InstrInfo *TII = Subtarget->getInstrInfo();
3525       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3526         CCValAssign &VA = ArgLocs[i];
3527         SDValue Arg = OutVals[i];
3528         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3529         if (VA.getLocInfo() == CCValAssign::Indirect)
3530           return false;
3531         if (!VA.isRegLoc()) {
3532           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3533                                    MFI, MRI, TII))
3534             return false;
3535         }
3536       }
3537     }
3538
3539     // If the tailcall address may be in a register, then make sure it's
3540     // possible to register allocate for it. In 32-bit, the call address can
3541     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3542     // callee-saved registers are restored. These happen to be the same
3543     // registers used to pass 'inreg' arguments so watch out for those.
3544     if (!Subtarget->is64Bit() &&
3545         ((!isa<GlobalAddressSDNode>(Callee) &&
3546           !isa<ExternalSymbolSDNode>(Callee)) ||
3547          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3548       unsigned NumInRegs = 0;
3549       // In PIC we need an extra register to formulate the address computation
3550       // for the callee.
3551       unsigned MaxInRegs =
3552         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3553
3554       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3555         CCValAssign &VA = ArgLocs[i];
3556         if (!VA.isRegLoc())
3557           continue;
3558         unsigned Reg = VA.getLocReg();
3559         switch (Reg) {
3560         default: break;
3561         case X86::EAX: case X86::EDX: case X86::ECX:
3562           if (++NumInRegs == MaxInRegs)  // No register left for the call address.
3563             return false;
3564           break;
3565         }
3566       }
3567     }
3568   }
3569
3570   return true;
3571 }
3572
3573 FastISel *
3574 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3575                                   const TargetLibraryInfo *libInfo) const {
3576   return X86::createFastISel(funcInfo, libInfo);  // Thin wrapper; just delegates.
3577 }
3578
3579 //===----------------------------------------------------------------------===//
3580 //                           Other Lowering Hooks
3581 //===----------------------------------------------------------------------===//
3582
3583 static bool MayFoldLoad(SDValue Op) {  // True if Op is a single-use normal load.
3584   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3585 }
3586
3587 static bool MayFoldIntoStore(SDValue Op) {  // True if Op's sole user is a normal store.
3588   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3589 }
3590
3591 static bool isTargetShuffle(unsigned Opcode) {  // True for the X86ISD opcodes listed below.
3592   switch(Opcode) {
3593   default: return false;  // Anything not listed is not treated as a target shuffle.
3594   case X86ISD::BLENDI:
3595   case X86ISD::PSHUFB:
3596   case X86ISD::PSHUFD:
3597   case X86ISD::PSHUFHW:
3598   case X86ISD::PSHUFLW:
3599   case X86ISD::SHUFP:
3600   case X86ISD::PALIGNR:
3601   case X86ISD::MOVLHPS:
3602   case X86ISD::MOVLHPD:
3603   case X86ISD::MOVHLPS:
3604   case X86ISD::MOVLPS:
3605   case X86ISD::MOVLPD:
3606   case X86ISD::MOVSHDUP:
3607   case X86ISD::MOVSLDUP:
3608   case X86ISD::MOVDDUP:
3609   case X86ISD::MOVSS:
3610   case X86ISD::MOVSD:
3611   case X86ISD::UNPCKL:
3612   case X86ISD::UNPCKH:
3613   case X86ISD::VPERMILPI:
3614   case X86ISD::VPERM2X128:
3615   case X86ISD::VPERMI:
3616     return true;
3617   }
3618 }
3619
3620 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3621                                     SDValue V1, SelectionDAG &DAG) {
3622   switch(Opc) {  // Overload for one-input shuffles without an immediate.
3623   default: llvm_unreachable("Unknown x86 shuffle node");
3624   case X86ISD::MOVSHDUP:
3625   case X86ISD::MOVSLDUP:
3626   case X86ISD::MOVDDUP:
3627     return DAG.getNode(Opc, dl, VT, V1);
3628   }
3629 }
3630
3631 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3632                                     SDValue V1, unsigned TargetMask,
3633                                     SelectionDAG &DAG) {
3634   switch(Opc) {  // Overload for one-input shuffles with an immediate mask.
3635   default: llvm_unreachable("Unknown x86 shuffle node");
3636   case X86ISD::PSHUFD:
3637   case X86ISD::PSHUFHW:
3638   case X86ISD::PSHUFLW:
3639   case X86ISD::VPERMILPI:
3640   case X86ISD::VPERMI:
3641     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));  // Mask is an i8 constant.
3642   }
3643 }
3644
3645 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3646                                     SDValue V1, SDValue V2, unsigned TargetMask,
3647                                     SelectionDAG &DAG) {
3648   switch(Opc) {  // Overload for two-input shuffles with an immediate mask.
3649   default: llvm_unreachable("Unknown x86 shuffle node");
3650   case X86ISD::PALIGNR:
3651   case X86ISD::VALIGN:
3652   case X86ISD::SHUFP:
3653   case X86ISD::VPERM2X128:
3654     return DAG.getNode(Opc, dl, VT, V1, V2,
3655                        DAG.getConstant(TargetMask, MVT::i8));  // Mask is an i8 constant.
3656   }
3657 }
3658
3659 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3660                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3661   switch(Opc) {  // Overload for two-input shuffles without an immediate.
3662   default: llvm_unreachable("Unknown x86 shuffle node");
3663   case X86ISD::MOVLHPS:
3664   case X86ISD::MOVLHPD:
3665   case X86ISD::MOVHLPS:
3666   case X86ISD::MOVLPS:
3667   case X86ISD::MOVLPD:
3668   case X86ISD::MOVSS:
3669   case X86ISD::MOVSD:
3670   case X86ISD::UNPCKL:
3671   case X86ISD::UNPCKH:
3672     return DAG.getNode(Opc, dl, VT, V1, V2);
3673   }
3674 }
3675
3676 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3677   MachineFunction &MF = DAG.getMachineFunction();
3678   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3679   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3680   int ReturnAddrIndex = FuncInfo->getRAIndex();  // 0 is treated as "not created yet".
3681
3682   if (ReturnAddrIndex == 0) {
3683     // Create a fixed SlotSize-byte object at offset -SlotSize and cache its index.
3684     unsigned SlotSize = RegInfo->getSlotSize();
3685     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3686                                                            -(int64_t)SlotSize,
3687                                                            false);
3688     FuncInfo->setRAIndex(ReturnAddrIndex);
3689   }
3690
3691   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());  // FrameIndex node of pointer type.
3692 }
3693
3694 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3695                                        bool hasSymbolicDisplacement) {
3696   // The offset must fit into a signed 32-bit immediate field.
3697   if (!isInt<32>(Offset))
3698     return false;
3699
3700   // Without a symbolic displacement there are no further restrictions.
3701   // (This holds for every code model.)
3702   if (!hasSymbolicDisplacement)
3703     return true;
3704
3705   // FIXME: Some tweaks might be needed for medium code model.
3706   if (M != CodeModel::Small && M != CodeModel::Kernel)
3707     return false;
3708
3709   // For the small code model we assume that the last object ends at least 16MB
3710   // before the 31-bit boundary. We may also accept pretty large negative
3711   // constants, knowing that all objects are in the positive half of address space.
3712   if (M == CodeModel::Small && Offset < 16*1024*1024)
3713     return true;
3714
3715   // For the kernel code model we know that all objects reside in the negative
3716   // half of the 32-bit address space. We may not accept negative offsets, since
3717   // they may be just off, and we may accept pretty large positive ones.
3718   if (M == CodeModel::Kernel && Offset >= 0)
3719     return true;
3720
3721   return false;
3722 }
3723
3724 /// isCalleePop - Determines whether the callee is required to pop its
3725 /// own arguments. Callee pop is necessary to support tail calls.
3726 bool X86::isCalleePop(CallingConv::ID CallingConv,
3727                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3728   switch (CallingConv) {
3729   default:
3730     return false;
3731   case CallingConv::X86_StdCall:
3732   case CallingConv::X86_FastCall:
3733   case CallingConv::X86_ThisCall:
3734     return !is64Bit;  // These conventions are callee-pop only when not 64-bit.
3735   case CallingConv::Fast:
3736   case CallingConv::GHC:
3737   case CallingConv::HiPE:
3738     if (IsVarArg)  // Vararg calls are never callee-pop here.
3739       return false;
3740     return TailCallOpt;  // Otherwise callee-pop only when TailCallOpt is set.
3741   }
3742 }
3743
3744 /// \brief Return true if the condition is an unsigned (or equality) comparison.
3745 static bool isX86CCUnsigned(unsigned X86CC) {
3746   switch (X86CC) {
3747   default: llvm_unreachable("Invalid integer condition!");
3748   case X86::COND_E:     return true;  // Equality is reported as unsigned here.
3749   case X86::COND_G:     return false;
3750   case X86::COND_GE:    return false;
3751   case X86::COND_L:     return false;
3752   case X86::COND_LE:    return false;
3753   case X86::COND_NE:    return true;  // Likewise for inequality.
3754   case X86::COND_B:     return true;
3755   case X86::COND_A:     return true;
3756   case X86::COND_BE:    return true;
3757   case X86::COND_AE:    return true;
3758   }
3759   llvm_unreachable("covered switch fell through?!");
3760 }
3761
3762 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3763 /// specific condition code. LHS/RHS may be swapped or rewritten in place;
3764 /// returns X86::COND_INVALID for FP SETOEQ/SETUNE.
3765 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3766                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3767   if (!isFP) {
3768     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3769       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3770         // X > -1   -> compare X with 0, jump on !sign.
3771         RHS = DAG.getConstant(0, RHS.getValueType());
3772         return X86::COND_NS;
3773       }
3774       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3775         // X < 0   -> compare X with 0, jump on sign.
3776         return X86::COND_S;
3777       }
3778       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3779         // X < 1   -> X <= 0
3780         RHS = DAG.getConstant(0, RHS.getValueType());
3781         return X86::COND_LE;
3782       }
3783     }
3784
3785     switch (SetCCOpcode) {
3786     default: llvm_unreachable("Invalid integer condition!");
3787     case ISD::SETEQ:  return X86::COND_E;
3788     case ISD::SETGT:  return X86::COND_G;
3789     case ISD::SETGE:  return X86::COND_GE;
3790     case ISD::SETLT:  return X86::COND_L;
3791     case ISD::SETLE:  return X86::COND_LE;
3792     case ISD::SETNE:  return X86::COND_NE;
3793     case ISD::SETULT: return X86::COND_B;
3794     case ISD::SETUGT: return X86::COND_A;
3795     case ISD::SETULE: return X86::COND_BE;
3796     case ISD::SETUGE: return X86::COND_AE;
3797     }
3798   }
3799
3800   // Floating point: first decide whether the operands must or should be flipped.
3801
3802   // If LHS is a foldable load, but RHS is not, flip the condition.
3803   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3804       !ISD::isNON_EXTLoad(RHS.getNode())) {
3805     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3806     std::swap(LHS, RHS);
3807   }
3808
3809   switch (SetCCOpcode) {
3810   default: break;
3811   case ISD::SETOLT:
3812   case ISD::SETOLE:
3813   case ISD::SETUGT:
3814   case ISD::SETUGE:
3815     std::swap(LHS, RHS);  // These four are mapped with swapped operands below.
3816     break;
3817   }
3818
3819   // On a floating point condition, the flags are set as follows:
3820   // ZF  PF  CF   op
3821   //  0 | 0 | 0 | X > Y
3822   //  0 | 0 | 1 | X < Y
3823   //  1 | 0 | 0 | X == Y
3824   //  1 | 1 | 1 | unordered
3825   switch (SetCCOpcode) {
3826   default: llvm_unreachable("Condcode should be pre-legalized away");
3827   case ISD::SETUEQ:
3828   case ISD::SETEQ:   return X86::COND_E;
3829   case ISD::SETOLT:              // flipped
3830   case ISD::SETOGT:
3831   case ISD::SETGT:   return X86::COND_A;
3832   case ISD::SETOLE:              // flipped
3833   case ISD::SETOGE:
3834   case ISD::SETGE:   return X86::COND_AE;
3835   case ISD::SETUGT:              // flipped
3836   case ISD::SETULT:
3837   case ISD::SETLT:   return X86::COND_B;
3838   case ISD::SETUGE:              // flipped
3839   case ISD::SETULE:
3840   case ISD::SETLE:   return X86::COND_BE;
3841   case ISD::SETONE:
3842   case ISD::SETNE:   return X86::COND_NE;
3843   case ISD::SETUO:   return X86::COND_P;
3844   case ISD::SETO:    return X86::COND_NP;
3845   case ISD::SETOEQ:
3846   case ISD::SETUNE:  return X86::COND_INVALID;  // No single X86 condition code.
3847   }
3848 }
3849
3850 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3851 /// code. Current x86 isa includes the following FP cmov instructions:
3852 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3853 static bool hasFPCMov(unsigned X86CC) {
3854   switch (X86CC) {
3855   default:
3856     return false;
3857   case X86::COND_B:
3858   case X86::COND_BE:
3859   case X86::COND_E:
3860   case X86::COND_P:
3861   case X86::COND_A:
3862   case X86::COND_AE:
3863   case X86::COND_NE:
3864   case X86::COND_NP:
3865     return true;
3866   }
3867 }
3868
3869 /// isFPImmLegal - Returns true if Imm is bitwise equal to one of the entries
3870 /// in LegalFPImmediates, i.e. it can be instruction selected natively. If
3871 /// false, the legalizer will materialize it as a load from a constant pool.
3872 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3873   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {  // VT is not consulted.
3874     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3875       return true;
3876   }
3877   return false;
3878 }
3879
3880 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3881                                               ISD::LoadExtType ExtTy,
3882                                               EVT NewVT) const {
3883   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3884   // relocations target a movq or addq instruction: don't let the load shrink.
3885   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3886   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3887     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3888       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3889   return true;  // Every other load may be narrowed.
3890 }
3891
3892 /// \brief Returns true if it is beneficial to convert a load of a constant
3893 /// to just the constant itself; here, for any integer type of 1 to 64 bits.
3894 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3895                                                           Type *Ty) const {
3896   assert(Ty->isIntegerTy());
3897
3898   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3899   if (BitSize == 0 || BitSize > 64)  // The value of Imm itself is not inspected.
3900     return false;
3901   return true;
3902 }
3903
3904 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3905                                                 unsigned Index) const {
3906   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3907     return false;
3908
3909   return (Index == 0 || Index == ResVT.getVectorNumElements());  // Only these two start indices.
3910 }
3911
3912 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3913   // Speculate cttz only if TZCNT can be used directly (requires BMI).
3914   return Subtarget->hasBMI();
3915 }
3916
3917 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3918   // Speculate ctlz only if the subtarget provides LZCNT to use directly.
3919   return Subtarget->hasLZCNT();
3920 }
3921
3922 /// isUndefOrInRange - Return true if Val is undef (negative) or if its value
3923 /// falls within the half-open range [Low, Hi).
3924 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3925   return (Val < 0) || (Val >= Low && Val < Hi);
3926 }
3927
3928 /// isUndefOrEqual - Return true if Val is either less than zero (undef) or
3929 /// equal to CmpVal.
3930 static bool isUndefOrEqual(int Val, int CmpVal) {
3931   return (Val < 0 || Val == CmpVal);
3932 }
3933
3934 /// isSequentialOrUndefInRange - Return true if every element of Mask at
3935 /// positions [Pos, Pos+Size) is either undef or equal to the matching value of
3936 /// the sequence Low, Low+1, ..., Low+Size-1.
3937 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3938                                        unsigned Pos, unsigned Size, int Low) {
3939   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3940     if (!isUndefOrEqual(Mask[i], Low))
3941       return false;
3942   return true;
3943 }
3944
3945 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3946 /// is suitable for input to PSHUFD. That is, every defined mask element refers
3947 /// to a single operand: the first by default, the second if TestSecondOperand.
3948 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3949                          bool TestSecondOperand = false) {
3950   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3951       VT != MVT::v2f64 && VT != MVT::v2i64)
3952     return false;
3953
3954   unsigned NumElems = VT.getVectorNumElements();
3955   unsigned Lo = TestSecondOperand ? NumElems : 0;  // First index of the chosen operand.
3956   unsigned Hi = Lo + NumElems;
3957
3958   for (unsigned i = 0; i < NumElems; ++i)
3959     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3960       return false;
3961
3962   return true;
3963 }
3964
3965 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3966 /// is suitable for input to PSHUFHW (v8i16, or v16i16 when HasInt256 is set).
3967 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3968   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3969     return false;
3970
3971   // Lower quadword copied in order or undef.
3972   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3973     return false;
3974
3975   // Upper quadword shuffled within itself.
3976   for (unsigned i = 4; i != 8; ++i)
3977     if (!isUndefOrInRange(Mask[i], 4, 8))
3978       return false;
3979
3980   if (VT == MVT::v16i16) {
3981     // Second group of eight: lower quadword copied in order or undef.
3982     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3983       return false;
3984
3985     // Second group of eight: upper quadword shuffled within itself.
3986     for (unsigned i = 12; i != 16; ++i)
3987       if (!isUndefOrInRange(Mask[i], 12, 16))
3988         return false;
3989   }
3990
3991   return true;
3992 }
3993
3994 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3995 /// is suitable for input to PSHUFLW (v8i16, or v16i16 when HasInt256 is set).
3996 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3997   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3998     return false;
3999
4000   // Upper quadword copied in order or undef.
4001   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4002     return false;
4003
4004   // Lower quadword shuffled within itself.
4005   for (unsigned i = 0; i != 4; ++i)
4006     if (!isUndefOrInRange(Mask[i], 0, 4))
4007       return false;
4008
4009   if (VT == MVT::v16i16) {
4010     // Second group of eight: upper quadword copied in order or undef.
4011     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4012       return false;
4013
4014     // Second group of eight: lower quadword shuffled within itself.
4015     for (unsigned i = 8; i != 12; ++i)
4016       if (!isUndefOrInRange(Mask[i], 8, 12))
4017         return false;
4018   }
4019
4020   return true;
4021 }
4022
4023 /// \brief Return true if the mask specifies a shuffle of elements that is
4024 /// suitable for input to intralane (palignr) or interlane (valign) vector
4025 /// right-shift. With InterLane set the whole vector is treated as one lane.
4026 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4027   unsigned NumElts = VT.getVectorNumElements();
4028   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4029   unsigned NumLaneElts = NumElts/NumLanes;
4030
4031   // Do not handle shuffles with only two elements per lane.
4032   if (NumLaneElts == 2)
4033     return false;
4034
4035   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
4036     unsigned i;
4037     for (i = 0; i != NumLaneElts; ++i) {  // Find the first defined element of the lane.
4038       if (Mask[i+l] >= 0)
4039         break;
4040     }
4041
4042     // Lane is all undef, go to next lane
4043     if (i == NumLaneElts)
4044       continue;
4045
4046     int Start = Mask[i+l];
4047
4048     // Make sure it is in this lane in one of the sources
4049     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4050         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4051       return false;
4052
4053     // If not lane 0, then we must match lane 0
4054     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4055       return false;
4056
4057     // Correct second source to be contiguous with first source
4058     if (Start >= (int)NumElts)
4059       Start -= NumElts - NumLaneElts;
4060
4061     // Make sure we're shifting in the right direction.
4062     if (Start <= (int)(i+l))
4063       return false;
4064
4065     Start -= i;  // Start now names the index expected at position 0 of the lane.
4066
4067     // Check the rest of the elements to see if they are consecutive.
4068     for (++i; i != NumLaneElts; ++i) {
4069       int Idx = Mask[i+l];
4070
4071       // Make sure it is in this lane
4072       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4073           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4074         return false;
4075
4076       // If not lane 0, then we must match lane 0
4077       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4078         return false;
4079
4080       if (Idx >= (int)NumElts)  // Same second-source correction as for Start.
4081         Idx -= NumElts - NumLaneElts;
4082
4083       if (!isUndefOrEqual(Idx, Start+i))
4084         return false;
4085
4086     }
4087   }
4088
4089   return true;
4090 }
4091
4092 /// \brief Return true if the node specifies a shuffle of elements that is
4093 /// suitable for input to PALIGNR.
4094 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4095                           const X86Subtarget *Subtarget) {
4096   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4097       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4098       VT.is512BitVector())
4099     // FIXME: Add AVX512BW.
4100     return false;
4101
4102   return isAlignrMask(Mask, VT, false);
4103 }
4104
4105 /// \brief Return true if the node specifies a shuffle of elements that is
4106 /// suitable for input to VALIGN.
4107 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4108                           const X86Subtarget *Subtarget) {
4109   // FIXME: Add AVX512VL.
4110   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4111     return false;
4112   return isAlignrMask(Mask, VT, true);
4113 }
4114
4115 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4116 /// the two vector operands have swapped position.
4117 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4118                                      unsigned NumElems) {
4119   for (unsigned i = 0; i != NumElems; ++i) {
4120     int idx = Mask[i];
4121     if (idx < 0)
4122       continue;
4123     else if (idx < (int)NumElems)
4124       Mask[i] = idx + NumElems;
4125     else
4126       Mask[i] = idx - NumElems;
4127   }
4128 }
4129
4130 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4131 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4132 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4133 /// reverse of what x86 shuffles want.
4134 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4135
4136   unsigned NumElems = VT.getVectorNumElements();
4137   unsigned NumLanes = VT.getSizeInBits()/128;
4138   unsigned NumLaneElems = NumElems/NumLanes;
4139
4140   if (NumLaneElems != 2 && NumLaneElems != 4)
4141     return false;
4142
4143   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4144   bool symetricMaskRequired =
4145     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4146
4147   // VSHUFPSY divides the resulting vector into 4 chunks.
4148   // The sources are also splitted into 4 chunks, and each destination
4149   // chunk must come from a different source chunk.
4150   //
4151   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4152   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4153   //
4154   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4155   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4156   //
4157   // VSHUFPDY divides the resulting vector into 4 chunks.
4158   // The sources are also splitted into 4 chunks, and each destination
4159   // chunk must come from a different source chunk.
4160   //
4161   //  SRC1 =>      X3       X2       X1       X0
4162   //  SRC2 =>      Y3       Y2       Y1       Y0
4163   //
4164   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4165   //
4166   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4167   unsigned HalfLaneElems = NumLaneElems/2;
4168   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4169     for (unsigned i = 0; i != NumLaneElems; ++i) {
4170       int Idx = Mask[i+l];
4171       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4172       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4173         return false;
4174       // For VSHUFPSY, the mask of the second half must be the same as the
4175       // first but with the appropriate offsets. This works in the same way as
4176       // VPERMILPS works with masks.
4177       if (!symetricMaskRequired || Idx < 0)
4178         continue;
4179       if (MaskVal[i] < 0) {
4180         MaskVal[i] = Idx - l;
4181         continue;
4182       }
4183       if ((signed)(Idx - l) != MaskVal[i])
4184         return false;
4185     }
4186   }
4187
4188   return true;
4189 }
4190
4191 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4192 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4193 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4194   if (!VT.is128BitVector())
4195     return false;
4196
4197   unsigned NumElems = VT.getVectorNumElements();
4198
4199   if (NumElems != 4)
4200     return false;
4201
4202   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4203   return isUndefOrEqual(Mask[0], 6) &&
4204          isUndefOrEqual(Mask[1], 7) &&
4205          isUndefOrEqual(Mask[2], 2) &&
4206          isUndefOrEqual(Mask[3], 3);
4207 }
4208
4209 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4210 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4211 /// <2, 3, 2, 3>
4212 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4213   if (!VT.is128BitVector())
4214     return false;
4215
4216   unsigned NumElems = VT.getVectorNumElements();
4217
4218   if (NumElems != 4)
4219     return false;
4220
4221   return isUndefOrEqual(Mask[0], 2) &&
4222          isUndefOrEqual(Mask[1], 3) &&
4223          isUndefOrEqual(Mask[2], 2) &&
4224          isUndefOrEqual(Mask[3], 3);
4225 }
4226
4227 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4228 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4229 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4230   if (!VT.is128BitVector())
4231     return false;
4232
4233   unsigned NumElems = VT.getVectorNumElements();
4234
4235   if (NumElems != 2 && NumElems != 4)
4236     return false;
4237
4238   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4239     if (!isUndefOrEqual(Mask[i], i + NumElems))
4240       return false;
4241
4242   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4243     if (!isUndefOrEqual(Mask[i], i))
4244       return false;
4245
4246   return true;
4247 }
4248
4249 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4250 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4251 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4252   if (!VT.is128BitVector())
4253     return false;
4254
4255   unsigned NumElems = VT.getVectorNumElements();
4256
4257   if (NumElems != 2 && NumElems != 4)
4258     return false;
4259
4260   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4261     if (!isUndefOrEqual(Mask[i], i))
4262       return false;
4263
4264   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4265     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4266       return false;
4267
4268   return true;
4269 }
4270
4271 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4272 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4273 /// i. e: If all but one element come from the same vector.
4274 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4275   // TODO: Deal with AVX's VINSERTPS
4276   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4277     return false;
4278
4279   unsigned CorrectPosV1 = 0;
4280   unsigned CorrectPosV2 = 0;
4281   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4282     if (Mask[i] == -1) {
4283       ++CorrectPosV1;
4284       ++CorrectPosV2;
4285       continue;
4286     }
4287
4288     if (Mask[i] == i)
4289       ++CorrectPosV1;
4290     else if (Mask[i] == i + 4)
4291       ++CorrectPosV2;
4292   }
4293
4294   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4295     // We have 3 elements (undefs count as elements from any vector) from one
4296     // vector, and one from another.
4297     return true;
4298
4299   return false;
4300 }
4301
4302 //
4303 // Some special combinations that can be optimized.
4304 //
4305 static
4306 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4307                                SelectionDAG &DAG) {
4308   MVT VT = SVOp->getSimpleValueType(0);
4309   SDLoc dl(SVOp);
4310
4311   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4312     return SDValue();
4313
4314   ArrayRef<int> Mask = SVOp->getMask();
4315
4316   // These are the special masks that may be optimized.
4317   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4318   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4319   bool MatchEvenMask = true;
4320   bool MatchOddMask  = true;
4321   for (int i=0; i<8; ++i) {
4322     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4323       MatchEvenMask = false;
4324     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4325       MatchOddMask = false;
4326   }
4327
4328   if (!MatchEvenMask && !MatchOddMask)
4329     return SDValue();
4330
4331   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4332
4333   SDValue Op0 = SVOp->getOperand(0);
4334   SDValue Op1 = SVOp->getOperand(1);
4335
4336   if (MatchEvenMask) {
4337     // Shift the second operand right to 32 bits.
4338     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4339     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4340   } else {
4341     // Shift the first operand left to 32 bits.
4342     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4343     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4344   }
4345   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4346   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4347 }
4348
4349 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4350 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4351 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4352                          bool HasInt256, bool V2IsSplat = false) {
4353
4354   assert(VT.getSizeInBits() >= 128 &&
4355          "Unsupported vector type for unpckl");
4356
4357   unsigned NumElts = VT.getVectorNumElements();
4358   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4359       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4360     return false;
4361
4362   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4363          "Unsupported vector type for unpckh");
4364
4365   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4366   unsigned NumLanes = VT.getSizeInBits()/128;
4367   unsigned NumLaneElts = NumElts/NumLanes;
4368
4369   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4370     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4371       int BitI  = Mask[l+i];
4372       int BitI1 = Mask[l+i+1];
4373       if (!isUndefOrEqual(BitI, j))
4374         return false;
4375       if (V2IsSplat) {
4376         if (!isUndefOrEqual(BitI1, NumElts))
4377           return false;
4378       } else {
4379         if (!isUndefOrEqual(BitI1, j + NumElts))
4380           return false;
4381       }
4382     }
4383   }
4384
4385   return true;
4386 }
4387
4388 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4389 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4390 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4391                          bool HasInt256, bool V2IsSplat = false) {
4392   assert(VT.getSizeInBits() >= 128 &&
4393          "Unsupported vector type for unpckh");
4394
4395   unsigned NumElts = VT.getVectorNumElements();
4396   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4397       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4398     return false;
4399
4400   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4401          "Unsupported vector type for unpckh");
4402
4403   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4404   unsigned NumLanes = VT.getSizeInBits()/128;
4405   unsigned NumLaneElts = NumElts/NumLanes;
4406
4407   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4408     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4409       int BitI  = Mask[l+i];
4410       int BitI1 = Mask[l+i+1];
4411       if (!isUndefOrEqual(BitI, j))
4412         return false;
4413       if (V2IsSplat) {
4414         if (isUndefOrEqual(BitI1, NumElts))
4415           return false;
4416       } else {
4417         if (!isUndefOrEqual(BitI1, j+NumElts))
4418           return false;
4419       }
4420     }
4421   }
4422   return true;
4423 }
4424
4425 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4426 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4427 /// <0, 0, 1, 1>
4428 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4429   unsigned NumElts = VT.getVectorNumElements();
4430   bool Is256BitVec = VT.is256BitVector();
4431
4432   if (VT.is512BitVector())
4433     return false;
4434   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4435          "Unsupported vector type for unpckh");
4436
4437   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4438       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4439     return false;
4440
4441   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4442   // FIXME: Need a better way to get rid of this, there's no latency difference
4443   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4444   // the former later. We should also remove the "_undef" special mask.
4445   if (NumElts == 4 && Is256BitVec)
4446     return false;
4447
4448   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4449   // independently on 128-bit lanes.
4450   unsigned NumLanes = VT.getSizeInBits()/128;
4451   unsigned NumLaneElts = NumElts/NumLanes;
4452
4453   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4454     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4455       int BitI  = Mask[l+i];
4456       int BitI1 = Mask[l+i+1];
4457
4458       if (!isUndefOrEqual(BitI, j))
4459         return false;
4460       if (!isUndefOrEqual(BitI1, j))
4461         return false;
4462     }
4463   }
4464
4465   return true;
4466 }
4467
4468 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4469 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4470 /// <2, 2, 3, 3>
4471 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4472   unsigned NumElts = VT.getVectorNumElements();
4473
4474   if (VT.is512BitVector())
4475     return false;
4476
4477   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4478          "Unsupported vector type for unpckh");
4479
4480   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4481       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4482     return false;
4483
4484   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4485   // independently on 128-bit lanes.
4486   unsigned NumLanes = VT.getSizeInBits()/128;
4487   unsigned NumLaneElts = NumElts/NumLanes;
4488
4489   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4490     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4491       int BitI  = Mask[l+i];
4492       int BitI1 = Mask[l+i+1];
4493       if (!isUndefOrEqual(BitI, j))
4494         return false;
4495       if (!isUndefOrEqual(BitI1, j))
4496         return false;
4497     }
4498   }
4499   return true;
4500 }
4501
4502 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4503 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4504 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4505   if (!VT.is512BitVector())
4506     return false;
4507
4508   unsigned NumElts = VT.getVectorNumElements();
4509   unsigned HalfSize = NumElts/2;
4510   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4511     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4512       *Imm = 1;
4513       return true;
4514     }
4515   }
4516   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4517     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4518       *Imm = 0;
4519       return true;
4520     }
4521   }
4522   return false;
4523 }
4524
4525 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4526 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4527 /// MOVSD, and MOVD, i.e. setting the lowest element.
4528 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4529   if (VT.getVectorElementType().getSizeInBits() < 32)
4530     return false;
4531   if (!VT.is128BitVector())
4532     return false;
4533
4534   unsigned NumElts = VT.getVectorNumElements();
4535
4536   if (!isUndefOrEqual(Mask[0], NumElts))
4537     return false;
4538
4539   for (unsigned i = 1; i != NumElts; ++i)
4540     if (!isUndefOrEqual(Mask[i], i))
4541       return false;
4542
4543   return true;
4544 }
4545
4546 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4547 /// as permutations between 128-bit chunks or halves. As an example: this
4548 /// shuffle bellow:
4549 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4550 /// The first half comes from the second half of V1 and the second half from the
4551 /// the second half of V2.
4552 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4553   if (!HasFp256 || !VT.is256BitVector())
4554     return false;
4555
4556   // The shuffle result is divided into half A and half B. In total the two
4557   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4558   // B must come from C, D, E or F.
4559   unsigned HalfSize = VT.getVectorNumElements()/2;
4560   bool MatchA = false, MatchB = false;
4561
4562   // Check if A comes from one of C, D, E, F.
4563   for (unsigned Half = 0; Half != 4; ++Half) {
4564     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4565       MatchA = true;
4566       break;
4567     }
4568   }
4569
4570   // Check if B comes from one of C, D, E, F.
4571   for (unsigned Half = 0; Half != 4; ++Half) {
4572     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4573       MatchB = true;
4574       break;
4575     }
4576   }
4577
4578   return MatchA && MatchB;
4579 }
4580
4581 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4582 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4583 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4584   MVT VT = SVOp->getSimpleValueType(0);
4585
4586   unsigned HalfSize = VT.getVectorNumElements()/2;
4587
4588   unsigned FstHalf = 0, SndHalf = 0;
4589   for (unsigned i = 0; i < HalfSize; ++i) {
4590     if (SVOp->getMaskElt(i) > 0) {
4591       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4592       break;
4593     }
4594   }
4595   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4596     if (SVOp->getMaskElt(i) > 0) {
4597       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4598       break;
4599     }
4600   }
4601
4602   return (FstHalf | (SndHalf << 4));
4603 }
4604
4605 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4606 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4607   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4608   if (EltSize < 32)
4609     return false;
4610
4611   unsigned NumElts = VT.getVectorNumElements();
4612   Imm8 = 0;
4613   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4614     for (unsigned i = 0; i != NumElts; ++i) {
4615       if (Mask[i] < 0)
4616         continue;
4617       Imm8 |= Mask[i] << (i*2);
4618     }
4619     return true;
4620   }
4621
4622   unsigned LaneSize = 4;
4623   SmallVector<int, 4> MaskVal(LaneSize, -1);
4624
4625   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4626     for (unsigned i = 0; i != LaneSize; ++i) {
4627       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4628         return false;
4629       if (Mask[i+l] < 0)
4630         continue;
4631       if (MaskVal[i] < 0) {
4632         MaskVal[i] = Mask[i+l] - l;
4633         Imm8 |= MaskVal[i] << (i*2);
4634         continue;
4635       }
4636       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4637         return false;
4638     }
4639   }
4640   return true;
4641 }
4642
4643 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4644 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4645 /// Note that VPERMIL mask matching is different depending whether theunderlying
4646 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4647 /// to the same elements of the low, but to the higher half of the source.
4648 /// In VPERMILPD the two lanes could be shuffled independently of each other
4649 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4650 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4651   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4652   if (VT.getSizeInBits() < 256 || EltSize < 32)
4653     return false;
4654   bool symetricMaskRequired = (EltSize == 32);
4655   unsigned NumElts = VT.getVectorNumElements();
4656
4657   unsigned NumLanes = VT.getSizeInBits()/128;
4658   unsigned LaneSize = NumElts/NumLanes;
4659   // 2 or 4 elements in one lane
4660
4661   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4662   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4663     for (unsigned i = 0; i != LaneSize; ++i) {
4664       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4665         return false;
4666       if (symetricMaskRequired) {
4667         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4668           ExpectedMaskVal[i] = Mask[i+l] - l;
4669           continue;
4670         }
4671         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4672           return false;
4673       }
4674     }
4675   }
4676   return true;
4677 }
4678
4679 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4680 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4681 /// element of vector 2 and the other elements to come from vector 1 in order.
4682 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4683                                bool V2IsSplat = false, bool V2IsUndef = false) {
4684   if (!VT.is128BitVector())
4685     return false;
4686
4687   unsigned NumOps = VT.getVectorNumElements();
4688   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4689     return false;
4690
4691   if (!isUndefOrEqual(Mask[0], 0))
4692     return false;
4693
4694   for (unsigned i = 1; i != NumOps; ++i)
4695     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4696           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4697           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4698       return false;
4699
4700   return true;
4701 }
4702
4703 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4704 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4705 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4706 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4707                            const X86Subtarget *Subtarget) {
4708   if (!Subtarget->hasSSE3())
4709     return false;
4710
4711   unsigned NumElems = VT.getVectorNumElements();
4712
4713   if ((VT.is128BitVector() && NumElems != 4) ||
4714       (VT.is256BitVector() && NumElems != 8) ||
4715       (VT.is512BitVector() && NumElems != 16))
4716     return false;
4717
4718   // "i+1" is the value the indexed mask element must have
4719   for (unsigned i = 0; i != NumElems; i += 2)
4720     if (!isUndefOrEqual(Mask[i], i+1) ||
4721         !isUndefOrEqual(Mask[i+1], i+1))
4722       return false;
4723
4724   return true;
4725 }
4726
4727 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4728 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4729 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4730 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4731                            const X86Subtarget *Subtarget) {
4732   if (!Subtarget->hasSSE3())
4733     return false;
4734
4735   unsigned NumElems = VT.getVectorNumElements();
4736
4737   if ((VT.is128BitVector() && NumElems != 4) ||
4738       (VT.is256BitVector() && NumElems != 8) ||
4739       (VT.is512BitVector() && NumElems != 16))
4740     return false;
4741
4742   // "i" is the value the indexed mask element must have
4743   for (unsigned i = 0; i != NumElems; i += 2)
4744     if (!isUndefOrEqual(Mask[i], i) ||
4745         !isUndefOrEqual(Mask[i+1], i))
4746       return false;
4747
4748   return true;
4749 }
4750
4751 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4752 /// specifies a shuffle of elements that is suitable for input to 256-bit
4753 /// version of MOVDDUP.
4754 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4755   if (!HasFp256 || !VT.is256BitVector())
4756     return false;
4757
4758   unsigned NumElts = VT.getVectorNumElements();
4759   if (NumElts != 4)
4760     return false;
4761
4762   for (unsigned i = 0; i != NumElts/2; ++i)
4763     if (!isUndefOrEqual(Mask[i], 0))
4764       return false;
4765   for (unsigned i = NumElts/2; i != NumElts; ++i)
4766     if (!isUndefOrEqual(Mask[i], NumElts/2))
4767       return false;
4768   return true;
4769 }
4770
4771 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4772 /// specifies a shuffle of elements that is suitable for input to 128-bit
4773 /// version of MOVDDUP.
4774 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4775   if (!VT.is128BitVector())
4776     return false;
4777
4778   unsigned e = VT.getVectorNumElements() / 2;
4779   for (unsigned i = 0; i != e; ++i)
4780     if (!isUndefOrEqual(Mask[i], i))
4781       return false;
4782   for (unsigned i = 0; i != e; ++i)
4783     if (!isUndefOrEqual(Mask[e+i], i))
4784       return false;
4785   return true;
4786 }
4787
4788 /// isVEXTRACTIndex - Return true if the specified
4789 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4790 /// suitable for instruction that extract 128 or 256 bit vectors
4791 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4792   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4793   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4794     return false;
4795
4796   // The index should be aligned on a vecWidth-bit boundary.
4797   uint64_t Index =
4798     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4799
4800   MVT VT = N->getSimpleValueType(0);
4801   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4802   bool Result = (Index * ElSize) % vecWidth == 0;
4803
4804   return Result;
4805 }
4806
4807 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4808 /// operand specifies a subvector insert that is suitable for input to
4809 /// insertion of 128 or 256-bit subvectors
4810 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4811   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4812   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4813     return false;
4814   // The index should be aligned on a vecWidth-bit boundary.
4815   uint64_t Index =
4816     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4817
4818   MVT VT = N->getSimpleValueType(0);
4819   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4820   bool Result = (Index * ElSize) % vecWidth == 0;
4821
4822   return Result;
4823 }
4824
4825 bool X86::isVINSERT128Index(SDNode *N) {
4826   return isVINSERTIndex(N, 128);
4827 }
4828
4829 bool X86::isVINSERT256Index(SDNode *N) {
4830   return isVINSERTIndex(N, 256);
4831 }
4832
4833 bool X86::isVEXTRACT128Index(SDNode *N) {
4834   return isVEXTRACTIndex(N, 128);
4835 }
4836
4837 bool X86::isVEXTRACT256Index(SDNode *N) {
4838   return isVEXTRACTIndex(N, 256);
4839 }
4840
4841 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4842 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4843 /// Handles 128-bit and 256-bit.
4844 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4845   MVT VT = N->getSimpleValueType(0);
4846
4847   assert((VT.getSizeInBits() >= 128) &&
4848          "Unsupported vector type for PSHUF/SHUFP");
4849
4850   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4851   // independently on 128-bit lanes.
4852   unsigned NumElts = VT.getVectorNumElements();
4853   unsigned NumLanes = VT.getSizeInBits()/128;
4854   unsigned NumLaneElts = NumElts/NumLanes;
4855
4856   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4857          "Only supports 2, 4 or 8 elements per lane");
4858
4859   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4860   unsigned Mask = 0;
4861   for (unsigned i = 0; i != NumElts; ++i) {
4862     int Elt = N->getMaskElt(i);
4863     if (Elt < 0) continue;
4864     Elt &= NumLaneElts - 1;
4865     unsigned ShAmt = (i << Shift) % 8;
4866     Mask |= Elt << ShAmt;
4867   }
4868
4869   return Mask;
4870 }
4871
4872 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4873 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4874 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4875   MVT VT = N->getSimpleValueType(0);
4876
4877   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4878          "Unsupported vector type for PSHUFHW");
4879
4880   unsigned NumElts = VT.getVectorNumElements();
4881
4882   unsigned Mask = 0;
4883   for (unsigned l = 0; l != NumElts; l += 8) {
4884     // 8 nodes per lane, but we only care about the last 4.
4885     for (unsigned i = 0; i < 4; ++i) {
4886       int Elt = N->getMaskElt(l+i+4);
4887       if (Elt < 0) continue;
4888       Elt &= 0x3; // only 2-bits.
4889       Mask |= Elt << (i * 2);
4890     }
4891   }
4892
4893   return Mask;
4894 }
4895
4896 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4897 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4898 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4899   MVT VT = N->getSimpleValueType(0);
4900
4901   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4902          "Unsupported vector type for PSHUFHW");
4903
4904   unsigned NumElts = VT.getVectorNumElements();
4905
4906   unsigned Mask = 0;
4907   for (unsigned l = 0; l != NumElts; l += 8) {
4908     // 8 nodes per lane, but we only care about the first 4.
4909     for (unsigned i = 0; i < 4; ++i) {
4910       int Elt = N->getMaskElt(l+i);
4911       if (Elt < 0) continue;
4912       Elt &= 0x3; // only 2-bits
4913       Mask |= Elt << (i * 2);
4914     }
4915   }
4916
4917   return Mask;
4918 }
4919
4920 /// \brief Return the appropriate immediate to shuffle the specified
4921 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4922 /// VALIGN (if Interlane is true) instructions.
4923 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4924                                            bool InterLane) {
4925   MVT VT = SVOp->getSimpleValueType(0);
4926   unsigned EltSize = InterLane ? 1 :
4927     VT.getVectorElementType().getSizeInBits() >> 3;
4928
4929   unsigned NumElts = VT.getVectorNumElements();
4930   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4931   unsigned NumLaneElts = NumElts/NumLanes;
4932
4933   int Val = 0;
4934   unsigned i;
4935   for (i = 0; i != NumElts; ++i) {
4936     Val = SVOp->getMaskElt(i);
4937     if (Val >= 0)
4938       break;
4939   }
4940   if (Val >= (int)NumElts)
4941     Val -= NumElts - NumLaneElts;
4942
4943   assert(Val - i > 0 && "PALIGNR imm should be positive");
4944   return (Val - i) * EltSize;
4945 }
4946
4947 /// \brief Return the appropriate immediate to shuffle the specified
4948 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4949 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4950   return getShuffleAlignrImmediate(SVOp, false);
4951 }
4952
4953 /// \brief Return the appropriate immediate to shuffle the specified
4954 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4955 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4956   return getShuffleAlignrImmediate(SVOp, true);
4957 }
4958
4959
4960 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4961   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4962   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4963     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4964
4965   uint64_t Index =
4966     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4967
4968   MVT VecVT = N->getOperand(0).getSimpleValueType();
4969   MVT ElVT = VecVT.getVectorElementType();
4970
4971   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4972   return Index / NumElemsPerChunk;
4973 }
4974
4975 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4976   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4977   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4978     llvm_unreachable("Illegal insert subvector for VINSERT");
4979
4980   uint64_t Index =
4981     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4982
4983   MVT VecVT = N->getSimpleValueType(0);
4984   MVT ElVT = VecVT.getVectorElementType();
4985
4986   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4987   return Index / NumElemsPerChunk;
4988 }
4989
4990 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4991 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4992 /// and VINSERTI128 instructions.
4993 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4994   return getExtractVEXTRACTImmediate(N, 128);
4995 }
4996
4997 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4998 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4999 /// and VINSERTI64x4 instructions.
5000 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5001   return getExtractVEXTRACTImmediate(N, 256);
5002 }
5003
5004 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5005 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5006 /// and VINSERTI128 instructions.
5007 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5008   return getInsertVINSERTImmediate(N, 128);
5009 }
5010
5011 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5012 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5013 /// and VINSERTI64x4 instructions.
5014 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5015   return getInsertVINSERTImmediate(N, 256);
5016 }
5017
5018 /// isZero - Returns true if Elt is a constant integer zero
5019 static bool isZero(SDValue V) {
5020   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5021   return C && C->isNullValue();
5022 }
5023
5024 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5025 /// constant +0.0.
5026 bool X86::isZeroNode(SDValue Elt) {
5027   if (isZero(Elt))
5028     return true;
5029   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5030     return CFP->getValueAPF().isPosZero();
5031   return false;
5032 }
5033
5034 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5035 /// match movhlps. The lower half elements should come from upper half of
5036 /// V1 (and in order), and the upper half elements should come from the upper
5037 /// half of V2 (and in order).
5038 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5039   if (!VT.is128BitVector())
5040     return false;
5041   if (VT.getVectorNumElements() != 4)
5042     return false;
5043   for (unsigned i = 0, e = 2; i != e; ++i)
5044     if (!isUndefOrEqual(Mask[i], i+2))
5045       return false;
5046   for (unsigned i = 2; i != 4; ++i)
5047     if (!isUndefOrEqual(Mask[i], i+4))
5048       return false;
5049   return true;
5050 }
5051
5052 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5053 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5054 /// required.
5055 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5056   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5057     return false;
5058   N = N->getOperand(0).getNode();
5059   if (!ISD::isNON_EXTLoad(N))
5060     return false;
5061   if (LD)
5062     *LD = cast<LoadSDNode>(N);
5063   return true;
5064 }
5065
5066 // Test whether the given value is a vector value which will be legalized
5067 // into a load.
5068 static bool WillBeConstantPoolLoad(SDNode *N) {
5069   if (N->getOpcode() != ISD::BUILD_VECTOR)
5070     return false;
5071
5072   // Check for any non-constant elements.
5073   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5074     switch (N->getOperand(i).getNode()->getOpcode()) {
5075     case ISD::UNDEF:
5076     case ISD::ConstantFP:
5077     case ISD::Constant:
5078       break;
5079     default:
5080       return false;
5081     }
5082
5083   // Vectors of all-zeros and all-ones are materialized with special
5084   // instructions rather than being loaded.
5085   return !ISD::isBuildVectorAllZeros(N) &&
5086          !ISD::isBuildVectorAllOnes(N);
5087 }
5088
5089 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5090 /// match movlp{s|d}. The lower half elements should come from lower half of
5091 /// V1 (and in order), and the upper half elements should come from the upper
5092 /// half of V2 (and in order). And since V1 will become the source of the
5093 /// MOVLP, it must be either a vector load or a scalar load to vector.
5094 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5095                                ArrayRef<int> Mask, MVT VT) {
5096   if (!VT.is128BitVector())
5097     return false;
5098
5099   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5100     return false;
5101   // Is V2 is a vector load, don't do this transformation. We will try to use
5102   // load folding shufps op.
5103   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5104     return false;
5105
5106   unsigned NumElems = VT.getVectorNumElements();
5107
5108   if (NumElems != 2 && NumElems != 4)
5109     return false;
5110   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5111     if (!isUndefOrEqual(Mask[i], i))
5112       return false;
5113   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5114     if (!isUndefOrEqual(Mask[i], i+NumElems))
5115       return false;
5116   return true;
5117 }
5118
5119 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5120 /// to an zero vector.
5121 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5122 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5123   SDValue V1 = N->getOperand(0);
5124   SDValue V2 = N->getOperand(1);
5125   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5126   for (unsigned i = 0; i != NumElems; ++i) {
5127     int Idx = N->getMaskElt(i);
5128     if (Idx >= (int)NumElems) {
5129       unsigned Opc = V2.getOpcode();
5130       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5131         continue;
5132       if (Opc != ISD::BUILD_VECTOR ||
5133           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5134         return false;
5135     } else if (Idx >= 0) {
5136       unsigned Opc = V1.getOpcode();
5137       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5138         continue;
5139       if (Opc != ISD::BUILD_VECTOR ||
5140           !X86::isZeroNode(V1.getOperand(Idx)))
5141         return false;
5142     }
5143   }
5144   return true;
5145 }
5146
5147 /// getZeroVector - Returns a vector of specified type with all zero elements.
5148 ///
5149 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5150                              SelectionDAG &DAG, SDLoc dl) {
5151   assert(VT.isVector() && "Expected a vector type");
5152
5153   // Always build SSE zero vectors as <4 x i32> bitcasted
5154   // to their dest type. This ensures they get CSE'd.
5155   SDValue Vec;
5156   if (VT.is128BitVector()) {  // SSE
5157     if (Subtarget->hasSSE2()) {  // SSE2
5158       SDValue Cst = DAG.getConstant(0, MVT::i32);
5159       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5160     } else { // SSE1
5161       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5162       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5163     }
5164   } else if (VT.is256BitVector()) { // AVX
5165     if (Subtarget->hasInt256()) { // AVX2
5166       SDValue Cst = DAG.getConstant(0, MVT::i32);
5167       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5168       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5169     } else {
5170       // 256-bit logic and arithmetic instructions in AVX are all
5171       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5172       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5173       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5175     }
5176   } else if (VT.is512BitVector()) { // AVX-512
5177       SDValue Cst = DAG.getConstant(0, MVT::i32);
5178       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5179                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5180       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5181   } else if (VT.getScalarType() == MVT::i1) {
5182     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5183     SDValue Cst = DAG.getConstant(0, MVT::i1);
5184     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5185     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5186   } else
5187     llvm_unreachable("Unexpected vector type");
5188
5189   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5190 }
5191
5192 /// getOnesVector - Returns a vector of specified type with all bits set.
5193 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5194 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5195 /// Then bitcast to their original type, ensuring they get CSE'd.
5196 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5197                              SDLoc dl) {
5198   assert(VT.isVector() && "Expected a vector type");
5199
5200   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5201   SDValue Vec;
5202   if (VT.is256BitVector()) {
5203     if (HasInt256) { // AVX2
5204       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5205       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5206     } else { // AVX
5207       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5208       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5209     }
5210   } else if (VT.is128BitVector()) {
5211     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5212   } else
5213     llvm_unreachable("Unexpected vector type");
5214
5215   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5216 }
5217
5218 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5219 /// that point to V2 points to its first element.
5220 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5221   for (unsigned i = 0; i != NumElems; ++i) {
5222     if (Mask[i] > (int)NumElems) {
5223       Mask[i] = NumElems;
5224     }
5225   }
5226 }
5227
5228 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5229 /// operation of specified width.
5230 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5231                        SDValue V2) {
5232   unsigned NumElems = VT.getVectorNumElements();
5233   SmallVector<int, 8> Mask;
5234   Mask.push_back(NumElems);
5235   for (unsigned i = 1; i != NumElems; ++i)
5236     Mask.push_back(i);
5237   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5238 }
5239
5240 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5241 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5242                           SDValue V2) {
5243   unsigned NumElems = VT.getVectorNumElements();
5244   SmallVector<int, 8> Mask;
5245   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5246     Mask.push_back(i);
5247     Mask.push_back(i + NumElems);
5248   }
5249   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5250 }
5251
5252 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5253 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5254                           SDValue V2) {
5255   unsigned NumElems = VT.getVectorNumElements();
5256   SmallVector<int, 8> Mask;
5257   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5258     Mask.push_back(i + Half);
5259     Mask.push_back(i + NumElems + Half);
5260   }
5261   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5262 }
5263
5264 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5265 // a generic shuffle instruction because the target has no such instructions.
5266 // Generate shuffles which repeat i16 and i8 several times until they can be
5267 // represented by v4f32 and then be manipulated by target suported shuffles.
5268 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5269   MVT VT = V.getSimpleValueType();
5270   int NumElems = VT.getVectorNumElements();
5271   SDLoc dl(V);
5272
5273   while (NumElems > 4) {
5274     if (EltNo < NumElems/2) {
5275       V = getUnpackl(DAG, dl, VT, V, V);
5276     } else {
5277       V = getUnpackh(DAG, dl, VT, V, V);
5278       EltNo -= NumElems/2;
5279     }
5280     NumElems >>= 1;
5281   }
5282   return V;
5283 }
5284
5285 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5286 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5287   MVT VT = V.getSimpleValueType();
5288   SDLoc dl(V);
5289
5290   if (VT.is128BitVector()) {
5291     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5292     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5293     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5294                              &SplatMask[0]);
5295   } else if (VT.is256BitVector()) {
5296     // To use VPERMILPS to splat scalars, the second half of indicies must
5297     // refer to the higher part, which is a duplication of the lower one,
5298     // because VPERMILPS can only handle in-lane permutations.
5299     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5300                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5301
5302     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5303     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5304                              &SplatMask[0]);
5305   } else
5306     llvm_unreachable("Vector size not supported");
5307
5308   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5309 }
5310
5311 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5312 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5313   MVT SrcVT = SV->getSimpleValueType(0);
5314   SDValue V1 = SV->getOperand(0);
5315   SDLoc dl(SV);
5316
5317   int EltNo = SV->getSplatIndex();
5318   int NumElems = SrcVT.getVectorNumElements();
5319   bool Is256BitVec = SrcVT.is256BitVector();
5320
5321   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5322          "Unknown how to promote splat for type");
5323
5324   // Extract the 128-bit part containing the splat element and update
5325   // the splat element index when it refers to the higher register.
5326   if (Is256BitVec) {
5327     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5328     if (EltNo >= NumElems/2)
5329       EltNo -= NumElems/2;
5330   }
5331
5332   // All i16 and i8 vector types can't be used directly by a generic shuffle
5333   // instruction because the target has no such instruction. Generate shuffles
5334   // which repeat i16 and i8 several times until they fit in i32, and then can
5335   // be manipulated by target suported shuffles.
5336   MVT EltVT = SrcVT.getVectorElementType();
5337   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5338     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5339
5340   // Recreate the 256-bit vector and place the same 128-bit vector
5341   // into the low and high part. This is necessary because we want
5342   // to use VPERM* to shuffle the vectors
5343   if (Is256BitVec) {
5344     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5345   }
5346
5347   return getLegalSplat(DAG, V1, EltNo);
5348 }
5349
5350 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5351 /// vector of zero or undef vector.  This produces a shuffle where the low
5352 /// element of V2 is swizzled into the zero/undef vector, landing at element
5353 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5354 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5355                                            bool IsZero,
5356                                            const X86Subtarget *Subtarget,
5357                                            SelectionDAG &DAG) {
5358   MVT VT = V2.getSimpleValueType();
5359   SDValue V1 = IsZero
5360     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5361   unsigned NumElems = VT.getVectorNumElements();
5362   SmallVector<int, 16> MaskVec;
5363   for (unsigned i = 0; i != NumElems; ++i)
5364     // If this is the insertion idx, put the low elt of V2 here.
5365     MaskVec.push_back(i == Idx ? NumElems : i);
5366   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5367 }
5368
5369 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5370 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5371 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5372 /// shuffles which use a single input multiple times, and in those cases it will
5373 /// adjust the mask to only have indices within that single input.
5374 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5375                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5376   unsigned NumElems = VT.getVectorNumElements();
5377   SDValue ImmN;
5378
5379   IsUnary = false;
5380   bool IsFakeUnary = false;
5381   switch(N->getOpcode()) {
5382   case X86ISD::BLENDI:
5383     ImmN = N->getOperand(N->getNumOperands()-1);
5384     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5385     break;
5386   case X86ISD::SHUFP:
5387     ImmN = N->getOperand(N->getNumOperands()-1);
5388     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5389     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5390     break;
5391   case X86ISD::UNPCKH:
5392     DecodeUNPCKHMask(VT, Mask);
5393     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5394     break;
5395   case X86ISD::UNPCKL:
5396     DecodeUNPCKLMask(VT, Mask);
5397     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5398     break;
5399   case X86ISD::MOVHLPS:
5400     DecodeMOVHLPSMask(NumElems, Mask);
5401     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5402     break;
5403   case X86ISD::MOVLHPS:
5404     DecodeMOVLHPSMask(NumElems, Mask);
5405     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5406     break;
5407   case X86ISD::PALIGNR:
5408     ImmN = N->getOperand(N->getNumOperands()-1);
5409     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5410     break;
5411   case X86ISD::PSHUFD:
5412   case X86ISD::VPERMILPI:
5413     ImmN = N->getOperand(N->getNumOperands()-1);
5414     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5415     IsUnary = true;
5416     break;
5417   case X86ISD::PSHUFHW:
5418     ImmN = N->getOperand(N->getNumOperands()-1);
5419     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5420     IsUnary = true;
5421     break;
5422   case X86ISD::PSHUFLW:
5423     ImmN = N->getOperand(N->getNumOperands()-1);
5424     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5425     IsUnary = true;
5426     break;
5427   case X86ISD::PSHUFB: {
5428     IsUnary = true;
5429     SDValue MaskNode = N->getOperand(1);
5430     while (MaskNode->getOpcode() == ISD::BITCAST)
5431       MaskNode = MaskNode->getOperand(0);
5432
5433     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5434       // If we have a build-vector, then things are easy.
5435       EVT VT = MaskNode.getValueType();
5436       assert(VT.isVector() &&
5437              "Can't produce a non-vector with a build_vector!");
5438       if (!VT.isInteger())
5439         return false;
5440
5441       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5442
5443       SmallVector<uint64_t, 32> RawMask;
5444       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5445         SDValue Op = MaskNode->getOperand(i);
5446         if (Op->getOpcode() == ISD::UNDEF) {
5447           RawMask.push_back((uint64_t)SM_SentinelUndef);
5448           continue;
5449         }
5450         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5451         if (!CN)
5452           return false;
5453         APInt MaskElement = CN->getAPIntValue();
5454
5455         // We now have to decode the element which could be any integer size and
5456         // extract each byte of it.
5457         for (int j = 0; j < NumBytesPerElement; ++j) {
5458           // Note that this is x86 and so always little endian: the low byte is
5459           // the first byte of the mask.
5460           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5461           MaskElement = MaskElement.lshr(8);
5462         }
5463       }
5464       DecodePSHUFBMask(RawMask, Mask);
5465       break;
5466     }
5467
5468     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5469     if (!MaskLoad)
5470       return false;
5471
5472     SDValue Ptr = MaskLoad->getBasePtr();
5473     if (Ptr->getOpcode() == X86ISD::Wrapper)
5474       Ptr = Ptr->getOperand(0);
5475
5476     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5477     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5478       return false;
5479
5480     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5481       DecodePSHUFBMask(C, Mask);
5482       break;
5483     }
5484
5485     return false;
5486   }
5487   case X86ISD::VPERMI:
5488     ImmN = N->getOperand(N->getNumOperands()-1);
5489     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5490     IsUnary = true;
5491     break;
5492   case X86ISD::MOVSS:
5493   case X86ISD::MOVSD:
5494     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5495     break;
5496   case X86ISD::VPERM2X128:
5497     ImmN = N->getOperand(N->getNumOperands()-1);
5498     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5499     if (Mask.empty()) return false;
5500     break;
5501   case X86ISD::MOVSLDUP:
5502     DecodeMOVSLDUPMask(VT, Mask);
5503     IsUnary = true;
5504     break;
5505   case X86ISD::MOVSHDUP:
5506     DecodeMOVSHDUPMask(VT, Mask);
5507     IsUnary = true;
5508     break;
5509   case X86ISD::MOVDDUP:
5510     DecodeMOVDDUPMask(VT, Mask);
5511     IsUnary = true;
5512     break;
5513   case X86ISD::MOVLHPD:
5514   case X86ISD::MOVLPD:
5515   case X86ISD::MOVLPS:
5516     // Not yet implemented
5517     return false;
5518   default: llvm_unreachable("unknown target shuffle node");
5519   }
5520
5521   // If we have a fake unary shuffle, the shuffle mask is spread across two
5522   // inputs that are actually the same node. Re-map the mask to always point
5523   // into the first input.
5524   if (IsFakeUnary)
5525     for (int &M : Mask)
5526       if (M >= (int)Mask.size())
5527         M -= Mask.size();
5528
5529   return true;
5530 }
5531
5532 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5533 /// element of the result of the vector shuffle.
5534 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5535                                    unsigned Depth) {
5536   if (Depth == 6)
5537     return SDValue();  // Limit search depth.
5538
5539   SDValue V = SDValue(N, 0);
5540   EVT VT = V.getValueType();
5541   unsigned Opcode = V.getOpcode();
5542
5543   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5544   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5545     int Elt = SV->getMaskElt(Index);
5546
5547     if (Elt < 0)
5548       return DAG.getUNDEF(VT.getVectorElementType());
5549
5550     unsigned NumElems = VT.getVectorNumElements();
5551     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5552                                          : SV->getOperand(1);
5553     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5554   }
5555
5556   // Recurse into target specific vector shuffles to find scalars.
5557   if (isTargetShuffle(Opcode)) {
5558     MVT ShufVT = V.getSimpleValueType();
5559     unsigned NumElems = ShufVT.getVectorNumElements();
5560     SmallVector<int, 16> ShuffleMask;
5561     bool IsUnary;
5562
5563     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5564       return SDValue();
5565
5566     int Elt = ShuffleMask[Index];
5567     if (Elt < 0)
5568       return DAG.getUNDEF(ShufVT.getVectorElementType());
5569
5570     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5571                                          : N->getOperand(1);
5572     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5573                                Depth+1);
5574   }
5575
5576   // Actual nodes that may contain scalar elements
5577   if (Opcode == ISD::BITCAST) {
5578     V = V.getOperand(0);
5579     EVT SrcVT = V.getValueType();
5580     unsigned NumElems = VT.getVectorNumElements();
5581
5582     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5583       return SDValue();
5584   }
5585
5586   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5587     return (Index == 0) ? V.getOperand(0)
5588                         : DAG.getUNDEF(VT.getVectorElementType());
5589
5590   if (V.getOpcode() == ISD::BUILD_VECTOR)
5591     return V.getOperand(Index);
5592
5593   return SDValue();
5594 }
5595
5596 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5597 /// shuffle operation which come from a consecutively from a zero. The
5598 /// search can start in two different directions, from left or right.
5599 /// We count undefs as zeros until PreferredNum is reached.
5600 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5601                                          unsigned NumElems, bool ZerosFromLeft,
5602                                          SelectionDAG &DAG,
5603                                          unsigned PreferredNum = -1U) {
5604   unsigned NumZeros = 0;
5605   for (unsigned i = 0; i != NumElems; ++i) {
5606     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5607     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5608     if (!Elt.getNode())
5609       break;
5610
5611     if (X86::isZeroNode(Elt))
5612       ++NumZeros;
5613     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5614       NumZeros = std::min(NumZeros + 1, PreferredNum);
5615     else
5616       break;
5617   }
5618
5619   return NumZeros;
5620 }
5621
5622 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5623 /// correspond consecutively to elements from one of the vector operands,
5624 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5625 static
5626 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5627                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5628                               unsigned NumElems, unsigned &OpNum) {
5629   bool SeenV1 = false;
5630   bool SeenV2 = false;
5631
5632   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5633     int Idx = SVOp->getMaskElt(i);
5634     // Ignore undef indicies
5635     if (Idx < 0)
5636       continue;
5637
5638     if (Idx < (int)NumElems)
5639       SeenV1 = true;
5640     else
5641       SeenV2 = true;
5642
5643     // Only accept consecutive elements from the same vector
5644     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5645       return false;
5646   }
5647
5648   OpNum = SeenV1 ? 0 : 1;
5649   return true;
5650 }
5651
5652 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5653 /// logical left shift of a vector.
5654 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5655                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5656   unsigned NumElems =
5657     SVOp->getSimpleValueType(0).getVectorNumElements();
5658   unsigned NumZeros = getNumOfConsecutiveZeros(
5659       SVOp, NumElems, false /* check zeros from right */, DAG,
5660       SVOp->getMaskElt(0));
5661   unsigned OpSrc;
5662
5663   if (!NumZeros)
5664     return false;
5665
5666   // Considering the elements in the mask that are not consecutive zeros,
5667   // check if they consecutively come from only one of the source vectors.
5668   //
5669   //               V1 = {X, A, B, C}     0
5670   //                         \  \  \    /
5671   //   vector_shuffle V1, V2 <1, 2, 3, X>
5672   //
5673   if (!isShuffleMaskConsecutive(SVOp,
5674             0,                   // Mask Start Index
5675             NumElems-NumZeros,   // Mask End Index(exclusive)
5676             NumZeros,            // Where to start looking in the src vector
5677             NumElems,            // Number of elements in vector
5678             OpSrc))              // Which source operand ?
5679     return false;
5680
5681   isLeft = false;
5682   ShAmt = NumZeros;
5683   ShVal = SVOp->getOperand(OpSrc);
5684   return true;
5685 }
5686
5687 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5688 /// logical left shift of a vector.
5689 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5690                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5691   unsigned NumElems =
5692     SVOp->getSimpleValueType(0).getVectorNumElements();
5693   unsigned NumZeros = getNumOfConsecutiveZeros(
5694       SVOp, NumElems, true /* check zeros from left */, DAG,
5695       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5696   unsigned OpSrc;
5697
5698   if (!NumZeros)
5699     return false;
5700
5701   // Considering the elements in the mask that are not consecutive zeros,
5702   // check if they consecutively come from only one of the source vectors.
5703   //
5704   //                           0    { A, B, X, X } = V2
5705   //                          / \    /  /
5706   //   vector_shuffle V1, V2 <X, X, 4, 5>
5707   //
5708   if (!isShuffleMaskConsecutive(SVOp,
5709             NumZeros,     // Mask Start Index
5710             NumElems,     // Mask End Index(exclusive)
5711             0,            // Where to start looking in the src vector
5712             NumElems,     // Number of elements in vector
5713             OpSrc))       // Which source operand ?
5714     return false;
5715
5716   isLeft = true;
5717   ShAmt = NumZeros;
5718   ShVal = SVOp->getOperand(OpSrc);
5719   return true;
5720 }
5721
5722 /// isVectorShift - Returns true if the shuffle can be implemented as a
5723 /// logical left or right shift of a vector.
5724 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5725                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5726   // Although the logic below support any bitwidth size, there are no
5727   // shift instructions which handle more than 128-bit vectors.
5728   if (!SVOp->getSimpleValueType(0).is128BitVector())
5729     return false;
5730
5731   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5732       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5733     return true;
5734
5735   return false;
5736 }
5737
5738 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5739 ///
5740 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5741                                        unsigned NumNonZero, unsigned NumZero,
5742                                        SelectionDAG &DAG,
5743                                        const X86Subtarget* Subtarget,
5744                                        const TargetLowering &TLI) {
5745   if (NumNonZero > 8)
5746     return SDValue();
5747
5748   SDLoc dl(Op);
5749   SDValue V;
5750   bool First = true;
5751   for (unsigned i = 0; i < 16; ++i) {
5752     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5753     if (ThisIsNonZero && First) {
5754       if (NumZero)
5755         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5756       else
5757         V = DAG.getUNDEF(MVT::v8i16);
5758       First = false;
5759     }
5760
5761     if ((i & 1) != 0) {
5762       SDValue ThisElt, LastElt;
5763       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5764       if (LastIsNonZero) {
5765         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5766                               MVT::i16, Op.getOperand(i-1));
5767       }
5768       if (ThisIsNonZero) {
5769         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5770         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5771                               ThisElt, DAG.getConstant(8, MVT::i8));
5772         if (LastIsNonZero)
5773           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5774       } else
5775         ThisElt = LastElt;
5776
5777       if (ThisElt.getNode())
5778         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5779                         DAG.getIntPtrConstant(i/2));
5780     }
5781   }
5782
5783   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5784 }
5785
5786 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5787 ///
5788 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5789                                      unsigned NumNonZero, unsigned NumZero,
5790                                      SelectionDAG &DAG,
5791                                      const X86Subtarget* Subtarget,
5792                                      const TargetLowering &TLI) {
5793   if (NumNonZero > 4)
5794     return SDValue();
5795
5796   SDLoc dl(Op);
5797   SDValue V;
5798   bool First = true;
5799   for (unsigned i = 0; i < 8; ++i) {
5800     bool isNonZero = (NonZeros & (1 << i)) != 0;
5801     if (isNonZero) {
5802       if (First) {
5803         if (NumZero)
5804           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5805         else
5806           V = DAG.getUNDEF(MVT::v8i16);
5807         First = false;
5808       }
5809       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5810                       MVT::v8i16, V, Op.getOperand(i),
5811                       DAG.getIntPtrConstant(i));
5812     }
5813   }
5814
5815   return V;
5816 }
5817
5818 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5819 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5820                                      const X86Subtarget *Subtarget,
5821                                      const TargetLowering &TLI) {
5822   // Find all zeroable elements.
5823   bool Zeroable[4];
5824   for (int i=0; i < 4; ++i) {
5825     SDValue Elt = Op->getOperand(i);
5826     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5827   }
5828   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5829                        [](bool M) { return !M; }) > 1 &&
5830          "We expect at least two non-zero elements!");
5831
5832   // We only know how to deal with build_vector nodes where elements are either
5833   // zeroable or extract_vector_elt with constant index.
5834   SDValue FirstNonZero;
5835   unsigned FirstNonZeroIdx;
5836   for (unsigned i=0; i < 4; ++i) {
5837     if (Zeroable[i])
5838       continue;
5839     SDValue Elt = Op->getOperand(i);
5840     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5841         !isa<ConstantSDNode>(Elt.getOperand(1)))
5842       return SDValue();
5843     // Make sure that this node is extracting from a 128-bit vector.
5844     MVT VT = Elt.getOperand(0).getSimpleValueType();
5845     if (!VT.is128BitVector())
5846       return SDValue();
5847     if (!FirstNonZero.getNode()) {
5848       FirstNonZero = Elt;
5849       FirstNonZeroIdx = i;
5850     }
5851   }
5852
5853   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5854   SDValue V1 = FirstNonZero.getOperand(0);
5855   MVT VT = V1.getSimpleValueType();
5856
5857   // See if this build_vector can be lowered as a blend with zero.
5858   SDValue Elt;
5859   unsigned EltMaskIdx, EltIdx;
5860   int Mask[4];
5861   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5862     if (Zeroable[EltIdx]) {
5863       // The zero vector will be on the right hand side.
5864       Mask[EltIdx] = EltIdx+4;
5865       continue;
5866     }
5867
5868     Elt = Op->getOperand(EltIdx);
5869     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5870     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5871     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5872       break;
5873     Mask[EltIdx] = EltIdx;
5874   }
5875
5876   if (EltIdx == 4) {
5877     // Let the shuffle legalizer deal with blend operations.
5878     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5879     if (V1.getSimpleValueType() != VT)
5880       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5881     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5882   }
5883
5884   // See if we can lower this build_vector to a INSERTPS.
5885   if (!Subtarget->hasSSE41())
5886     return SDValue();
5887
5888   SDValue V2 = Elt.getOperand(0);
5889   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5890     V1 = SDValue();
5891
5892   bool CanFold = true;
5893   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5894     if (Zeroable[i])
5895       continue;
5896
5897     SDValue Current = Op->getOperand(i);
5898     SDValue SrcVector = Current->getOperand(0);
5899     if (!V1.getNode())
5900       V1 = SrcVector;
5901     CanFold = SrcVector == V1 &&
5902       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5903   }
5904
5905   if (!CanFold)
5906     return SDValue();
5907
5908   assert(V1.getNode() && "Expected at least two non-zero elements!");
5909   if (V1.getSimpleValueType() != MVT::v4f32)
5910     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5911   if (V2.getSimpleValueType() != MVT::v4f32)
5912     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5913
5914   // Ok, we can emit an INSERTPS instruction.
5915   unsigned ZMask = 0;
5916   for (int i = 0; i < 4; ++i)
5917     if (Zeroable[i])
5918       ZMask |= 1 << i;
5919
5920   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5921   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5922   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5923                                DAG.getIntPtrConstant(InsertPSMask));
5924   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5925 }
5926
5927 /// Return a vector logical shift node.
5928 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5929                          unsigned NumBits, SelectionDAG &DAG,
5930                          const TargetLowering &TLI, SDLoc dl) {
5931   assert(VT.is128BitVector() && "Unknown type for VShift");
5932   MVT ShVT = MVT::v2i64;
5933   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5934   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5935   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5936   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5937   return DAG.getNode(ISD::BITCAST, dl, VT,
5938                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5939 }
5940
5941 static SDValue
5942 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5943
5944   // Check if the scalar load can be widened into a vector load. And if
5945   // the address is "base + cst" see if the cst can be "absorbed" into
5946   // the shuffle mask.
5947   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5948     SDValue Ptr = LD->getBasePtr();
5949     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5950       return SDValue();
5951     EVT PVT = LD->getValueType(0);
5952     if (PVT != MVT::i32 && PVT != MVT::f32)
5953       return SDValue();
5954
5955     int FI = -1;
5956     int64_t Offset = 0;
5957     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5958       FI = FINode->getIndex();
5959       Offset = 0;
5960     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5961                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5962       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5963       Offset = Ptr.getConstantOperandVal(1);
5964       Ptr = Ptr.getOperand(0);
5965     } else {
5966       return SDValue();
5967     }
5968
5969     // FIXME: 256-bit vector instructions don't require a strict alignment,
5970     // improve this code to support it better.
5971     unsigned RequiredAlign = VT.getSizeInBits()/8;
5972     SDValue Chain = LD->getChain();
5973     // Make sure the stack object alignment is at least 16 or 32.
5974     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5975     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5976       if (MFI->isFixedObjectIndex(FI)) {
5977         // Can't change the alignment. FIXME: It's possible to compute
5978         // the exact stack offset and reference FI + adjust offset instead.
5979         // If someone *really* cares about this. That's the way to implement it.
5980         return SDValue();
5981       } else {
5982         MFI->setObjectAlignment(FI, RequiredAlign);
5983       }
5984     }
5985
5986     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5987     // Ptr + (Offset & ~15).
5988     if (Offset < 0)
5989       return SDValue();
5990     if ((Offset % RequiredAlign) & 3)
5991       return SDValue();
5992     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5993     if (StartOffset)
5994       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5995                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5996
5997     int EltNo = (Offset - StartOffset) >> 2;
5998     unsigned NumElems = VT.getVectorNumElements();
5999
6000     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6001     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6002                              LD->getPointerInfo().getWithOffset(StartOffset),
6003                              false, false, false, 0);
6004
6005     SmallVector<int, 8> Mask;
6006     for (unsigned i = 0; i != NumElems; ++i)
6007       Mask.push_back(EltNo);
6008
6009     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6010   }
6011
6012   return SDValue();
6013 }
6014
6015 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6016 /// elements can be replaced by a single large load which has the same value as
6017 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6018 ///
6019 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6020 ///
6021 /// FIXME: we'd also like to handle the case where the last elements are zero
6022 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6023 /// There's even a handy isZeroNode for that purpose.
6024 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6025                                         SDLoc &DL, SelectionDAG &DAG,
6026                                         bool isAfterLegalize) {
6027   unsigned NumElems = Elts.size();
6028
6029   LoadSDNode *LDBase = nullptr;
6030   unsigned LastLoadedElt = -1U;
6031
6032   // For each element in the initializer, see if we've found a load or an undef.
6033   // If we don't find an initial load element, or later load elements are
6034   // non-consecutive, bail out.
6035   for (unsigned i = 0; i < NumElems; ++i) {
6036     SDValue Elt = Elts[i];
6037     // Look through a bitcast.
6038     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6039       Elt = Elt.getOperand(0);
6040     if (!Elt.getNode() ||
6041         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6042       return SDValue();
6043     if (!LDBase) {
6044       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6045         return SDValue();
6046       LDBase = cast<LoadSDNode>(Elt.getNode());
6047       LastLoadedElt = i;
6048       continue;
6049     }
6050     if (Elt.getOpcode() == ISD::UNDEF)
6051       continue;
6052
6053     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6054     EVT LdVT = Elt.getValueType();
6055     // Each loaded element must be the correct fractional portion of the
6056     // requested vector load.
6057     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6058       return SDValue();
6059     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6060       return SDValue();
6061     LastLoadedElt = i;
6062   }
6063
6064   // If we have found an entire vector of loads and undefs, then return a large
6065   // load of the entire vector width starting at the base pointer.  If we found
6066   // consecutive loads for the low half, generate a vzext_load node.
6067   if (LastLoadedElt == NumElems - 1) {
6068     assert(LDBase && "Did not find base load for merging consecutive loads");
6069     EVT EltVT = LDBase->getValueType(0);
6070     // Ensure that the input vector size for the merged loads matches the
6071     // cumulative size of the input elements.
6072     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6073       return SDValue();
6074
6075     if (isAfterLegalize &&
6076         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6077       return SDValue();
6078
6079     SDValue NewLd = SDValue();
6080
6081     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6082                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6083                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6084                         LDBase->getAlignment());
6085
6086     if (LDBase->hasAnyUseOfValue(1)) {
6087       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6088                                      SDValue(LDBase, 1),
6089                                      SDValue(NewLd.getNode(), 1));
6090       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6091       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6092                              SDValue(NewLd.getNode(), 1));
6093     }
6094
6095     return NewLd;
6096   }
6097
6098   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6099   //of a v4i32 / v4f32. It's probably worth generalizing.
6100   EVT EltVT = VT.getVectorElementType();
6101   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6102       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6103     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6104     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6105     SDValue ResNode =
6106         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6107                                 LDBase->getPointerInfo(),
6108                                 LDBase->getAlignment(),
6109                                 false/*isVolatile*/, true/*ReadMem*/,
6110                                 false/*WriteMem*/);
6111
6112     // Make sure the newly-created LOAD is in the same position as LDBase in
6113     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6114     // update uses of LDBase's output chain to use the TokenFactor.
6115     if (LDBase->hasAnyUseOfValue(1)) {
6116       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6117                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6118       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6119       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6120                              SDValue(ResNode.getNode(), 1));
6121     }
6122
6123     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6124   }
6125   return SDValue();
6126 }
6127
6128 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6129 /// to generate a splat value for the following cases:
6130 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6131 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6132 /// a scalar load, or a constant.
6133 /// The VBROADCAST node is returned when a pattern is found,
6134 /// or SDValue() otherwise.
6135 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6136                                     SelectionDAG &DAG) {
6137   // VBROADCAST requires AVX.
6138   // TODO: Splats could be generated for non-AVX CPUs using SSE
6139   // instructions, but there's less potential gain for only 128-bit vectors.
6140   if (!Subtarget->hasAVX())
6141     return SDValue();
6142
6143   MVT VT = Op.getSimpleValueType();
6144   SDLoc dl(Op);
6145
6146   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6147          "Unsupported vector type for broadcast.");
6148
6149   SDValue Ld;
6150   bool ConstSplatVal;
6151
6152   switch (Op.getOpcode()) {
6153     default:
6154       // Unknown pattern found.
6155       return SDValue();
6156
6157     case ISD::BUILD_VECTOR: {
6158       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6159       BitVector UndefElements;
6160       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6161
6162       // We need a splat of a single value to use broadcast, and it doesn't
6163       // make any sense if the value is only in one element of the vector.
6164       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6165         return SDValue();
6166
6167       Ld = Splat;
6168       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6169                        Ld.getOpcode() == ISD::ConstantFP);
6170
6171       // Make sure that all of the users of a non-constant load are from the
6172       // BUILD_VECTOR node.
6173       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6174         return SDValue();
6175       break;
6176     }
6177
6178     case ISD::VECTOR_SHUFFLE: {
6179       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6180
6181       // Shuffles must have a splat mask where the first element is
6182       // broadcasted.
6183       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6184         return SDValue();
6185
6186       SDValue Sc = Op.getOperand(0);
6187       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6188           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6189
6190         if (!Subtarget->hasInt256())
6191           return SDValue();
6192
6193         // Use the register form of the broadcast instruction available on AVX2.
6194         if (VT.getSizeInBits() >= 256)
6195           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6196         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6197       }
6198
6199       Ld = Sc.getOperand(0);
6200       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6201                        Ld.getOpcode() == ISD::ConstantFP);
6202
6203       // The scalar_to_vector node and the suspected
6204       // load node must have exactly one user.
6205       // Constants may have multiple users.
6206
6207       // AVX-512 has register version of the broadcast
6208       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6209         Ld.getValueType().getSizeInBits() >= 32;
6210       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6211           !hasRegVer))
6212         return SDValue();
6213       break;
6214     }
6215   }
6216
6217   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6218   bool IsGE256 = (VT.getSizeInBits() >= 256);
6219
6220   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6221   // instruction to save 8 or more bytes of constant pool data.
6222   // TODO: If multiple splats are generated to load the same constant,
6223   // it may be detrimental to overall size. There needs to be a way to detect
6224   // that condition to know if this is truly a size win.
6225   const Function *F = DAG.getMachineFunction().getFunction();
6226   bool OptForSize = F->getAttributes().
6227     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6228
6229   // Handle broadcasting a single constant scalar from the constant pool
6230   // into a vector.
6231   // On Sandybridge (no AVX2), it is still better to load a constant vector
6232   // from the constant pool and not to broadcast it from a scalar.
6233   // But override that restriction when optimizing for size.
6234   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6235   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6236     EVT CVT = Ld.getValueType();
6237     assert(!CVT.isVector() && "Must not broadcast a vector type");
6238
6239     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6240     // For size optimization, also splat v2f64 and v2i64, and for size opt
6241     // with AVX2, also splat i8 and i16.
6242     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6243     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6244         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6245       const Constant *C = nullptr;
6246       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6247         C = CI->getConstantIntValue();
6248       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6249         C = CF->getConstantFPValue();
6250
6251       assert(C && "Invalid constant type");
6252
6253       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6254       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6255       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6256       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6257                        MachinePointerInfo::getConstantPool(),
6258                        false, false, false, Alignment);
6259
6260       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6261     }
6262   }
6263
6264   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6265
6266   // Handle AVX2 in-register broadcasts.
6267   if (!IsLoad && Subtarget->hasInt256() &&
6268       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6269     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6270
6271   // The scalar source must be a normal load.
6272   if (!IsLoad)
6273     return SDValue();
6274
6275   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6276       (Subtarget->hasVLX() && ScalarSize == 64))
6277     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6278
6279   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6280   // double since there is no vbroadcastsd xmm
6281   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6282     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6283       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6284   }
6285
6286   // Unsupported broadcast.
6287   return SDValue();
6288 }
6289
6290 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6291 /// underlying vector and index.
6292 ///
6293 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6294 /// index.
6295 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6296                                          SDValue ExtIdx) {
6297   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6298   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6299     return Idx;
6300
6301   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6302   // lowered this:
6303   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6304   // to:
6305   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6306   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6307   //                           undef)
6308   //                       Constant<0>)
6309   // In this case the vector is the extract_subvector expression and the index
6310   // is 2, as specified by the shuffle.
6311   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6312   SDValue ShuffleVec = SVOp->getOperand(0);
6313   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6314   assert(ShuffleVecVT.getVectorElementType() ==
6315          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6316
6317   int ShuffleIdx = SVOp->getMaskElt(Idx);
6318   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6319     ExtractedFromVec = ShuffleVec;
6320     return ShuffleIdx;
6321   }
6322   return Idx;
6323 }
6324
6325 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6326   MVT VT = Op.getSimpleValueType();
6327
6328   // Skip if insert_vec_elt is not supported.
6329   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6330   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6331     return SDValue();
6332
6333   SDLoc DL(Op);
6334   unsigned NumElems = Op.getNumOperands();
6335
6336   SDValue VecIn1;
6337   SDValue VecIn2;
6338   SmallVector<unsigned, 4> InsertIndices;
6339   SmallVector<int, 8> Mask(NumElems, -1);
6340
6341   for (unsigned i = 0; i != NumElems; ++i) {
6342     unsigned Opc = Op.getOperand(i).getOpcode();
6343
6344     if (Opc == ISD::UNDEF)
6345       continue;
6346
6347     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6348       // Quit if more than 1 elements need inserting.
6349       if (InsertIndices.size() > 1)
6350         return SDValue();
6351
6352       InsertIndices.push_back(i);
6353       continue;
6354     }
6355
6356     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6357     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6358     // Quit if non-constant index.
6359     if (!isa<ConstantSDNode>(ExtIdx))
6360       return SDValue();
6361     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6362
6363     // Quit if extracted from vector of different type.
6364     if (ExtractedFromVec.getValueType() != VT)
6365       return SDValue();
6366
6367     if (!VecIn1.getNode())
6368       VecIn1 = ExtractedFromVec;
6369     else if (VecIn1 != ExtractedFromVec) {
6370       if (!VecIn2.getNode())
6371         VecIn2 = ExtractedFromVec;
6372       else if (VecIn2 != ExtractedFromVec)
6373         // Quit if more than 2 vectors to shuffle
6374         return SDValue();
6375     }
6376
6377     if (ExtractedFromVec == VecIn1)
6378       Mask[i] = Idx;
6379     else if (ExtractedFromVec == VecIn2)
6380       Mask[i] = Idx + NumElems;
6381   }
6382
6383   if (!VecIn1.getNode())
6384     return SDValue();
6385
6386   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6387   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6388   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6389     unsigned Idx = InsertIndices[i];
6390     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6391                      DAG.getIntPtrConstant(Idx));
6392   }
6393
6394   return NV;
6395 }
6396
6397 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6398 SDValue
6399 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6400
6401   MVT VT = Op.getSimpleValueType();
6402   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6403          "Unexpected type in LowerBUILD_VECTORvXi1!");
6404
6405   SDLoc dl(Op);
6406   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6407     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6408     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6409     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6410   }
6411
6412   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6413     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6414     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6415     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6416   }
6417
6418   bool AllContants = true;
6419   uint64_t Immediate = 0;
6420   int NonConstIdx = -1;
6421   bool IsSplat = true;
6422   unsigned NumNonConsts = 0;
6423   unsigned NumConsts = 0;
6424   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6425     SDValue In = Op.getOperand(idx);
6426     if (In.getOpcode() == ISD::UNDEF)
6427       continue;
6428     if (!isa<ConstantSDNode>(In)) {
6429       AllContants = false;
6430       NonConstIdx = idx;
6431       NumNonConsts++;
6432     } else {
6433       NumConsts++;
6434       if (cast<ConstantSDNode>(In)->getZExtValue())
6435       Immediate |= (1ULL << idx);
6436     }
6437     if (In != Op.getOperand(0))
6438       IsSplat = false;
6439   }
6440
6441   if (AllContants) {
6442     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6443       DAG.getConstant(Immediate, MVT::i16));
6444     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6445                        DAG.getIntPtrConstant(0));
6446   }
6447
6448   if (NumNonConsts == 1 && NonConstIdx != 0) {
6449     SDValue DstVec;
6450     if (NumConsts) {
6451       SDValue VecAsImm = DAG.getConstant(Immediate,
6452                                          MVT::getIntegerVT(VT.getSizeInBits()));
6453       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6454     }
6455     else
6456       DstVec = DAG.getUNDEF(VT);
6457     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6458                        Op.getOperand(NonConstIdx),
6459                        DAG.getIntPtrConstant(NonConstIdx));
6460   }
6461   if (!IsSplat && (NonConstIdx != 0))
6462     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6463   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6464   SDValue Select;
6465   if (IsSplat)
6466     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6467                           DAG.getConstant(-1, SelectVT),
6468                           DAG.getConstant(0, SelectVT));
6469   else
6470     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6471                          DAG.getConstant((Immediate | 1), SelectVT),
6472                          DAG.getConstant(Immediate, SelectVT));
6473   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6474 }
6475
6476 /// \brief Return true if \p N implements a horizontal binop and return the
6477 /// operands for the horizontal binop into V0 and V1.
6478 ///
6479 /// This is a helper function of PerformBUILD_VECTORCombine.
6480 /// This function checks that the build_vector \p N in input implements a
6481 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6482 /// operation to match.
6483 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6484 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6485 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6486 /// arithmetic sub.
6487 ///
6488 /// This function only analyzes elements of \p N whose indices are
6489 /// in range [BaseIdx, LastIdx).
6490 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6491                               SelectionDAG &DAG,
6492                               unsigned BaseIdx, unsigned LastIdx,
6493                               SDValue &V0, SDValue &V1) {
6494   EVT VT = N->getValueType(0);
6495
6496   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6497   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6498          "Invalid Vector in input!");
6499
6500   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6501   bool CanFold = true;
6502   unsigned ExpectedVExtractIdx = BaseIdx;
6503   unsigned NumElts = LastIdx - BaseIdx;
6504   V0 = DAG.getUNDEF(VT);
6505   V1 = DAG.getUNDEF(VT);
6506
6507   // Check if N implements a horizontal binop.
6508   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6509     SDValue Op = N->getOperand(i + BaseIdx);
6510
6511     // Skip UNDEFs.
6512     if (Op->getOpcode() == ISD::UNDEF) {
6513       // Update the expected vector extract index.
6514       if (i * 2 == NumElts)
6515         ExpectedVExtractIdx = BaseIdx;
6516       ExpectedVExtractIdx += 2;
6517       continue;
6518     }
6519
6520     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6521
6522     if (!CanFold)
6523       break;
6524
6525     SDValue Op0 = Op.getOperand(0);
6526     SDValue Op1 = Op.getOperand(1);
6527
6528     // Try to match the following pattern:
6529     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6530     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6531         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6532         Op0.getOperand(0) == Op1.getOperand(0) &&
6533         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6534         isa<ConstantSDNode>(Op1.getOperand(1)));
6535     if (!CanFold)
6536       break;
6537
6538     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6539     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6540
6541     if (i * 2 < NumElts) {
6542       if (V0.getOpcode() == ISD::UNDEF)
6543         V0 = Op0.getOperand(0);
6544     } else {
6545       if (V1.getOpcode() == ISD::UNDEF)
6546         V1 = Op0.getOperand(0);
6547       if (i * 2 == NumElts)
6548         ExpectedVExtractIdx = BaseIdx;
6549     }
6550
6551     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6552     if (I0 == ExpectedVExtractIdx)
6553       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6554     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6555       // Try to match the following dag sequence:
6556       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6557       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6558     } else
6559       CanFold = false;
6560
6561     ExpectedVExtractIdx += 2;
6562   }
6563
6564   return CanFold;
6565 }
6566
6567 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6568 /// a concat_vector.
6569 ///
6570 /// This is a helper function of PerformBUILD_VECTORCombine.
6571 /// This function expects two 256-bit vectors called V0 and V1.
6572 /// At first, each vector is split into two separate 128-bit vectors.
6573 /// Then, the resulting 128-bit vectors are used to implement two
6574 /// horizontal binary operations.
6575 ///
6576 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6577 ///
6578 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6579 /// the two new horizontal binop.
6580 /// When Mode is set, the first horizontal binop dag node would take as input
6581 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6582 /// horizontal binop dag node would take as input the lower 128-bit of V1
6583 /// and the upper 128-bit of V1.
6584 ///   Example:
6585 ///     HADD V0_LO, V0_HI
6586 ///     HADD V1_LO, V1_HI
6587 ///
6588 /// Otherwise, the first horizontal binop dag node takes as input the lower
6589 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6590 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6591 ///   Example:
6592 ///     HADD V0_LO, V1_LO
6593 ///     HADD V0_HI, V1_HI
6594 ///
6595 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6596 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6597 /// the upper 128-bits of the result.
6598 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6599                                      SDLoc DL, SelectionDAG &DAG,
6600                                      unsigned X86Opcode, bool Mode,
6601                                      bool isUndefLO, bool isUndefHI) {
6602   EVT VT = V0.getValueType();
6603   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6604          "Invalid nodes in input!");
6605
6606   unsigned NumElts = VT.getVectorNumElements();
6607   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6608   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6609   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6610   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6611   EVT NewVT = V0_LO.getValueType();
6612
6613   SDValue LO = DAG.getUNDEF(NewVT);
6614   SDValue HI = DAG.getUNDEF(NewVT);
6615
6616   if (Mode) {
6617     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6618     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6619       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6620     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6621       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6622   } else {
6623     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6624     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6625                        V1_LO->getOpcode() != ISD::UNDEF))
6626       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6627
6628     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6629                        V1_HI->getOpcode() != ISD::UNDEF))
6630       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6631   }
6632
6633   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6634 }
6635
6636 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6637 /// sequence of 'vadd + vsub + blendi'.
6638 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6639                            const X86Subtarget *Subtarget) {
6640   SDLoc DL(BV);
6641   EVT VT = BV->getValueType(0);
6642   unsigned NumElts = VT.getVectorNumElements();
6643   SDValue InVec0 = DAG.getUNDEF(VT);
6644   SDValue InVec1 = DAG.getUNDEF(VT);
6645
6646   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6647           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6648
6649   // Odd-numbered elements in the input build vector are obtained from
6650   // adding two integer/float elements.
6651   // Even-numbered elements in the input build vector are obtained from
6652   // subtracting two integer/float elements.
6653   unsigned ExpectedOpcode = ISD::FSUB;
6654   unsigned NextExpectedOpcode = ISD::FADD;
6655   bool AddFound = false;
6656   bool SubFound = false;
6657
6658   for (unsigned i = 0, e = NumElts; i != e; i++) {
6659     SDValue Op = BV->getOperand(i);
6660
6661     // Skip 'undef' values.
6662     unsigned Opcode = Op.getOpcode();
6663     if (Opcode == ISD::UNDEF) {
6664       std::swap(ExpectedOpcode, NextExpectedOpcode);
6665       continue;
6666     }
6667
6668     // Early exit if we found an unexpected opcode.
6669     if (Opcode != ExpectedOpcode)
6670       return SDValue();
6671
6672     SDValue Op0 = Op.getOperand(0);
6673     SDValue Op1 = Op.getOperand(1);
6674
6675     // Try to match the following pattern:
6676     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6677     // Early exit if we cannot match that sequence.
6678     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6679         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6680         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6681         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6682         Op0.getOperand(1) != Op1.getOperand(1))
6683       return SDValue();
6684
6685     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6686     if (I0 != i)
6687       return SDValue();
6688
6689     // We found a valid add/sub node. Update the information accordingly.
6690     if (i & 1)
6691       AddFound = true;
6692     else
6693       SubFound = true;
6694
6695     // Update InVec0 and InVec1.
6696     if (InVec0.getOpcode() == ISD::UNDEF)
6697       InVec0 = Op0.getOperand(0);
6698     if (InVec1.getOpcode() == ISD::UNDEF)
6699       InVec1 = Op1.getOperand(0);
6700
6701     // Make sure that operands in input to each add/sub node always
6702     // come from a same pair of vectors.
6703     if (InVec0 != Op0.getOperand(0)) {
6704       if (ExpectedOpcode == ISD::FSUB)
6705         return SDValue();
6706
6707       // FADD is commutable. Try to commute the operands
6708       // and then test again.
6709       std::swap(Op0, Op1);
6710       if (InVec0 != Op0.getOperand(0))
6711         return SDValue();
6712     }
6713
6714     if (InVec1 != Op1.getOperand(0))
6715       return SDValue();
6716
6717     // Update the pair of expected opcodes.
6718     std::swap(ExpectedOpcode, NextExpectedOpcode);
6719   }
6720
6721   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6722   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6723       InVec1.getOpcode() != ISD::UNDEF)
6724     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6725
6726   return SDValue();
6727 }
6728
6729 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6730                                           const X86Subtarget *Subtarget) {
6731   SDLoc DL(N);
6732   EVT VT = N->getValueType(0);
6733   unsigned NumElts = VT.getVectorNumElements();
6734   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6735   SDValue InVec0, InVec1;
6736
6737   // Try to match an ADDSUB.
6738   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6739       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6740     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6741     if (Value.getNode())
6742       return Value;
6743   }
6744
6745   // Try to match horizontal ADD/SUB.
6746   unsigned NumUndefsLO = 0;
6747   unsigned NumUndefsHI = 0;
6748   unsigned Half = NumElts/2;
6749
6750   // Count the number of UNDEF operands in the build_vector in input.
6751   for (unsigned i = 0, e = Half; i != e; ++i)
6752     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6753       NumUndefsLO++;
6754
6755   for (unsigned i = Half, e = NumElts; i != e; ++i)
6756     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6757       NumUndefsHI++;
6758
6759   // Early exit if this is either a build_vector of all UNDEFs or all the
6760   // operands but one are UNDEF.
6761   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6762     return SDValue();
6763
6764   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6765     // Try to match an SSE3 float HADD/HSUB.
6766     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6767       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6768
6769     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6770       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6771   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6772     // Try to match an SSSE3 integer HADD/HSUB.
6773     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6774       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6775
6776     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6777       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6778   }
6779
6780   if (!Subtarget->hasAVX())
6781     return SDValue();
6782
6783   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6784     // Try to match an AVX horizontal add/sub of packed single/double
6785     // precision floating point values from 256-bit vectors.
6786     SDValue InVec2, InVec3;
6787     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6788         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6789         ((InVec0.getOpcode() == ISD::UNDEF ||
6790           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6791         ((InVec1.getOpcode() == ISD::UNDEF ||
6792           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6793       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6794
6795     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6796         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6797         ((InVec0.getOpcode() == ISD::UNDEF ||
6798           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6799         ((InVec1.getOpcode() == ISD::UNDEF ||
6800           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6801       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6802   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6803     // Try to match an AVX2 horizontal add/sub of signed integers.
6804     SDValue InVec2, InVec3;
6805     unsigned X86Opcode;
6806     bool CanFold = true;
6807
6808     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6809         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6810         ((InVec0.getOpcode() == ISD::UNDEF ||
6811           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6812         ((InVec1.getOpcode() == ISD::UNDEF ||
6813           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6814       X86Opcode = X86ISD::HADD;
6815     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6816         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6817         ((InVec0.getOpcode() == ISD::UNDEF ||
6818           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6819         ((InVec1.getOpcode() == ISD::UNDEF ||
6820           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6821       X86Opcode = X86ISD::HSUB;
6822     else
6823       CanFold = false;
6824
6825     if (CanFold) {
6826       // Fold this build_vector into a single horizontal add/sub.
6827       // Do this only if the target has AVX2.
6828       if (Subtarget->hasAVX2())
6829         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6830
6831       // Do not try to expand this build_vector into a pair of horizontal
6832       // add/sub if we can emit a pair of scalar add/sub.
6833       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6834         return SDValue();
6835
6836       // Convert this build_vector into a pair of horizontal binop followed by
6837       // a concat vector.
6838       bool isUndefLO = NumUndefsLO == Half;
6839       bool isUndefHI = NumUndefsHI == Half;
6840       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6841                                    isUndefLO, isUndefHI);
6842     }
6843   }
6844
6845   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6846        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6847     unsigned X86Opcode;
6848     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6849       X86Opcode = X86ISD::HADD;
6850     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6851       X86Opcode = X86ISD::HSUB;
6852     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6853       X86Opcode = X86ISD::FHADD;
6854     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6855       X86Opcode = X86ISD::FHSUB;
6856     else
6857       return SDValue();
6858
6859     // Don't try to expand this build_vector into a pair of horizontal add/sub
6860     // if we can simply emit a pair of scalar add/sub.
6861     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6862       return SDValue();
6863
6864     // Convert this build_vector into two horizontal add/sub followed by
6865     // a concat vector.
6866     bool isUndefLO = NumUndefsLO == Half;
6867     bool isUndefHI = NumUndefsHI == Half;
6868     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6869                                  isUndefLO, isUndefHI);
6870   }
6871
6872   return SDValue();
6873 }
6874
6875 SDValue
6876 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6877   SDLoc dl(Op);
6878
6879   MVT VT = Op.getSimpleValueType();
6880   MVT ExtVT = VT.getVectorElementType();
6881   unsigned NumElems = Op.getNumOperands();
6882
6883   // Generate vectors for predicate vectors.
6884   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6885     return LowerBUILD_VECTORvXi1(Op, DAG);
6886
6887   // Vectors containing all zeros can be matched by pxor and xorps later
6888   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6889     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6890     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6891     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6892       return Op;
6893
6894     return getZeroVector(VT, Subtarget, DAG, dl);
6895   }
6896
6897   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6898   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6899   // vpcmpeqd on 256-bit vectors.
6900   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6901     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6902       return Op;
6903
6904     if (!VT.is512BitVector())
6905       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6906   }
6907
6908   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6909   if (Broadcast.getNode())
6910     return Broadcast;
6911
6912   unsigned EVTBits = ExtVT.getSizeInBits();
6913
6914   unsigned NumZero  = 0;
6915   unsigned NumNonZero = 0;
6916   unsigned NonZeros = 0;
6917   bool IsAllConstants = true;
6918   SmallSet<SDValue, 8> Values;
6919   for (unsigned i = 0; i < NumElems; ++i) {
6920     SDValue Elt = Op.getOperand(i);
6921     if (Elt.getOpcode() == ISD::UNDEF)
6922       continue;
6923     Values.insert(Elt);
6924     if (Elt.getOpcode() != ISD::Constant &&
6925         Elt.getOpcode() != ISD::ConstantFP)
6926       IsAllConstants = false;
6927     if (X86::isZeroNode(Elt))
6928       NumZero++;
6929     else {
6930       NonZeros |= (1 << i);
6931       NumNonZero++;
6932     }
6933   }
6934
6935   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6936   if (NumNonZero == 0)
6937     return DAG.getUNDEF(VT);
6938
6939   // Special case for single non-zero, non-undef, element.
6940   if (NumNonZero == 1) {
6941     unsigned Idx = countTrailingZeros(NonZeros);
6942     SDValue Item = Op.getOperand(Idx);
6943
6944     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6945     // the value are obviously zero, truncate the value to i32 and do the
6946     // insertion that way.  Only do this if the value is non-constant or if the
6947     // value is a constant being inserted into element 0.  It is cheaper to do
6948     // a constant pool load than it is to do a movd + shuffle.
6949     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6950         (!IsAllConstants || Idx == 0)) {
6951       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6952         // Handle SSE only.
6953         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6954         EVT VecVT = MVT::v4i32;
6955         unsigned VecElts = 4;
6956
6957         // Truncate the value (which may itself be a constant) to i32, and
6958         // convert it to a vector with movd (S2V+shuffle to zero extend).
6959         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6960         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6961
6962         // If using the new shuffle lowering, just directly insert this.
6963         if (ExperimentalVectorShuffleLowering)
6964           return DAG.getNode(
6965               ISD::BITCAST, dl, VT,
6966               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6967
6968         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6969
6970         // Now we have our 32-bit value zero extended in the low element of
6971         // a vector.  If Idx != 0, swizzle it into place.
6972         if (Idx != 0) {
6973           SmallVector<int, 4> Mask;
6974           Mask.push_back(Idx);
6975           for (unsigned i = 1; i != VecElts; ++i)
6976             Mask.push_back(i);
6977           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6978                                       &Mask[0]);
6979         }
6980         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6981       }
6982     }
6983
6984     // If we have a constant or non-constant insertion into the low element of
6985     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6986     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6987     // depending on what the source datatype is.
6988     if (Idx == 0) {
6989       if (NumZero == 0)
6990         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6991
6992       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6993           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6994         if (VT.is256BitVector() || VT.is512BitVector()) {
6995           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6996           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6997                              Item, DAG.getIntPtrConstant(0));
6998         }
6999         assert(VT.is128BitVector() && "Expected an SSE value type!");
7000         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7001         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7002         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7003       }
7004
7005       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7006         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7007         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7008         if (VT.is256BitVector()) {
7009           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7010           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7011         } else {
7012           assert(VT.is128BitVector() && "Expected an SSE value type!");
7013           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7014         }
7015         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7016       }
7017     }
7018
7019     // Is it a vector logical left shift?
7020     if (NumElems == 2 && Idx == 1 &&
7021         X86::isZeroNode(Op.getOperand(0)) &&
7022         !X86::isZeroNode(Op.getOperand(1))) {
7023       unsigned NumBits = VT.getSizeInBits();
7024       return getVShift(true, VT,
7025                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7026                                    VT, Op.getOperand(1)),
7027                        NumBits/2, DAG, *this, dl);
7028     }
7029
7030     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7031       return SDValue();
7032
7033     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7034     // is a non-constant being inserted into an element other than the low one,
7035     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7036     // movd/movss) to move this into the low element, then shuffle it into
7037     // place.
7038     if (EVTBits == 32) {
7039       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7040
7041       // If using the new shuffle lowering, just directly insert this.
7042       if (ExperimentalVectorShuffleLowering)
7043         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7044
7045       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7046       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7047       SmallVector<int, 8> MaskVec;
7048       for (unsigned i = 0; i != NumElems; ++i)
7049         MaskVec.push_back(i == Idx ? 0 : 1);
7050       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7051     }
7052   }
7053
7054   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7055   if (Values.size() == 1) {
7056     if (EVTBits == 32) {
7057       // Instead of a shuffle like this:
7058       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7059       // Check if it's possible to issue this instead.
7060       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7061       unsigned Idx = countTrailingZeros(NonZeros);
7062       SDValue Item = Op.getOperand(Idx);
7063       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7064         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7065     }
7066     return SDValue();
7067   }
7068
7069   // A vector full of immediates; various special cases are already
7070   // handled, so this is best done with a single constant-pool load.
7071   if (IsAllConstants)
7072     return SDValue();
7073
7074   // For AVX-length vectors, see if we can use a vector load to get all of the
7075   // elements, otherwise build the individual 128-bit pieces and use
7076   // shuffles to put them in place.
7077   if (VT.is256BitVector() || VT.is512BitVector()) {
7078     SmallVector<SDValue, 64> V;
7079     for (unsigned i = 0; i != NumElems; ++i)
7080       V.push_back(Op.getOperand(i));
7081
7082     // Check for a build vector of consecutive loads.
7083     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7084       return LD;
7085
7086     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7087
7088     // Build both the lower and upper subvector.
7089     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7090                                 makeArrayRef(&V[0], NumElems/2));
7091     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7092                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7093
7094     // Recreate the wider vector with the lower and upper part.
7095     if (VT.is256BitVector())
7096       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7097     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7098   }
7099
7100   // Let legalizer expand 2-wide build_vectors.
7101   if (EVTBits == 64) {
7102     if (NumNonZero == 1) {
7103       // One half is zero or undef.
7104       unsigned Idx = countTrailingZeros(NonZeros);
7105       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7106                                  Op.getOperand(Idx));
7107       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7108     }
7109     return SDValue();
7110   }
7111
7112   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7113   if (EVTBits == 8 && NumElems == 16) {
7114     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7115                                         Subtarget, *this);
7116     if (V.getNode()) return V;
7117   }
7118
7119   if (EVTBits == 16 && NumElems == 8) {
7120     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7121                                       Subtarget, *this);
7122     if (V.getNode()) return V;
7123   }
7124
7125   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7126   if (EVTBits == 32 && NumElems == 4) {
7127     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7128     if (V.getNode())
7129       return V;
7130   }
7131
7132   // If element VT is == 32 bits, turn it into a number of shuffles.
7133   SmallVector<SDValue, 8> V(NumElems);
7134   if (NumElems == 4 && NumZero > 0) {
7135     for (unsigned i = 0; i < 4; ++i) {
7136       bool isZero = !(NonZeros & (1 << i));
7137       if (isZero)
7138         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7139       else
7140         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7141     }
7142
7143     for (unsigned i = 0; i < 2; ++i) {
7144       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7145         default: break;
7146         case 0:
7147           V[i] = V[i*2];  // Must be a zero vector.
7148           break;
7149         case 1:
7150           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7151           break;
7152         case 2:
7153           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7154           break;
7155         case 3:
7156           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7157           break;
7158       }
7159     }
7160
7161     bool Reverse1 = (NonZeros & 0x3) == 2;
7162     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7163     int MaskVec[] = {
7164       Reverse1 ? 1 : 0,
7165       Reverse1 ? 0 : 1,
7166       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7167       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7168     };
7169     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7170   }
7171
7172   if (Values.size() > 1 && VT.is128BitVector()) {
7173     // Check for a build vector of consecutive loads.
7174     for (unsigned i = 0; i < NumElems; ++i)
7175       V[i] = Op.getOperand(i);
7176
7177     // Check for elements which are consecutive loads.
7178     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7179     if (LD.getNode())
7180       return LD;
7181
7182     // Check for a build vector from mostly shuffle plus few inserting.
7183     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7184     if (Sh.getNode())
7185       return Sh;
7186
7187     // For SSE 4.1, use insertps to put the high elements into the low element.
7188     if (Subtarget->hasSSE41()) {
7189       SDValue Result;
7190       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7191         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7192       else
7193         Result = DAG.getUNDEF(VT);
7194
7195       for (unsigned i = 1; i < NumElems; ++i) {
7196         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7197         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7198                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7199       }
7200       return Result;
7201     }
7202
7203     // Otherwise, expand into a number of unpckl*, start by extending each of
7204     // our (non-undef) elements to the full vector width with the element in the
7205     // bottom slot of the vector (which generates no code for SSE).
7206     for (unsigned i = 0; i < NumElems; ++i) {
7207       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7208         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7209       else
7210         V[i] = DAG.getUNDEF(VT);
7211     }
7212
7213     // Next, we iteratively mix elements, e.g. for v4f32:
7214     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7215     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7216     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7217     unsigned EltStride = NumElems >> 1;
7218     while (EltStride != 0) {
7219       for (unsigned i = 0; i < EltStride; ++i) {
7220         // If V[i+EltStride] is undef and this is the first round of mixing,
7221         // then it is safe to just drop this shuffle: V[i] is already in the
7222         // right place, the one element (since it's the first round) being
7223         // inserted as undef can be dropped.  This isn't safe for successive
7224         // rounds because they will permute elements within both vectors.
7225         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7226             EltStride == NumElems/2)
7227           continue;
7228
7229         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7230       }
7231       EltStride >>= 1;
7232     }
7233     return V[0];
7234   }
7235   return SDValue();
7236 }
7237
7238 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7239 // to create 256-bit vectors from two other 128-bit ones.
7240 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7241   SDLoc dl(Op);
7242   MVT ResVT = Op.getSimpleValueType();
7243
7244   assert((ResVT.is256BitVector() ||
7245           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7246
7247   SDValue V1 = Op.getOperand(0);
7248   SDValue V2 = Op.getOperand(1);
7249   unsigned NumElems = ResVT.getVectorNumElements();
7250   if(ResVT.is256BitVector())
7251     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7252
7253   if (Op.getNumOperands() == 4) {
7254     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7255                                 ResVT.getVectorNumElements()/2);
7256     SDValue V3 = Op.getOperand(2);
7257     SDValue V4 = Op.getOperand(3);
7258     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7259       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7260   }
7261   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7262 }
7263
7264 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7265   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7266   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7267          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7268           Op.getNumOperands() == 4)));
7269
7270   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7271   // from two other 128-bit ones.
7272
7273   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7274   return LowerAVXCONCAT_VECTORS(Op, DAG);
7275 }
7276
7277
7278 //===----------------------------------------------------------------------===//
7279 // Vector shuffle lowering
7280 //
7281 // This is an experimental code path for lowering vector shuffles on x86. It is
7282 // designed to handle arbitrary vector shuffles and blends, gracefully
7283 // degrading performance as necessary. It works hard to recognize idiomatic
7284 // shuffles and lower them to optimal instruction patterns without leaving
7285 // a framework that allows reasonably efficient handling of all vector shuffle
7286 // patterns.
7287 //===----------------------------------------------------------------------===//
7288
7289 /// \brief Tiny helper function to identify a no-op mask.
7290 ///
7291 /// This is a somewhat boring predicate function. It checks whether the mask
7292 /// array input, which is assumed to be a single-input shuffle mask of the kind
7293 /// used by the X86 shuffle instructions (not a fully general
7294 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7295 /// in-place shuffle are 'no-op's.
7296 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7297   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7298     if (Mask[i] != -1 && Mask[i] != i)
7299       return false;
7300   return true;
7301 }
7302
7303 /// \brief Helper function to classify a mask as a single-input mask.
7304 ///
7305 /// This isn't a generic single-input test because in the vector shuffle
7306 /// lowering we canonicalize single inputs to be the first input operand. This
7307 /// means we can more quickly test for a single input by only checking whether
7308 /// an input from the second operand exists. We also assume that the size of
7309 /// mask corresponds to the size of the input vectors which isn't true in the
7310 /// fully general case.
7311 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7312   for (int M : Mask)
7313     if (M >= (int)Mask.size())
7314       return false;
7315   return true;
7316 }
7317
7318 /// \brief Test whether there are elements crossing 128-bit lanes in this
7319 /// shuffle mask.
7320 ///
7321 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7322 /// and we routinely test for these.
7323 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7324   int LaneSize = 128 / VT.getScalarSizeInBits();
7325   int Size = Mask.size();
7326   for (int i = 0; i < Size; ++i)
7327     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7328       return true;
7329   return false;
7330 }
7331
7332 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7333 ///
7334 /// This checks a shuffle mask to see if it is performing the same
7335 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7336 /// that it is also not lane-crossing. It may however involve a blend from the
7337 /// same lane of a second vector.
7338 ///
7339 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7340 /// non-trivial to compute in the face of undef lanes. The representation is
7341 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7342 /// entries from both V1 and V2 inputs to the wider mask.
7343 static bool
7344 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7345                                 SmallVectorImpl<int> &RepeatedMask) {
7346   int LaneSize = 128 / VT.getScalarSizeInBits();
7347   RepeatedMask.resize(LaneSize, -1);
7348   int Size = Mask.size();
7349   for (int i = 0; i < Size; ++i) {
7350     if (Mask[i] < 0)
7351       continue;
7352     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7353       // This entry crosses lanes, so there is no way to model this shuffle.
7354       return false;
7355
7356     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7357     if (RepeatedMask[i % LaneSize] == -1)
7358       // This is the first non-undef entry in this slot of a 128-bit lane.
7359       RepeatedMask[i % LaneSize] =
7360           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7361     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7362       // Found a mismatch with the repeated mask.
7363       return false;
7364   }
7365   return true;
7366 }
7367
7368 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7369 // 2013 will allow us to use it as a non-type template parameter.
7370 namespace {
7371
7372 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7373 ///
7374 /// See its documentation for details.
7375 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7376   if (Mask.size() != Args.size())
7377     return false;
7378   for (int i = 0, e = Mask.size(); i < e; ++i) {
7379     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7380     if (Mask[i] != -1 && Mask[i] != *Args[i])
7381       return false;
7382   }
7383   return true;
7384 }
7385
7386 } // namespace
7387
7388 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7389 /// arguments.
7390 ///
7391 /// This is a fast way to test a shuffle mask against a fixed pattern:
7392 ///
7393 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7394 ///
7395 /// It returns true if the mask is exactly as wide as the argument list, and
7396 /// each element of the mask is either -1 (signifying undef) or the value given
7397 /// in the argument.
7398 static const VariadicFunction1<
7399     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7400
7401 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7402 ///
7403 /// This helper function produces an 8-bit shuffle immediate corresponding to
7404 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7405 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7406 /// example.
7407 ///
7408 /// NB: We rely heavily on "undef" masks preserving the input lane.
7409 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7410                                           SelectionDAG &DAG) {
7411   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7412   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7413   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7414   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7415   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7416
7417   unsigned Imm = 0;
7418   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7419   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7420   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7421   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7422   return DAG.getConstant(Imm, MVT::i8);
7423 }
7424
7425 /// \brief Try to emit a blend instruction for a shuffle.
7426 ///
7427 /// This doesn't do any checks for the availability of instructions for blending
7428 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7429 /// be matched in the backend with the type given. What it does check for is
7430 /// that the shuffle mask is in fact a blend.
7431 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7432                                          SDValue V2, ArrayRef<int> Mask,
7433                                          const X86Subtarget *Subtarget,
7434                                          SelectionDAG &DAG) {
7435
7436   unsigned BlendMask = 0;
7437   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7438     if (Mask[i] >= Size) {
7439       if (Mask[i] != i + Size)
7440         return SDValue(); // Shuffled V2 input!
7441       BlendMask |= 1u << i;
7442       continue;
7443     }
7444     if (Mask[i] >= 0 && Mask[i] != i)
7445       return SDValue(); // Shuffled V1 input!
7446   }
7447   switch (VT.SimpleTy) {
7448   case MVT::v2f64:
7449   case MVT::v4f32:
7450   case MVT::v4f64:
7451   case MVT::v8f32:
7452     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7453                        DAG.getConstant(BlendMask, MVT::i8));
7454
7455   case MVT::v4i64:
7456   case MVT::v8i32:
7457     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7458     // FALLTHROUGH
7459   case MVT::v2i64:
7460   case MVT::v4i32:
7461     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7462     // that instruction.
7463     if (Subtarget->hasAVX2()) {
7464       // Scale the blend by the number of 32-bit dwords per element.
7465       int Scale =  VT.getScalarSizeInBits() / 32;
7466       BlendMask = 0;
7467       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7468         if (Mask[i] >= Size)
7469           for (int j = 0; j < Scale; ++j)
7470             BlendMask |= 1u << (i * Scale + j);
7471
7472       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7473       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7474       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7475       return DAG.getNode(ISD::BITCAST, DL, VT,
7476                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7477                                      DAG.getConstant(BlendMask, MVT::i8)));
7478     }
7479     // FALLTHROUGH
7480   case MVT::v8i16: {
7481     // For integer shuffles we need to expand the mask and cast the inputs to
7482     // v8i16s prior to blending.
7483     int Scale = 8 / VT.getVectorNumElements();
7484     BlendMask = 0;
7485     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7486       if (Mask[i] >= Size)
7487         for (int j = 0; j < Scale; ++j)
7488           BlendMask |= 1u << (i * Scale + j);
7489
7490     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7491     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7492     return DAG.getNode(ISD::BITCAST, DL, VT,
7493                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7494                                    DAG.getConstant(BlendMask, MVT::i8)));
7495   }
7496
7497   case MVT::v16i16: {
7498     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7499     SmallVector<int, 8> RepeatedMask;
7500     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7501       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7502       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7503       BlendMask = 0;
7504       for (int i = 0; i < 8; ++i)
7505         if (RepeatedMask[i] >= 16)
7506           BlendMask |= 1u << i;
7507       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7508                          DAG.getConstant(BlendMask, MVT::i8));
7509     }
7510   }
7511     // FALLTHROUGH
7512   case MVT::v32i8: {
7513     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7514     // Scale the blend by the number of bytes per element.
7515     int Scale =  VT.getScalarSizeInBits() / 8;
7516     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7517
7518     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7519     // mix of LLVM's code generator and the x86 backend. We tell the code
7520     // generator that boolean values in the elements of an x86 vector register
7521     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7522     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7523     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7524     // of the element (the remaining are ignored) and 0 in that high bit would
7525     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7526     // the LLVM model for boolean values in vector elements gets the relevant
7527     // bit set, it is set backwards and over constrained relative to x86's
7528     // actual model.
7529     SDValue VSELECTMask[32];
7530     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7531       for (int j = 0; j < Scale; ++j)
7532         VSELECTMask[Scale * i + j] =
7533             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7534                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7535
7536     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7537     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7538     return DAG.getNode(
7539         ISD::BITCAST, DL, VT,
7540         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7541                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7542                     V1, V2));
7543   }
7544
7545   default:
7546     llvm_unreachable("Not a supported integer vector type!");
7547   }
7548 }
7549
7550 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7551 /// unblended shuffles followed by an unshuffled blend.
7552 ///
7553 /// This matches the extremely common pattern for handling combined
7554 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7555 /// operations.
7556 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7557                                                           SDValue V1,
7558                                                           SDValue V2,
7559                                                           ArrayRef<int> Mask,
7560                                                           SelectionDAG &DAG) {
7561   // Shuffle the input elements into the desired positions in V1 and V2 and
7562   // blend them together.
7563   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7564   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7565   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7566   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7567     if (Mask[i] >= 0 && Mask[i] < Size) {
7568       V1Mask[i] = Mask[i];
7569       BlendMask[i] = i;
7570     } else if (Mask[i] >= Size) {
7571       V2Mask[i] = Mask[i] - Size;
7572       BlendMask[i] = i + Size;
7573     }
7574
7575   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7576   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7577   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7578 }
7579
7580 /// \brief Try to lower a vector shuffle as a byte rotation.
7581 ///
7582 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7583 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7584 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7585 /// try to generically lower a vector shuffle through such an pattern. It
7586 /// does not check for the profitability of lowering either as PALIGNR or
7587 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7588 /// This matches shuffle vectors that look like:
7589 ///
7590 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7591 ///
7592 /// Essentially it concatenates V1 and V2, shifts right by some number of
7593 /// elements, and takes the low elements as the result. Note that while this is
7594 /// specified as a *right shift* because x86 is little-endian, it is a *left
7595 /// rotate* of the vector lanes.
7596 ///
7597 /// Note that this only handles 128-bit vector widths currently.
7598 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7599                                               SDValue V2,
7600                                               ArrayRef<int> Mask,
7601                                               const X86Subtarget *Subtarget,
7602                                               SelectionDAG &DAG) {
7603   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7604
7605   // We need to detect various ways of spelling a rotation:
7606   //   [11, 12, 13, 14, 15,  0,  1,  2]
7607   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7608   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7609   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7610   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7611   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7612   int Rotation = 0;
7613   SDValue Lo, Hi;
7614   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7615     if (Mask[i] == -1)
7616       continue;
7617     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7618
7619     // Based on the mod-Size value of this mask element determine where
7620     // a rotated vector would have started.
7621     int StartIdx = i - (Mask[i] % Size);
7622     if (StartIdx == 0)
7623       // The identity rotation isn't interesting, stop.
7624       return SDValue();
7625
7626     // If we found the tail of a vector the rotation must be the missing
7627     // front. If we found the head of a vector, it must be how much of the head.
7628     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7629
7630     if (Rotation == 0)
7631       Rotation = CandidateRotation;
7632     else if (Rotation != CandidateRotation)
7633       // The rotations don't match, so we can't match this mask.
7634       return SDValue();
7635
7636     // Compute which value this mask is pointing at.
7637     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7638
7639     // Compute which of the two target values this index should be assigned to.
7640     // This reflects whether the high elements are remaining or the low elements
7641     // are remaining.
7642     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7643
7644     // Either set up this value if we've not encountered it before, or check
7645     // that it remains consistent.
7646     if (!TargetV)
7647       TargetV = MaskV;
7648     else if (TargetV != MaskV)
7649       // This may be a rotation, but it pulls from the inputs in some
7650       // unsupported interleaving.
7651       return SDValue();
7652   }
7653
7654   // Check that we successfully analyzed the mask, and normalize the results.
7655   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7656   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7657   if (!Lo)
7658     Lo = Hi;
7659   else if (!Hi)
7660     Hi = Lo;
7661
7662   assert(VT.getSizeInBits() == 128 &&
7663          "Rotate-based lowering only supports 128-bit lowering!");
7664   assert(Mask.size() <= 16 &&
7665          "Can shuffle at most 16 bytes in a 128-bit vector!");
7666
7667   // The actual rotate instruction rotates bytes, so we need to scale the
7668   // rotation based on how many bytes are in the vector.
7669   int Scale = 16 / Mask.size();
7670
7671   // SSSE3 targets can use the palignr instruction
7672   if (Subtarget->hasSSSE3()) {
7673     // Cast the inputs to v16i8 to match PALIGNR.
7674     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7675     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7676
7677     return DAG.getNode(ISD::BITCAST, DL, VT,
7678                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7679                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7680   }
7681
7682   // Default SSE2 implementation
7683   int LoByteShift = 16 - Rotation * Scale;
7684   int HiByteShift = Rotation * Scale;
7685
7686   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7687   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7688   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7689
7690   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7691                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7692   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7693                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7694   return DAG.getNode(ISD::BITCAST, DL, VT,
7695                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7696 }
7697
7698 /// \brief Compute whether each element of a shuffle is zeroable.
7699 ///
7700 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7701 /// Either it is an undef element in the shuffle mask, the element of the input
7702 /// referenced is undef, or the element of the input referenced is known to be
7703 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7704 /// as many lanes with this technique as possible to simplify the remaining
7705 /// shuffle.
7706 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7707                                                      SDValue V1, SDValue V2) {
7708   SmallBitVector Zeroable(Mask.size(), false);
7709
7710   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7711   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7712
7713   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7714     int M = Mask[i];
7715     // Handle the easy cases.
7716     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7717       Zeroable[i] = true;
7718       continue;
7719     }
7720
7721     // If this is an index into a build_vector node, dig out the input value and
7722     // use it.
7723     SDValue V = M < Size ? V1 : V2;
7724     if (V.getOpcode() != ISD::BUILD_VECTOR)
7725       continue;
7726
7727     SDValue Input = V.getOperand(M % Size);
7728     // The UNDEF opcode check really should be dead code here, but not quite
7729     // worth asserting on (it isn't invalid, just unexpected).
7730     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7731       Zeroable[i] = true;
7732   }
7733
7734   return Zeroable;
7735 }
7736
7737 /// \brief Try to emit a bitmask instruction for a shuffle.
7738 ///
7739 /// This handles cases where we can model a blend exactly as a bitmask due to
7740 /// one of the inputs being zeroable.
7741 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7742                                            SDValue V2, ArrayRef<int> Mask,
7743                                            SelectionDAG &DAG) {
7744   MVT EltVT = VT.getScalarType();
7745   int NumEltBits = EltVT.getSizeInBits();
7746   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7747   SDValue Zero = DAG.getConstant(0, IntEltVT);
7748   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7749   if (EltVT.isFloatingPoint()) {
7750     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7751     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7752   }
7753   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7754   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7755   SDValue V;
7756   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7757     if (Zeroable[i])
7758       continue;
7759     if (Mask[i] % Size != i)
7760       return SDValue(); // Not a blend.
7761     if (!V)
7762       V = Mask[i] < Size ? V1 : V2;
7763     else if (V != (Mask[i] < Size ? V1 : V2))
7764       return SDValue(); // Can only let one input through the mask.
7765
7766     VMaskOps[i] = AllOnes;
7767   }
7768   if (!V)
7769     return SDValue(); // No non-zeroable elements!
7770
7771   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7772   V = DAG.getNode(VT.isFloatingPoint()
7773                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7774                   DL, VT, V, VMask);
7775   return V;
7776 }
7777
7778 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7779 ///
7780 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7781 /// byte-shift instructions. The mask must consist of a shifted sequential
7782 /// shuffle from one of the input vectors and zeroable elements for the
7783 /// remaining 'shifted in' elements.
7784 ///
7785 /// Note that this only handles 128-bit vector widths currently.
7786 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7787                                              SDValue V2, ArrayRef<int> Mask,
7788                                              SelectionDAG &DAG) {
7789   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7790
7791   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7792
7793   int Size = Mask.size();
7794   int Scale = 16 / Size;
7795
7796   for (int Shift = 1; Shift < Size; Shift++) {
7797     int ByteShift = Shift * Scale;
7798
7799     // PSRLDQ : (little-endian) right byte shift
7800     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7801     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7802     // [  1, 2, -1, -1, -1, -1, zz, zz]
7803     bool ZeroableRight = true;
7804     for (int i = Size - Shift; i < Size; i++) {
7805       ZeroableRight &= Zeroable[i];
7806     }
7807
7808     if (ZeroableRight) {
7809       bool ValidShiftRight1 =
7810           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7811       bool ValidShiftRight2 =
7812           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7813
7814       if (ValidShiftRight1 || ValidShiftRight2) {
7815         // Cast the inputs to v2i64 to match PSRLDQ.
7816         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7817         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7818         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7819                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7820         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7821       }
7822     }
7823
7824     // PSLLDQ : (little-endian) left byte shift
7825     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7826     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7827     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7828     bool ZeroableLeft = true;
7829     for (int i = 0; i < Shift; i++) {
7830       ZeroableLeft &= Zeroable[i];
7831     }
7832
7833     if (ZeroableLeft) {
7834       bool ValidShiftLeft1 =
7835           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7836       bool ValidShiftLeft2 =
7837           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7838
7839       if (ValidShiftLeft1 || ValidShiftLeft2) {
7840         // Cast the inputs to v2i64 to match PSLLDQ.
7841         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7842         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7843         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7844                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7845         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7846       }
7847     }
7848   }
7849
7850   return SDValue();
7851 }
7852
7853 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7854 ///
7855 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7856 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7857 /// elements from one of the input vectors shuffled to the left or right
7858 /// with zeroable elements 'shifted in'.
7859 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7860                                             SDValue V2, ArrayRef<int> Mask,
7861                                             SelectionDAG &DAG) {
7862   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7863
7864   int Size = Mask.size();
7865   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7866
7867   // PSRL : (little-endian) right bit shift.
7868   // [  1, zz,  3, zz]
7869   // [ -1, -1,  7, zz]
  // PSLL : (little-endian) left bit shift.
7871   // [ zz, 0, zz,  2 ]
7872   // [ -1, 4, zz, -1 ]
7873   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7874     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7875     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7876     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7877            "Illegal integer vector type");
7878
7879     bool MatchLeft = true, MatchRight = true;
7880     for (int i = 0; i != Size; i += Scale) {
7881       for (int j = 0; j != Shift; j++) {
7882         MatchLeft &= Zeroable[i + j];
7883       }
7884       for (int j = Scale - Shift; j != Scale; j++) {
7885         MatchRight &= Zeroable[i + j];
7886       }
7887     }
7888     if (!(MatchLeft || MatchRight))
7889       return SDValue();
7890
7891     bool MatchV1 = true, MatchV2 = true;
7892     for (int i = 0; i != Size; i += Scale) {
7893       unsigned Pos = MatchLeft ? i + Shift : i;
7894       unsigned Low = MatchLeft ? i : i + Shift;
7895       unsigned Len = Scale - Shift;
7896       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7897       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7898     }
7899     if (!(MatchV1 || MatchV2))
7900       return SDValue();
7901
7902     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7903     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7904     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7905     SDValue V = MatchV1 ? V1 : V2;
7906     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7907     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7908     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7909   };
7910
7911   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7912   // keep doubling the size of the integer elements up to that. We can
7913   // then shift the elements of the integer vector by whole multiples of
7914   // their width within the elements of the larger integer vector. Test each
7915   // multiple to see if we can find a match with the moved element indices
7916   // and that the shifted in elements are all zeroable.
7917   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7918     for (int Shift = 1; Shift != Scale; Shift++)
7919       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7920         return BitShift;
7921
7922   // no match
7923   return SDValue();
7924 }
7925
7926 /// \brief Lower a vector shuffle as a zero or any extension.
7927 ///
7928 /// Given a specific number of elements, element bit width, and extension
7929 /// stride, produce either a zero or any extension based on the available
7930 /// features of the subtarget.
7931 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7932     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7933     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7934   assert(Scale > 1 && "Need a scale to extend.");
7935   int NumElements = VT.getVectorNumElements();
7936   int EltBits = VT.getScalarSizeInBits();
7937   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7938          "Only 8, 16, and 32 bit elements can be extended.");
7939   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7940
7941   // Found a valid zext mask! Try various lowering strategies based on the
7942   // input type and available ISA extensions.
7943   if (Subtarget->hasSSE41()) {
7944     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7945                                  NumElements / Scale);
7946     return DAG.getNode(ISD::BITCAST, DL, VT,
7947                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7948   }
7949
7950   // For any extends we can cheat for larger element sizes and use shuffle
7951   // instructions that can fold with a load and/or copy.
7952   if (AnyExt && EltBits == 32) {
7953     int PSHUFDMask[4] = {0, -1, 1, -1};
7954     return DAG.getNode(
7955         ISD::BITCAST, DL, VT,
7956         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7957                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7958                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7959   }
7960   if (AnyExt && EltBits == 16 && Scale > 2) {
7961     int PSHUFDMask[4] = {0, -1, 0, -1};
7962     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7963                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7964                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7965     int PSHUFHWMask[4] = {1, -1, -1, -1};
7966     return DAG.getNode(
7967         ISD::BITCAST, DL, VT,
7968         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7969                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7970                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7971   }
7972
7973   // If this would require more than 2 unpack instructions to expand, use
7974   // pshufb when available. We can only use more than 2 unpack instructions
7975   // when zero extending i8 elements which also makes it easier to use pshufb.
7976   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7977     assert(NumElements == 16 && "Unexpected byte vector width!");
7978     SDValue PSHUFBMask[16];
7979     for (int i = 0; i < 16; ++i)
7980       PSHUFBMask[i] =
7981           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7982     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7983     return DAG.getNode(ISD::BITCAST, DL, VT,
7984                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7985                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7986                                                MVT::v16i8, PSHUFBMask)));
7987   }
7988
7989   // Otherwise emit a sequence of unpacks.
7990   do {
7991     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7992     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7993                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7994     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7995     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7996     Scale /= 2;
7997     EltBits *= 2;
7998     NumElements /= 2;
7999   } while (Scale > 1);
8000   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8001 }
8002
8003 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8004 ///
8005 /// This routine will try to do everything in its power to cleverly lower
8006 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8007 /// check for the profitability of this lowering,  it tries to aggressively
8008 /// match this pattern. It will use all of the micro-architectural details it
8009 /// can to emit an efficient lowering. It handles both blends with all-zero
8010 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8011 /// masking out later).
8012 ///
8013 /// The reason we have dedicated lowering for zext-style shuffles is that they
8014 /// are both incredibly common and often quite performance sensitive.
8015 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8016     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8017     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8018   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8019
8020   int Bits = VT.getSizeInBits();
8021   int NumElements = VT.getVectorNumElements();
8022   assert(VT.getScalarSizeInBits() <= 32 &&
8023          "Exceeds 32-bit integer zero extension limit");
8024   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8025
8026   // Define a helper function to check a particular ext-scale and lower to it if
8027   // valid.
8028   auto Lower = [&](int Scale) -> SDValue {
8029     SDValue InputV;
8030     bool AnyExt = true;
8031     for (int i = 0; i < NumElements; ++i) {
8032       if (Mask[i] == -1)
8033         continue; // Valid anywhere but doesn't tell us anything.
8034       if (i % Scale != 0) {
8035         // Each of the extended elements need to be zeroable.
8036         if (!Zeroable[i])
8037           return SDValue();
8038
8039         // We no longer are in the anyext case.
8040         AnyExt = false;
8041         continue;
8042       }
8043
8044       // Each of the base elements needs to be consecutive indices into the
8045       // same input vector.
8046       SDValue V = Mask[i] < NumElements ? V1 : V2;
8047       if (!InputV)
8048         InputV = V;
8049       else if (InputV != V)
8050         return SDValue(); // Flip-flopping inputs.
8051
8052       if (Mask[i] % NumElements != i / Scale)
8053         return SDValue(); // Non-consecutive strided elements.
8054     }
8055
8056     // If we fail to find an input, we have a zero-shuffle which should always
8057     // have already been handled.
8058     // FIXME: Maybe handle this here in case during blending we end up with one?
8059     if (!InputV)
8060       return SDValue();
8061
8062     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8063         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8064   };
8065
8066   // The widest scale possible for extending is to a 64-bit integer.
8067   assert(Bits % 64 == 0 &&
8068          "The number of bits in a vector must be divisible by 64 on x86!");
8069   int NumExtElements = Bits / 64;
8070
8071   // Each iteration, try extending the elements half as much, but into twice as
8072   // many elements.
8073   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8074     assert(NumElements % NumExtElements == 0 &&
8075            "The input vector size must be divisible by the extended size.");
8076     if (SDValue V = Lower(NumElements / NumExtElements))
8077       return V;
8078   }
8079
8080   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8081   if (Bits != 128)
8082     return SDValue();
8083
8084   // Returns one of the source operands if the shuffle can be reduced to a
8085   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8086   auto CanZExtLowHalf = [&]() {
8087     for (int i = NumElements / 2; i != NumElements; i++)
8088       if (!Zeroable[i])
8089         return SDValue();
8090     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8091       return V1;
8092     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8093       return V2;
8094     return SDValue();
8095   };
8096
8097   if (SDValue V = CanZExtLowHalf()) {
8098     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8099     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8100     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8101   }
8102
8103   // No viable ext lowering found.
8104   return SDValue();
8105 }
8106
8107 /// \brief Try to get a scalar value for a specific element of a vector.
8108 ///
8109 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8110 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8111                                               SelectionDAG &DAG) {
8112   MVT VT = V.getSimpleValueType();
8113   MVT EltVT = VT.getVectorElementType();
8114   while (V.getOpcode() == ISD::BITCAST)
8115     V = V.getOperand(0);
8116   // If the bitcasts shift the element size, we can't extract an equivalent
8117   // element from it.
8118   MVT NewVT = V.getSimpleValueType();
8119   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8120     return SDValue();
8121
8122   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8123       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8124     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8125
8126   return SDValue();
8127 }
8128
8129 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8130 ///
8131 /// This is particularly important because the set of instructions varies
8132 /// significantly based on whether the operand is a load or not.
8133 static bool isShuffleFoldableLoad(SDValue V) {
8134   while (V.getOpcode() == ISD::BITCAST)
8135     V = V.getOperand(0);
8136
8137   return ISD::isNON_EXTLoad(V.getNode());
8138 }
8139
8140 /// \brief Try to lower insertion of a single element into a zero vector.
8141 ///
8142 /// This is a common pattern that we have especially efficient patterns to lower
8143 /// across all subtarget feature sets.
8144 static SDValue lowerVectorShuffleAsElementInsertion(
8145     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8146     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8147   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8148   MVT ExtVT = VT;
8149   MVT EltVT = VT.getVectorElementType();
8150
8151   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8152                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8153                 Mask.begin();
8154   bool IsV1Zeroable = true;
8155   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8156     if (i != V2Index && !Zeroable[i]) {
8157       IsV1Zeroable = false;
8158       break;
8159     }
8160
8161   // Check for a single input from a SCALAR_TO_VECTOR node.
8162   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8163   // all the smarts here sunk into that routine. However, the current
8164   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8165   // vector shuffle lowering is dead.
8166   if (SDValue V2S = getScalarValueForVectorElement(
8167           V2, Mask[V2Index] - Mask.size(), DAG)) {
8168     // We need to zext the scalar if it is smaller than an i32.
8169     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8170     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8171       // Using zext to expand a narrow element won't work for non-zero
8172       // insertions.
8173       if (!IsV1Zeroable)
8174         return SDValue();
8175
8176       // Zero-extend directly to i32.
8177       ExtVT = MVT::v4i32;
8178       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8179     }
8180     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8181   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8182              EltVT == MVT::i16) {
8183     // Either not inserting from the low element of the input or the input
8184     // element size is too small to use VZEXT_MOVL to clear the high bits.
8185     return SDValue();
8186   }
8187
8188   if (!IsV1Zeroable) {
8189     // If V1 can't be treated as a zero vector we have fewer options to lower
8190     // this. We can't support integer vectors or non-zero targets cheaply, and
8191     // the V1 elements can't be permuted in any way.
8192     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8193     if (!VT.isFloatingPoint() || V2Index != 0)
8194       return SDValue();
8195     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8196     V1Mask[V2Index] = -1;
8197     if (!isNoopShuffleMask(V1Mask))
8198       return SDValue();
8199     // This is essentially a special case blend operation, but if we have
8200     // general purpose blend operations, they are always faster. Bail and let
8201     // the rest of the lowering handle these as blends.
8202     if (Subtarget->hasSSE41())
8203       return SDValue();
8204
8205     // Otherwise, use MOVSD or MOVSS.
8206     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8207            "Only two types of floating point element types to handle!");
8208     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8209                        ExtVT, V1, V2);
8210   }
8211
8212   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8213   if (ExtVT != VT)
8214     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8215
8216   if (V2Index != 0) {
8217     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8218     // the desired position. Otherwise it is more efficient to do a vector
8219     // shift left. We know that we can do a vector shift left because all
8220     // the inputs are zero.
8221     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8222       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8223       V2Shuffle[V2Index] = 0;
8224       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8225     } else {
8226       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8227       V2 = DAG.getNode(
8228           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8229           DAG.getConstant(
8230               V2Index * EltVT.getSizeInBits(),
8231               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8232       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8233     }
8234   }
8235   return V2;
8236 }
8237
8238 /// \brief Try to lower broadcast of a single element.
8239 ///
8240 /// For convenience, this code also bundles all of the subtarget feature set
8241 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8242 /// a convenient way to factor it out.
8243 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8244                                              ArrayRef<int> Mask,
8245                                              const X86Subtarget *Subtarget,
8246                                              SelectionDAG &DAG) {
8247   if (!Subtarget->hasAVX())
8248     return SDValue();
8249   if (VT.isInteger() && !Subtarget->hasAVX2())
8250     return SDValue();
8251
8252   // Check that the mask is a broadcast.
8253   int BroadcastIdx = -1;
8254   for (int M : Mask)
8255     if (M >= 0 && BroadcastIdx == -1)
8256       BroadcastIdx = M;
8257     else if (M >= 0 && M != BroadcastIdx)
8258       return SDValue();
8259
8260   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8261                                             "a sorted mask where the broadcast "
8262                                             "comes from V1.");
8263
8264   // Go up the chain of (vector) values to try and find a scalar load that
8265   // we can combine with the broadcast.
8266   for (;;) {
8267     switch (V.getOpcode()) {
8268     case ISD::CONCAT_VECTORS: {
8269       int OperandSize = Mask.size() / V.getNumOperands();
8270       V = V.getOperand(BroadcastIdx / OperandSize);
8271       BroadcastIdx %= OperandSize;
8272       continue;
8273     }
8274
8275     case ISD::INSERT_SUBVECTOR: {
8276       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8277       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8278       if (!ConstantIdx)
8279         break;
8280
8281       int BeginIdx = (int)ConstantIdx->getZExtValue();
8282       int EndIdx =
8283           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8284       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8285         BroadcastIdx -= BeginIdx;
8286         V = VInner;
8287       } else {
8288         V = VOuter;
8289       }
8290       continue;
8291     }
8292     }
8293     break;
8294   }
8295
8296   // Check if this is a broadcast of a scalar. We special case lowering
8297   // for scalars so that we can more effectively fold with loads.
8298   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8299       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8300     V = V.getOperand(BroadcastIdx);
8301
8302     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8303     // AVX2.
8304     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8305       return SDValue();
8306   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8307     // We can't broadcast from a vector register w/o AVX2, and we can only
8308     // broadcast from the zero-element of a vector register.
8309     return SDValue();
8310   }
8311
8312   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8313 }
8314
8315 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8316 // INSERTPS when the V1 elements are already in the correct locations
8317 // because otherwise we can just always use two SHUFPS instructions which
8318 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8319 // perform INSERTPS if a single V1 element is out of place and all V2
8320 // elements are zeroable.
8321 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8322                                             ArrayRef<int> Mask,
8323                                             SelectionDAG &DAG) {
8324   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8325   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8326   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8327   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8328
8329   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8330
8331   unsigned ZMask = 0;
8332   int V1DstIndex = -1;
8333   int V2DstIndex = -1;
8334   bool V1UsedInPlace = false;
8335
8336   for (int i = 0; i < 4; i++) {
8337     // Synthesize a zero mask from the zeroable elements (includes undefs).
8338     if (Zeroable[i]) {
8339       ZMask |= 1 << i;
8340       continue;
8341     }
8342
8343     // Flag if we use any V1 inputs in place.
8344     if (i == Mask[i]) {
8345       V1UsedInPlace = true;
8346       continue;
8347     }
8348
8349     // We can only insert a single non-zeroable element.
8350     if (V1DstIndex != -1 || V2DstIndex != -1)
8351       return SDValue();
8352
8353     if (Mask[i] < 4) {
8354       // V1 input out of place for insertion.
8355       V1DstIndex = i;
8356     } else {
8357       // V2 input for insertion.
8358       V2DstIndex = i;
8359     }
8360   }
8361
8362   // Don't bother if we have no (non-zeroable) element for insertion.
8363   if (V1DstIndex == -1 && V2DstIndex == -1)
8364     return SDValue();
8365
8366   // Determine element insertion src/dst indices. The src index is from the
8367   // start of the inserted vector, not the start of the concatenated vector.
8368   unsigned V2SrcIndex = 0;
8369   if (V1DstIndex != -1) {
8370     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8371     // and don't use the original V2 at all.
8372     V2SrcIndex = Mask[V1DstIndex];
8373     V2DstIndex = V1DstIndex;
8374     V2 = V1;
8375   } else {
8376     V2SrcIndex = Mask[V2DstIndex] - 4;
8377   }
8378
8379   // If no V1 inputs are used in place, then the result is created only from
8380   // the zero mask and the V2 insertion - so remove V1 dependency.
8381   if (!V1UsedInPlace)
8382     V1 = DAG.getUNDEF(MVT::v4f32);
8383
8384   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8385   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8386
8387   // Insert the V2 element into the desired position.
8388   SDLoc DL(Op);
8389   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8390                      DAG.getConstant(InsertPSMask, MVT::i8));
8391 }
8392
8393 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8394 ///
8395 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8396 /// support for floating point shuffles but not integer shuffles. These
8397 /// instructions will incur a domain crossing penalty on some chips though so
8398 /// it is better to avoid lowering through this for integer vectors where
8399 /// possible.
8400 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8401                                        const X86Subtarget *Subtarget,
8402                                        SelectionDAG &DAG) {
8403   SDLoc DL(Op);
8404   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8405   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8406   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8407   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8408   ArrayRef<int> Mask = SVOp->getMask();
8409   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8410
8411   if (isSingleInputShuffleMask(Mask)) {
8412     // Use low duplicate instructions for masks that match their pattern.
8413     if (Subtarget->hasSSE3())
8414       if (isShuffleEquivalent(Mask, 0, 0))
8415         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8416
8417     // Straight shuffle of a single input vector. Simulate this by using the
8418     // single input as both of the "inputs" to this instruction..
8419     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8420
8421     if (Subtarget->hasAVX()) {
8422       // If we have AVX, we can use VPERMILPS which will allow folding a load
8423       // into the shuffle.
8424       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8425                          DAG.getConstant(SHUFPDMask, MVT::i8));
8426     }
8427
8428     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8429                        DAG.getConstant(SHUFPDMask, MVT::i8));
8430   }
8431   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8432   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8433
8434   // Use dedicated unpack instructions for masks that match their pattern.
8435   if (isShuffleEquivalent(Mask, 0, 2))
8436     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8437   if (isShuffleEquivalent(Mask, 1, 3))
8438     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8439
8440   // If we have a single input, insert that into V1 if we can do so cheaply.
8441   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8442     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8443             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8444       return Insertion;
8445     // Try inverting the insertion since for v2 masks it is easy to do and we
8446     // can't reliably sort the mask one way or the other.
8447     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8448                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8449     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8450             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8451       return Insertion;
8452   }
8453
8454   // Try to use one of the special instruction patterns to handle two common
8455   // blend patterns if a zero-blend above didn't work.
8456   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8457     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8458       // We can either use a special instruction to load over the low double or
8459       // to move just the low double.
8460       return DAG.getNode(
8461           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8462           DL, MVT::v2f64, V2,
8463           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8464
8465   if (Subtarget->hasSSE41())
8466     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8467                                                   Subtarget, DAG))
8468       return Blend;
8469
8470   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8471   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8472                      DAG.getConstant(SHUFPDMask, MVT::i8));
8473 }
8474
8475 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8476 ///
8477 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8478 /// the integer unit to minimize domain crossing penalties. However, for blends
8479 /// it falls back to the floating point shuffle operation with appropriate bit
8480 /// casting.
8481 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8482                                        const X86Subtarget *Subtarget,
8483                                        SelectionDAG &DAG) {
8484   SDLoc DL(Op);
8485   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8486   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8487   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8488   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8489   ArrayRef<int> Mask = SVOp->getMask();
8490   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8491
8492   if (isSingleInputShuffleMask(Mask)) {
8493     // Check for being able to broadcast a single element.
8494     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8495                                                           Mask, Subtarget, DAG))
8496       return Broadcast;
8497
8498     // Straight shuffle of a single input vector. For everything from SSE2
8499     // onward this has a single fast instruction with no scary immediates.
8500     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8501     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8502     int WidenedMask[4] = {
8503         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8504         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8505     return DAG.getNode(
8506         ISD::BITCAST, DL, MVT::v2i64,
8507         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8508                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8509   }
8510
8511   // Try to use byte shift instructions.
8512   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8513           DL, MVT::v2i64, V1, V2, Mask, DAG))
8514     return Shift;
8515
8516   // If we have a single input from V2 insert that into V1 if we can do so
8517   // cheaply.
8518   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8519     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8520             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8521       return Insertion;
8522     // Try inverting the insertion since for v2 masks it is easy to do and we
8523     // can't reliably sort the mask one way or the other.
8524     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8525                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8526     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8527             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8528       return Insertion;
8529   }
8530
8531   // Use dedicated unpack instructions for masks that match their pattern.
8532   if (isShuffleEquivalent(Mask, 0, 2))
8533     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8534   if (isShuffleEquivalent(Mask, 1, 3))
8535     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8536
8537   if (Subtarget->hasSSE41())
8538     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8539                                                   Subtarget, DAG))
8540       return Blend;
8541
8542   // Try to use byte rotation instructions.
8543   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8544   if (Subtarget->hasSSSE3())
8545     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8546             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8547       return Rotate;
8548
8549   // We implement this with SHUFPD which is pretty lame because it will likely
8550   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8551   // However, all the alternatives are still more cycles and newer chips don't
8552   // have this problem. It would be really nice if x86 had better shuffles here.
8553   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8554   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8555   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8556                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8557 }
8558
8559 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8560 ///
8561 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8562 /// It makes no assumptions about whether this is the *best* lowering, it simply
8563 /// uses it.
8564 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8565                                             ArrayRef<int> Mask, SDValue V1,
8566                                             SDValue V2, SelectionDAG &DAG) {
8567   SDValue LowV = V1, HighV = V2;
8568   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8569
8570   int NumV2Elements =
8571       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8572
8573   if (NumV2Elements == 1) {
8574     int V2Index =
8575         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8576         Mask.begin();
8577
8578     // Compute the index adjacent to V2Index and in the same half by toggling
8579     // the low bit.
8580     int V2AdjIndex = V2Index ^ 1;
8581
8582     if (Mask[V2AdjIndex] == -1) {
8583       // Handles all the cases where we have a single V2 element and an undef.
8584       // This will only ever happen in the high lanes because we commute the
8585       // vector otherwise.
8586       if (V2Index < 2)
8587         std::swap(LowV, HighV);
8588       NewMask[V2Index] -= 4;
8589     } else {
8590       // Handle the case where the V2 element ends up adjacent to a V1 element.
8591       // To make this work, blend them together as the first step.
8592       int V1Index = V2AdjIndex;
8593       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8594       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8595                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8596
8597       // Now proceed to reconstruct the final blend as we have the necessary
8598       // high or low half formed.
8599       if (V2Index < 2) {
8600         LowV = V2;
8601         HighV = V1;
8602       } else {
8603         HighV = V2;
8604       }
8605       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8606       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8607     }
8608   } else if (NumV2Elements == 2) {
8609     if (Mask[0] < 4 && Mask[1] < 4) {
8610       // Handle the easy case where we have V1 in the low lanes and V2 in the
8611       // high lanes.
8612       NewMask[2] -= 4;
8613       NewMask[3] -= 4;
8614     } else if (Mask[2] < 4 && Mask[3] < 4) {
8615       // We also handle the reversed case because this utility may get called
8616       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8617       // arrange things in the right direction.
8618       NewMask[0] -= 4;
8619       NewMask[1] -= 4;
8620       HighV = V1;
8621       LowV = V2;
8622     } else {
8623       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8624       // trying to place elements directly, just blend them and set up the final
8625       // shuffle to place them.
8626
8627       // The first two blend mask elements are for V1, the second two are for
8628       // V2.
8629       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8630                           Mask[2] < 4 ? Mask[2] : Mask[3],
8631                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8632                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8633       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8634                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8635
8636       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8637       // a blend.
8638       LowV = HighV = V1;
8639       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8640       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8641       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8642       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8643     }
8644   }
8645   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8646                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8647 }
8648
8649 /// \brief Lower 4-lane 32-bit floating point shuffles.
8650 ///
8651 /// Uses instructions exclusively from the floating point unit to minimize
8652 /// domain crossing penalties, as these are sufficient to implement all v4f32
8653 /// shuffles.
8654 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8655                                        const X86Subtarget *Subtarget,
8656                                        SelectionDAG &DAG) {
8657   SDLoc DL(Op);
8658   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8659   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8660   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8661   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8662   ArrayRef<int> Mask = SVOp->getMask();
8663   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8664
8665   int NumV2Elements =
8666       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8667
8668   if (NumV2Elements == 0) {
8669     // Check for being able to broadcast a single element.
8670     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8671                                                           Mask, Subtarget, DAG))
8672       return Broadcast;
8673
8674     // Use even/odd duplicate instructions for masks that match their pattern.
8675     if (Subtarget->hasSSE3()) {
8676       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8677         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8678       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8679         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8680     }
8681
8682     if (Subtarget->hasAVX()) {
8683       // If we have AVX, we can use VPERMILPS which will allow folding a load
8684       // into the shuffle.
8685       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8686                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8687     }
8688
8689     // Otherwise, use a straight shuffle of a single input vector. We pass the
8690     // input vector to both operands to simulate this with a SHUFPS.
8691     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8692                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8693   }
8694
8695   // Use dedicated unpack instructions for masks that match their pattern.
8696   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8697     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8698   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8699     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8700
8701   // There are special ways we can lower some single-element blends. However, we
8702   // have custom ways we can lower more complex single-element blends below that
8703   // we defer to if both this and BLENDPS fail to match, so restrict this to
8704   // when the V2 input is targeting element 0 of the mask -- that is the fast
8705   // case here.
8706   if (NumV2Elements == 1 && Mask[0] >= 4)
8707     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8708                                                          Mask, Subtarget, DAG))
8709       return V;
8710
8711   if (Subtarget->hasSSE41()) {
8712     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8713                                                   Subtarget, DAG))
8714       return Blend;
8715
8716     // Use INSERTPS if we can complete the shuffle efficiently.
8717     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8718       return V;
8719   }
8720
8721   // Otherwise fall back to a SHUFPS lowering strategy.
8722   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8723 }
8724
8725 /// \brief Lower 4-lane i32 vector shuffles.
8726 ///
8727 /// We try to handle these with integer-domain shuffles where we can, but for
8728 /// blends we use the floating point domain blend instructions.
8729 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8730                                        const X86Subtarget *Subtarget,
8731                                        SelectionDAG &DAG) {
8732   SDLoc DL(Op);
8733   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8734   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8735   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8736   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8737   ArrayRef<int> Mask = SVOp->getMask();
8738   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8739
8740   // Whenever we can lower this as a zext, that instruction is strictly faster
8741   // than any alternative. It also allows us to fold memory operands into the
8742   // shuffle in many cases.
8743   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8744                                                          Mask, Subtarget, DAG))
8745     return ZExt;
8746
8747   int NumV2Elements =
8748       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8749
8750   if (NumV2Elements == 0) {
8751     // Check for being able to broadcast a single element.
8752     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8753                                                           Mask, Subtarget, DAG))
8754       return Broadcast;
8755
8756     // Straight shuffle of a single input vector. For everything from SSE2
8757     // onward this has a single fast instruction with no scary immediates.
8758     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8759     // but we aren't actually going to use the UNPCK instruction because doing
8760     // so prevents folding a load into this instruction or making a copy.
8761     const int UnpackLoMask[] = {0, 0, 1, 1};
8762     const int UnpackHiMask[] = {2, 2, 3, 3};
8763     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8764       Mask = UnpackLoMask;
8765     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8766       Mask = UnpackHiMask;
8767
8768     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8769                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8770   }
8771
8772   // Try to use bit shift instructions.
8773   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8774           DL, MVT::v4i32, V1, V2, Mask, DAG))
8775     return Shift;
8776
8777   // Try to use byte shift instructions.
8778   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8779           DL, MVT::v4i32, V1, V2, Mask, DAG))
8780     return Shift;
8781
8782   // There are special ways we can lower some single-element blends.
8783   if (NumV2Elements == 1)
8784     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8785                                                          Mask, Subtarget, DAG))
8786       return V;
8787
8788   if (Subtarget->hasSSE41())
8789     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8790                                                   Subtarget, DAG))
8791       return Blend;
8792
8793   if (SDValue Masked =
8794           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8795     return Masked;
8796
8797   // Use dedicated unpack instructions for masks that match their pattern.
8798   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8799     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8800   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8801     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8802
8803   // Try to use byte rotation instructions.
8804   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8805   if (Subtarget->hasSSSE3())
8806     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8807             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8808       return Rotate;
8809
8810   // We implement this with SHUFPS because it can blend from two vectors.
8811   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8812   // up the inputs, bypassing domain shift penalties that we would encur if we
8813   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8814   // relevant.
8815   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8816                      DAG.getVectorShuffle(
8817                          MVT::v4f32, DL,
8818                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8819                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8820 }
8821
8822 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8823 /// shuffle lowering, and the most complex part.
8824 ///
8825 /// The lowering strategy is to try to form pairs of input lanes which are
8826 /// targeted at the same half of the final vector, and then use a dword shuffle
8827 /// to place them onto the right half, and finally unpack the paired lanes into
8828 /// their final position.
8829 ///
8830 /// The exact breakdown of how to form these dword pairs and align them on the
8831 /// correct sides is really tricky. See the comments within the function for
8832 /// more of the details.
8833 static SDValue lowerV8I16SingleInputVectorShuffle(
8834     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8835     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8836   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8837   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8838   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8839
8840   SmallVector<int, 4> LoInputs;
8841   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8842                [](int M) { return M >= 0; });
8843   std::sort(LoInputs.begin(), LoInputs.end());
8844   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8845   SmallVector<int, 4> HiInputs;
8846   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8847                [](int M) { return M >= 0; });
8848   std::sort(HiInputs.begin(), HiInputs.end());
8849   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8850   int NumLToL =
8851       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8852   int NumHToL = LoInputs.size() - NumLToL;
8853   int NumLToH =
8854       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8855   int NumHToH = HiInputs.size() - NumLToH;
8856   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8857   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8858   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8859   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8860
8861   // Check for being able to broadcast a single element.
8862   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8863                                                         Mask, Subtarget, DAG))
8864     return Broadcast;
8865
8866   // Try to use bit shift instructions.
8867   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8868           DL, MVT::v8i16, V, V, Mask, DAG))
8869     return Shift;
8870
8871   // Try to use byte shift instructions.
8872   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8873           DL, MVT::v8i16, V, V, Mask, DAG))
8874     return Shift;
8875
8876   // Use dedicated unpack instructions for masks that match their pattern.
8877   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8878     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8879   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8880     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8881
8882   // Try to use byte rotation instructions.
8883   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8884           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8885     return Rotate;
8886
8887   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8888   // such inputs we can swap two of the dwords across the half mark and end up
8889   // with <=2 inputs to each half in each half. Once there, we can fall through
8890   // to the generic code below. For example:
8891   //
8892   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8893   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8894   //
8895   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8896   // and an existing 2-into-2 on the other half. In this case we may have to
8897   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8898   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8899   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8900   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8901   // half than the one we target for fixing) will be fixed when we re-enter this
8902   // path. We will also combine away any sequence of PSHUFD instructions that
8903   // result into a single instruction. Here is an example of the tricky case:
8904   //
8905   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8906   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8907   //
8908   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8909   //
8910   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8911   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8912   //
8913   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8914   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8915   //
8916   // The result is fine to be handled by the generic logic.
8917   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8918                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8919                           int AOffset, int BOffset) {
8920     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8921            "Must call this with A having 3 or 1 inputs from the A half.");
8922     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8923            "Must call this with B having 1 or 3 inputs from the B half.");
8924     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8925            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8926
8927     // Compute the index of dword with only one word among the three inputs in
8928     // a half by taking the sum of the half with three inputs and subtracting
8929     // the sum of the actual three inputs. The difference is the remaining
8930     // slot.
8931     int ADWord, BDWord;
8932     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8933     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8934     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8935     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8936     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8937     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8938     int TripleNonInputIdx =
8939         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8940     TripleDWord = TripleNonInputIdx / 2;
8941
8942     // We use xor with one to compute the adjacent DWord to whichever one the
8943     // OneInput is in.
8944     OneInputDWord = (OneInput / 2) ^ 1;
8945
8946     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8947     // and BToA inputs. If there is also such a problem with the BToB and AToB
8948     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8949     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8950     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8951     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8952       // Compute how many inputs will be flipped by swapping these DWords. We
8953       // need
8954       // to balance this to ensure we don't form a 3-1 shuffle in the other
8955       // half.
8956       int NumFlippedAToBInputs =
8957           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8958           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8959       int NumFlippedBToBInputs =
8960           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8961           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8962       if ((NumFlippedAToBInputs == 1 &&
8963            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8964           (NumFlippedBToBInputs == 1 &&
8965            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8966         // We choose whether to fix the A half or B half based on whether that
8967         // half has zero flipped inputs. At zero, we may not be able to fix it
8968         // with that half. We also bias towards fixing the B half because that
8969         // will more commonly be the high half, and we have to bias one way.
8970         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8971                                                        ArrayRef<int> Inputs) {
8972           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8973           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8974                                          PinnedIdx ^ 1) != Inputs.end();
8975           // Determine whether the free index is in the flipped dword or the
8976           // unflipped dword based on where the pinned index is. We use this bit
8977           // in an xor to conditionally select the adjacent dword.
8978           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8979           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8980                                              FixFreeIdx) != Inputs.end();
8981           if (IsFixIdxInput == IsFixFreeIdxInput)
8982             FixFreeIdx += 1;
8983           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8984                                         FixFreeIdx) != Inputs.end();
8985           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8986                  "We need to be changing the number of flipped inputs!");
8987           int PSHUFHalfMask[] = {0, 1, 2, 3};
8988           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8989           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8990                           MVT::v8i16, V,
8991                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8992
8993           for (int &M : Mask)
8994             if (M != -1 && M == FixIdx)
8995               M = FixFreeIdx;
8996             else if (M != -1 && M == FixFreeIdx)
8997               M = FixIdx;
8998         };
8999         if (NumFlippedBToBInputs != 0) {
9000           int BPinnedIdx =
9001               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9002           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9003         } else {
9004           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9005           int APinnedIdx =
9006               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9007           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9008         }
9009       }
9010     }
9011
9012     int PSHUFDMask[] = {0, 1, 2, 3};
9013     PSHUFDMask[ADWord] = BDWord;
9014     PSHUFDMask[BDWord] = ADWord;
9015     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9016                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9017                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9018                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9019
9020     // Adjust the mask to match the new locations of A and B.
9021     for (int &M : Mask)
9022       if (M != -1 && M/2 == ADWord)
9023         M = 2 * BDWord + M % 2;
9024       else if (M != -1 && M/2 == BDWord)
9025         M = 2 * ADWord + M % 2;
9026
9027     // Recurse back into this routine to re-compute state now that this isn't
9028     // a 3 and 1 problem.
9029     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9030                                 Mask);
9031   };
9032   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9033     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9034   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9035     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9036
9037   // At this point there are at most two inputs to the low and high halves from
9038   // each half. That means the inputs can always be grouped into dwords and
9039   // those dwords can then be moved to the correct half with a dword shuffle.
9040   // We use at most one low and one high word shuffle to collect these paired
9041   // inputs into dwords, and finally a dword shuffle to place them.
9042   int PSHUFLMask[4] = {-1, -1, -1, -1};
9043   int PSHUFHMask[4] = {-1, -1, -1, -1};
9044   int PSHUFDMask[4] = {-1, -1, -1, -1};
9045
9046   // First fix the masks for all the inputs that are staying in their
9047   // original halves. This will then dictate the targets of the cross-half
9048   // shuffles.
9049   auto fixInPlaceInputs =
9050       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9051                     MutableArrayRef<int> SourceHalfMask,
9052                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9053     if (InPlaceInputs.empty())
9054       return;
9055     if (InPlaceInputs.size() == 1) {
9056       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9057           InPlaceInputs[0] - HalfOffset;
9058       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9059       return;
9060     }
9061     if (IncomingInputs.empty()) {
9062       // Just fix all of the in place inputs.
9063       for (int Input : InPlaceInputs) {
9064         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9065         PSHUFDMask[Input / 2] = Input / 2;
9066       }
9067       return;
9068     }
9069
9070     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9071     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9072         InPlaceInputs[0] - HalfOffset;
9073     // Put the second input next to the first so that they are packed into
9074     // a dword. We find the adjacent index by toggling the low bit.
9075     int AdjIndex = InPlaceInputs[0] ^ 1;
9076     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9077     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9078     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9079   };
9080   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9081   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9082
9083   // Now gather the cross-half inputs and place them into a free dword of
9084   // their target half.
9085   // FIXME: This operation could almost certainly be simplified dramatically to
9086   // look more like the 3-1 fixing operation.
9087   auto moveInputsToRightHalf = [&PSHUFDMask](
9088       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9089       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9090       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9091       int DestOffset) {
9092     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9093       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9094     };
9095     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9096                                                int Word) {
9097       int LowWord = Word & ~1;
9098       int HighWord = Word | 1;
9099       return isWordClobbered(SourceHalfMask, LowWord) ||
9100              isWordClobbered(SourceHalfMask, HighWord);
9101     };
9102
9103     if (IncomingInputs.empty())
9104       return;
9105
9106     if (ExistingInputs.empty()) {
9107       // Map any dwords with inputs from them into the right half.
9108       for (int Input : IncomingInputs) {
9109         // If the source half mask maps over the inputs, turn those into
9110         // swaps and use the swapped lane.
9111         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9112           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9113             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9114                 Input - SourceOffset;
9115             // We have to swap the uses in our half mask in one sweep.
9116             for (int &M : HalfMask)
9117               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9118                 M = Input;
9119               else if (M == Input)
9120                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9121           } else {
9122             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9123                        Input - SourceOffset &&
9124                    "Previous placement doesn't match!");
9125           }
9126           // Note that this correctly re-maps both when we do a swap and when
9127           // we observe the other side of the swap above. We rely on that to
9128           // avoid swapping the members of the input list directly.
9129           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9130         }
9131
9132         // Map the input's dword into the correct half.
9133         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9134           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9135         else
9136           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9137                      Input / 2 &&
9138                  "Previous placement doesn't match!");
9139       }
9140
9141       // And just directly shift any other-half mask elements to be same-half
9142       // as we will have mirrored the dword containing the element into the
9143       // same position within that half.
9144       for (int &M : HalfMask)
9145         if (M >= SourceOffset && M < SourceOffset + 4) {
9146           M = M - SourceOffset + DestOffset;
9147           assert(M >= 0 && "This should never wrap below zero!");
9148         }
9149       return;
9150     }
9151
9152     // Ensure we have the input in a viable dword of its current half. This
9153     // is particularly tricky because the original position may be clobbered
9154     // by inputs being moved and *staying* in that half.
9155     if (IncomingInputs.size() == 1) {
9156       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9157         int InputFixed = std::find(std::begin(SourceHalfMask),
9158                                    std::end(SourceHalfMask), -1) -
9159                          std::begin(SourceHalfMask) + SourceOffset;
9160         SourceHalfMask[InputFixed - SourceOffset] =
9161             IncomingInputs[0] - SourceOffset;
9162         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9163                      InputFixed);
9164         IncomingInputs[0] = InputFixed;
9165       }
9166     } else if (IncomingInputs.size() == 2) {
9167       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9168           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9169         // We have two non-adjacent or clobbered inputs we need to extract from
9170         // the source half. To do this, we need to map them into some adjacent
9171         // dword slot in the source mask.
9172         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9173                               IncomingInputs[1] - SourceOffset};
9174
9175         // If there is a free slot in the source half mask adjacent to one of
9176         // the inputs, place the other input in it. We use (Index XOR 1) to
9177         // compute an adjacent index.
9178         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9179             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9180           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9181           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9182           InputsFixed[1] = InputsFixed[0] ^ 1;
9183         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9184                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9185           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9186           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9187           InputsFixed[0] = InputsFixed[1] ^ 1;
9188         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9189                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9190           // The two inputs are in the same DWord but it is clobbered and the
9191           // adjacent DWord isn't used at all. Move both inputs to the free
9192           // slot.
9193           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9194           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9195           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9196           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9197         } else {
9198           // The only way we hit this point is if there is no clobbering
9199           // (because there are no off-half inputs to this half) and there is no
9200           // free slot adjacent to one of the inputs. In this case, we have to
9201           // swap an input with a non-input.
9202           for (int i = 0; i < 4; ++i)
9203             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9204                    "We can't handle any clobbers here!");
9205           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9206                  "Cannot have adjacent inputs here!");
9207
9208           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9209           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9210
9211           // We also have to update the final source mask in this case because
9212           // it may need to undo the above swap.
9213           for (int &M : FinalSourceHalfMask)
9214             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9215               M = InputsFixed[1] + SourceOffset;
9216             else if (M == InputsFixed[1] + SourceOffset)
9217               M = (InputsFixed[0] ^ 1) + SourceOffset;
9218
9219           InputsFixed[1] = InputsFixed[0] ^ 1;
9220         }
9221
9222         // Point everything at the fixed inputs.
9223         for (int &M : HalfMask)
9224           if (M == IncomingInputs[0])
9225             M = InputsFixed[0] + SourceOffset;
9226           else if (M == IncomingInputs[1])
9227             M = InputsFixed[1] + SourceOffset;
9228
9229         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9230         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9231       }
9232     } else {
9233       llvm_unreachable("Unhandled input size!");
9234     }
9235
9236     // Now hoist the DWord down to the right half.
9237     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9238     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9239     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9240     for (int &M : HalfMask)
9241       for (int Input : IncomingInputs)
9242         if (M == Input)
9243           M = FreeDWord * 2 + Input % 2;
9244   };
9245   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9246                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9247   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9248                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9249
9250   // Now enact all the shuffles we've computed to move the inputs into their
9251   // target half.
9252   if (!isNoopShuffleMask(PSHUFLMask))
9253     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9254                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9255   if (!isNoopShuffleMask(PSHUFHMask))
9256     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9257                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9258   if (!isNoopShuffleMask(PSHUFDMask))
9259     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9260                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9261                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9262                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9263
9264   // At this point, each half should contain all its inputs, and we can then
9265   // just shuffle them into their final position.
9266   assert(std::count_if(LoMask.begin(), LoMask.end(),
9267                        [](int M) { return M >= 4; }) == 0 &&
9268          "Failed to lift all the high half inputs to the low mask!");
9269   assert(std::count_if(HiMask.begin(), HiMask.end(),
9270                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9271          "Failed to lift all the low half inputs to the high mask!");
9272
9273   // Do a half shuffle for the low mask.
9274   if (!isNoopShuffleMask(LoMask))
9275     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9276                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9277
9278   // Do a half shuffle with the high mask after shifting its values down.
9279   for (int &M : HiMask)
9280     if (M >= 0)
9281       M -= 4;
9282   if (!isNoopShuffleMask(HiMask))
9283     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9284                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9285
9286   return V;
9287 }
9288
9289 /// \brief Detect whether the mask pattern should be lowered through
9290 /// interleaving.
9291 ///
9292 /// This essentially tests whether viewing the mask as an interleaving of two
9293 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9294 /// lowering it through interleaving is a significantly better strategy.
9295 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9296   int NumEvenInputs[2] = {0, 0};
9297   int NumOddInputs[2] = {0, 0};
9298   int NumLoInputs[2] = {0, 0};
9299   int NumHiInputs[2] = {0, 0};
9300   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9301     if (Mask[i] < 0)
9302       continue;
9303
9304     int InputIdx = Mask[i] >= Size;
9305
9306     if (i < Size / 2)
9307       ++NumLoInputs[InputIdx];
9308     else
9309       ++NumHiInputs[InputIdx];
9310
9311     if ((i % 2) == 0)
9312       ++NumEvenInputs[InputIdx];
9313     else
9314       ++NumOddInputs[InputIdx];
9315   }
9316
9317   // The minimum number of cross-input results for both the interleaved and
9318   // split cases. If interleaving results in fewer cross-input results, return
9319   // true.
9320   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9321                                     NumEvenInputs[0] + NumOddInputs[1]);
9322   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9323                               NumLoInputs[0] + NumHiInputs[1]);
9324   return InterleavedCrosses < SplitCrosses;
9325 }
9326
9327 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9328 ///
9329 /// This strategy only works when the inputs from each vector fit into a single
9330 /// half of that vector, and generally there are not so many inputs as to leave
9331 /// the in-place shuffles required highly constrained (and thus expensive). It
9332 /// shifts all the inputs into a single side of both input vectors and then
9333 /// uses an unpack to interleave these inputs in a single vector. At that
9334 /// point, we will fall back on the generic single input shuffle lowering.
9335 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9336                                                  SDValue V2,
9337                                                  MutableArrayRef<int> Mask,
9338                                                  const X86Subtarget *Subtarget,
9339                                                  SelectionDAG &DAG) {
9340   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9341   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9342   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9343   for (int i = 0; i < 8; ++i)
9344     if (Mask[i] >= 0 && Mask[i] < 4)
9345       LoV1Inputs.push_back(i);
9346     else if (Mask[i] >= 4 && Mask[i] < 8)
9347       HiV1Inputs.push_back(i);
9348     else if (Mask[i] >= 8 && Mask[i] < 12)
9349       LoV2Inputs.push_back(i);
9350     else if (Mask[i] >= 12)
9351       HiV2Inputs.push_back(i);
9352
9353   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9354   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9355   (void)NumV1Inputs;
9356   (void)NumV2Inputs;
9357   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9358   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9359   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9360
9361   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9362                      HiV1Inputs.size() + HiV2Inputs.size();
9363
9364   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9365                               ArrayRef<int> HiInputs, bool MoveToLo,
9366                               int MaskOffset) {
9367     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9368     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9369     if (BadInputs.empty())
9370       return V;
9371
9372     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9373     int MoveOffset = MoveToLo ? 0 : 4;
9374
9375     if (GoodInputs.empty()) {
9376       for (int BadInput : BadInputs) {
9377         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9378         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9379       }
9380     } else {
9381       if (GoodInputs.size() == 2) {
9382         // If the low inputs are spread across two dwords, pack them into
9383         // a single dword.
9384         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9385         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9386         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9387         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9388       } else {
9389         // Otherwise pin the good inputs.
9390         for (int GoodInput : GoodInputs)
9391           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9392       }
9393
9394       if (BadInputs.size() == 2) {
9395         // If we have two bad inputs then there may be either one or two good
9396         // inputs fixed in place. Find a fixed input, and then find the *other*
9397         // two adjacent indices by using modular arithmetic.
9398         int GoodMaskIdx =
9399             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9400                          [](int M) { return M >= 0; }) -
9401             std::begin(MoveMask);
9402         int MoveMaskIdx =
9403             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9404         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9405         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9406         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9407         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9408         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9409         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9410       } else {
9411         assert(BadInputs.size() == 1 && "All sizes handled");
9412         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9413                                     std::end(MoveMask), -1) -
9414                           std::begin(MoveMask);
9415         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9416         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9417       }
9418     }
9419
9420     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9421                                 MoveMask);
9422   };
9423   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9424                         /*MaskOffset*/ 0);
9425   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9426                         /*MaskOffset*/ 8);
9427
9428   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9429   // cross-half traffic in the final shuffle.
9430
9431   // Munge the mask to be a single-input mask after the unpack merges the
9432   // results.
9433   for (int &M : Mask)
9434     if (M != -1)
9435       M = 2 * (M % 4) + (M / 8);
9436
9437   return DAG.getVectorShuffle(
9438       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9439                                   DL, MVT::v8i16, V1, V2),
9440       DAG.getUNDEF(MVT::v8i16), Mask);
9441 }
9442
9443 /// \brief Generic lowering of 8-lane i16 shuffles.
9444 ///
9445 /// This handles both single-input shuffles and combined shuffle/blends with
9446 /// two inputs. The single input shuffles are immediately delegated to
9447 /// a dedicated lowering routine.
9448 ///
9449 /// The blends are lowered in one of three fundamental ways. If there are few
9450 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9451 /// of the input is significantly cheaper when lowered as an interleaving of
9452 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9453 /// halves of the inputs separately (making them have relatively few inputs)
9454 /// and then concatenate them.
9455 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9456                                        const X86Subtarget *Subtarget,
9457                                        SelectionDAG &DAG) {
9458   SDLoc DL(Op);
9459   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9460   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9461   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9462   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9463   ArrayRef<int> OrigMask = SVOp->getMask();
9464   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9465                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9466   MutableArrayRef<int> Mask(MaskStorage);
9467
9468   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9469
9470   // Whenever we can lower this as a zext, that instruction is strictly faster
9471   // than any alternative.
9472   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9473           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9474     return ZExt;
9475
9476   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9477   auto isV2 = [](int M) { return M >= 8; };
9478
9479   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9480   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9481
9482   if (NumV2Inputs == 0)
9483     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9484
9485   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9486                             "to be V1-input shuffles.");
9487
9488   // Try to use bit shift instructions.
9489   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9490           DL, MVT::v8i16, V1, V2, Mask, DAG))
9491     return Shift;
9492
9493   // Try to use byte shift instructions.
9494   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9495           DL, MVT::v8i16, V1, V2, Mask, DAG))
9496     return Shift;
9497
9498   // There are special ways we can lower some single-element blends.
9499   if (NumV2Inputs == 1)
9500     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9501                                                          Mask, Subtarget, DAG))
9502       return V;
9503
9504   if (Subtarget->hasSSE41())
9505     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9506                                                   Subtarget, DAG))
9507       return Blend;
9508
9509   if (SDValue Masked =
9510           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9511     return Masked;
9512
9513   // Use dedicated unpack instructions for masks that match their pattern.
9514   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9515     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9516   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9517     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9518
9519   // Try to use byte rotation instructions.
9520   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9521           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9522     return Rotate;
9523
9524   if (NumV1Inputs + NumV2Inputs <= 4)
9525     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9526
9527   // Check whether an interleaving lowering is likely to be more efficient.
9528   // This isn't perfect but it is a strong heuristic that tends to work well on
9529   // the kinds of shuffles that show up in practice.
9530   //
9531   // FIXME: Handle 1x, 2x, and 4x interleaving.
9532   if (shouldLowerAsInterleaving(Mask)) {
9533     // FIXME: Figure out whether we should pack these into the low or high
9534     // halves.
9535
9536     int EMask[8], OMask[8];
9537     for (int i = 0; i < 4; ++i) {
9538       EMask[i] = Mask[2*i];
9539       OMask[i] = Mask[2*i + 1];
9540       EMask[i + 4] = -1;
9541       OMask[i + 4] = -1;
9542     }
9543
9544     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9545     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9546
9547     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9548   }
9549
9550   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9551   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9552
9553   for (int i = 0; i < 4; ++i) {
9554     LoBlendMask[i] = Mask[i];
9555     HiBlendMask[i] = Mask[i + 4];
9556   }
9557
9558   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9559   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9560   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9561   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9562
9563   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9564                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9565 }
9566
9567 /// \brief Check whether a compaction lowering can be done by dropping even
9568 /// elements and compute how many times even elements must be dropped.
9569 ///
9570 /// This handles shuffles which take every Nth element where N is a power of
9571 /// two. Example shuffle masks:
9572 ///
9573 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9574 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9575 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9576 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9577 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9578 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9579 ///
9580 /// Any of these lanes can of course be undef.
9581 ///
9582 /// This routine only supports N <= 3.
9583 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9584 /// for larger N.
9585 ///
9586 /// \returns N above, or the number of times even elements must be dropped if
9587 /// there is such a number. Otherwise returns zero.
9588 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9589   // Figure out whether we're looping over two inputs or just one.
9590   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9591
9592   // The modulus for the shuffle vector entries is based on whether this is
9593   // a single input or not.
9594   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9595   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9596          "We should only be called with masks with a power-of-2 size!");
9597
9598   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9599
9600   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9601   // and 2^3 simultaneously. This is because we may have ambiguity with
9602   // partially undef inputs.
9603   bool ViableForN[3] = {true, true, true};
9604
9605   for (int i = 0, e = Mask.size(); i < e; ++i) {
9606     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9607     // want.
9608     if (Mask[i] == -1)
9609       continue;
9610
9611     bool IsAnyViable = false;
9612     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9613       if (ViableForN[j]) {
9614         uint64_t N = j + 1;
9615
9616         // The shuffle mask must be equal to (i * 2^N) % M.
9617         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9618           IsAnyViable = true;
9619         else
9620           ViableForN[j] = false;
9621       }
9622     // Early exit if we exhaust the possible powers of two.
9623     if (!IsAnyViable)
9624       break;
9625   }
9626
9627   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9628     if (ViableForN[j])
9629       return j + 1;
9630
9631   // Return 0 as there is no viable power of two.
9632   return 0;
9633 }
9634
9635 /// \brief Generic lowering of v16i8 shuffles.
9636 ///
9637 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9638 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9639 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9640 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9641 /// back together.
9642 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9643                                        const X86Subtarget *Subtarget,
9644                                        SelectionDAG &DAG) {
9645   SDLoc DL(Op);
9646   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9647   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9648   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9649   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9650   ArrayRef<int> OrigMask = SVOp->getMask();
9651   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9652
9653   // Try to use bit shift instructions.
9654   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9655           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9656     return Shift;
9657
9658   // Try to use byte shift instructions.
9659   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9660           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9661     return Shift;
9662
9663   // Try to use byte rotation instructions.
9664   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9665           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9666     return Rotate;
9667
9668   // Try to use a zext lowering.
9669   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9670           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9671     return ZExt;
9672
9673   int MaskStorage[16] = {
9674       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9675       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9676       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9677       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9678   MutableArrayRef<int> Mask(MaskStorage);
9679   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9680   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9681
9682   int NumV2Elements =
9683       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9684
9685   // For single-input shuffles, there are some nicer lowering tricks we can use.
9686   if (NumV2Elements == 0) {
9687     // Check for being able to broadcast a single element.
9688     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9689                                                           Mask, Subtarget, DAG))
9690       return Broadcast;
9691
9692     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9693     // Notably, this handles splat and partial-splat shuffles more efficiently.
9694     // However, it only makes sense if the pre-duplication shuffle simplifies
9695     // things significantly. Currently, this means we need to be able to
9696     // express the pre-duplication shuffle as an i16 shuffle.
9697     //
9698     // FIXME: We should check for other patterns which can be widened into an
9699     // i16 shuffle as well.
9700     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9701       for (int i = 0; i < 16; i += 2)
9702         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9703           return false;
9704
9705       return true;
9706     };
9707     auto tryToWidenViaDuplication = [&]() -> SDValue {
9708       if (!canWidenViaDuplication(Mask))
9709         return SDValue();
9710       SmallVector<int, 4> LoInputs;
9711       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9712                    [](int M) { return M >= 0 && M < 8; });
9713       std::sort(LoInputs.begin(), LoInputs.end());
9714       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9715                      LoInputs.end());
9716       SmallVector<int, 4> HiInputs;
9717       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9718                    [](int M) { return M >= 8; });
9719       std::sort(HiInputs.begin(), HiInputs.end());
9720       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9721                      HiInputs.end());
9722
9723       bool TargetLo = LoInputs.size() >= HiInputs.size();
9724       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9725       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9726
9727       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9728       SmallDenseMap<int, int, 8> LaneMap;
9729       for (int I : InPlaceInputs) {
9730         PreDupI16Shuffle[I/2] = I/2;
9731         LaneMap[I] = I;
9732       }
9733       int j = TargetLo ? 0 : 4, je = j + 4;
9734       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9735         // Check if j is already a shuffle of this input. This happens when
9736         // there are two adjacent bytes after we move the low one.
9737         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9738           // If we haven't yet mapped the input, search for a slot into which
9739           // we can map it.
9740           while (j < je && PreDupI16Shuffle[j] != -1)
9741             ++j;
9742
9743           if (j == je)
9744             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9745             return SDValue();
9746
9747           // Map this input with the i16 shuffle.
9748           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9749         }
9750
9751         // Update the lane map based on the mapping we ended up with.
9752         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9753       }
9754       V1 = DAG.getNode(
9755           ISD::BITCAST, DL, MVT::v16i8,
9756           DAG.getVectorShuffle(MVT::v8i16, DL,
9757                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9758                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9759
9760       // Unpack the bytes to form the i16s that will be shuffled into place.
9761       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9762                        MVT::v16i8, V1, V1);
9763
9764       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9765       for (int i = 0; i < 16; ++i)
9766         if (Mask[i] != -1) {
9767           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9768           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9769           if (PostDupI16Shuffle[i / 2] == -1)
9770             PostDupI16Shuffle[i / 2] = MappedMask;
9771           else
9772             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9773                    "Conflicting entrties in the original shuffle!");
9774         }
9775       return DAG.getNode(
9776           ISD::BITCAST, DL, MVT::v16i8,
9777           DAG.getVectorShuffle(MVT::v8i16, DL,
9778                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9779                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9780     };
9781     if (SDValue V = tryToWidenViaDuplication())
9782       return V;
9783   }
9784
9785   // Check whether an interleaving lowering is likely to be more efficient.
9786   // This isn't perfect but it is a strong heuristic that tends to work well on
9787   // the kinds of shuffles that show up in practice.
9788   //
9789   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9790   if (shouldLowerAsInterleaving(Mask)) {
9791     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9792       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9793     });
9794     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9795       return (M >= 8 && M < 16) || M >= 24;
9796     });
9797     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9798                      -1, -1, -1, -1, -1, -1, -1, -1};
9799     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9800                      -1, -1, -1, -1, -1, -1, -1, -1};
9801     bool UnpackLo = NumLoHalf >= NumHiHalf;
9802     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9803     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9804     for (int i = 0; i < 8; ++i) {
9805       TargetEMask[i] = Mask[2 * i];
9806       TargetOMask[i] = Mask[2 * i + 1];
9807     }
9808
9809     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9810     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9811
9812     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9813                        MVT::v16i8, Evens, Odds);
9814   }
9815
9816   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9817   // with PSHUFB. It is important to do this before we attempt to generate any
9818   // blends but after all of the single-input lowerings. If the single input
9819   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9820   // want to preserve that and we can DAG combine any longer sequences into
9821   // a PSHUFB in the end. But once we start blending from multiple inputs,
9822   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9823   // and there are *very* few patterns that would actually be faster than the
9824   // PSHUFB approach because of its ability to zero lanes.
9825   //
9826   // FIXME: The only exceptions to the above are blends which are exact
9827   // interleavings with direct instructions supporting them. We currently don't
9828   // handle those well here.
9829   if (Subtarget->hasSSSE3()) {
9830     SDValue V1Mask[16];
9831     SDValue V2Mask[16];
9832     bool V1InUse = false;
9833     bool V2InUse = false;
9834     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9835
9836     for (int i = 0; i < 16; ++i) {
9837       if (Mask[i] == -1) {
9838         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9839       } else {
9840         const int ZeroMask = 0x80;
9841         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9842         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9843         if (Zeroable[i])
9844           V1Idx = V2Idx = ZeroMask;
9845         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9846         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9847         V1InUse |= (ZeroMask != V1Idx);
9848         V2InUse |= (ZeroMask != V2Idx);
9849       }
9850     }
9851
9852     if (V1InUse)
9853       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9854                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9855     if (V2InUse)
9856       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9857                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9858
9859     // If we need shuffled inputs from both, blend the two.
9860     if (V1InUse && V2InUse)
9861       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9862     if (V1InUse)
9863       return V1; // Single inputs are easy.
9864     if (V2InUse)
9865       return V2; // Single inputs are easy.
9866     // Shuffling to a zeroable vector.
9867     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9868   }
9869
9870   // There are special ways we can lower some single-element blends.
9871   if (NumV2Elements == 1)
9872     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9873                                                          Mask, Subtarget, DAG))
9874       return V;
9875
9876   // Check whether a compaction lowering can be done. This handles shuffles
9877   // which take every Nth element for some even N. See the helper function for
9878   // details.
9879   //
9880   // We special case these as they can be particularly efficiently handled with
9881   // the PACKUSB instruction on x86 and they show up in common patterns of
9882   // rearranging bytes to truncate wide elements.
9883   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9884     // NumEvenDrops is the power of two stride of the elements. Another way of
9885     // thinking about it is that we need to drop the even elements this many
9886     // times to get the original input.
9887     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9888
9889     // First we need to zero all the dropped bytes.
9890     assert(NumEvenDrops <= 3 &&
9891            "No support for dropping even elements more than 3 times.");
9892     // We use the mask type to pick which bytes are preserved based on how many
9893     // elements are dropped.
9894     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9895     SDValue ByteClearMask =
9896         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9897                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9898     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9899     if (!IsSingleInput)
9900       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9901
9902     // Now pack things back together.
9903     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9904     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9905     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9906     for (int i = 1; i < NumEvenDrops; ++i) {
9907       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9908       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9909     }
9910
9911     return Result;
9912   }
9913
9914   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9915   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9916   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9917   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9918
9919   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9920                             MutableArrayRef<int> V1HalfBlendMask,
9921                             MutableArrayRef<int> V2HalfBlendMask) {
9922     for (int i = 0; i < 8; ++i)
9923       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9924         V1HalfBlendMask[i] = HalfMask[i];
9925         HalfMask[i] = i;
9926       } else if (HalfMask[i] >= 16) {
9927         V2HalfBlendMask[i] = HalfMask[i] - 16;
9928         HalfMask[i] = i + 8;
9929       }
9930   };
9931   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9932   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9933
9934   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9935
9936   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9937                              MutableArrayRef<int> HiBlendMask) {
9938     SDValue V1, V2;
9939     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9940     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9941     // i16s.
9942     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9943                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9944         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9945                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9946       // Use a mask to drop the high bytes.
9947       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9948       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9949                        DAG.getConstant(0x00FF, MVT::v8i16));
9950
9951       // This will be a single vector shuffle instead of a blend so nuke V2.
9952       V2 = DAG.getUNDEF(MVT::v8i16);
9953
9954       // Squash the masks to point directly into V1.
9955       for (int &M : LoBlendMask)
9956         if (M >= 0)
9957           M /= 2;
9958       for (int &M : HiBlendMask)
9959         if (M >= 0)
9960           M /= 2;
9961     } else {
9962       // Otherwise just unpack the low half of V into V1 and the high half into
9963       // V2 so that we can blend them as i16s.
9964       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9965                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9966       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9967                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9968     }
9969
9970     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9971     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9972     return std::make_pair(BlendedLo, BlendedHi);
9973   };
9974   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9975   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9976   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9977
9978   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9979   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9980
9981   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9982 }
9983
9984 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9985 ///
9986 /// This routine breaks down the specific type of 128-bit shuffle and
9987 /// dispatches to the lowering routines accordingly.
9988 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9989                                         MVT VT, const X86Subtarget *Subtarget,
9990                                         SelectionDAG &DAG) {
9991   switch (VT.SimpleTy) {
9992   case MVT::v2i64:
9993     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9994   case MVT::v2f64:
9995     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9996   case MVT::v4i32:
9997     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9998   case MVT::v4f32:
9999     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10000   case MVT::v8i16:
10001     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10002   case MVT::v16i8:
10003     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10004
10005   default:
10006     llvm_unreachable("Unimplemented!");
10007   }
10008 }
10009
10010 /// \brief Helper function to test whether a shuffle mask could be
10011 /// simplified by widening the elements being shuffled.
10012 ///
10013 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10014 /// leaves it in an unspecified state.
10015 ///
10016 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10017 /// shuffle masks. The latter have the special property of a '-2' representing
10018 /// a zero-ed lane of a vector.
10019 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10020                                     SmallVectorImpl<int> &WidenedMask) {
10021   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10022     // If both elements are undef, its trivial.
10023     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10024       WidenedMask.push_back(SM_SentinelUndef);
10025       continue;
10026     }
10027
10028     // Check for an undef mask and a mask value properly aligned to fit with
10029     // a pair of values. If we find such a case, use the non-undef mask's value.
10030     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10031       WidenedMask.push_back(Mask[i + 1] / 2);
10032       continue;
10033     }
10034     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10035       WidenedMask.push_back(Mask[i] / 2);
10036       continue;
10037     }
10038
10039     // When zeroing, we need to spread the zeroing across both lanes to widen.
10040     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10041       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10042           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10043         WidenedMask.push_back(SM_SentinelZero);
10044         continue;
10045       }
10046       return false;
10047     }
10048
10049     // Finally check if the two mask values are adjacent and aligned with
10050     // a pair.
10051     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10052       WidenedMask.push_back(Mask[i] / 2);
10053       continue;
10054     }
10055
10056     // Otherwise we can't safely widen the elements used in this shuffle.
10057     return false;
10058   }
10059   assert(WidenedMask.size() == Mask.size() / 2 &&
10060          "Incorrect size of mask after widening the elements!");
10061
10062   return true;
10063 }
10064
10065 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
10066 ///
10067 /// This routine just extracts two subvectors, shuffles them independently, and
10068 /// then concatenates them back together. This should work effectively with all
10069 /// AVX vector shuffle types.
10070 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10071                                           SDValue V2, ArrayRef<int> Mask,
10072                                           SelectionDAG &DAG) {
10073   assert(VT.getSizeInBits() >= 256 &&
10074          "Only for 256-bit or wider vector shuffles!");
10075   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10076   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10077
10078   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10079   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10080
10081   int NumElements = VT.getVectorNumElements();
10082   int SplitNumElements = NumElements / 2;
10083   MVT ScalarVT = VT.getScalarType();
10084   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10085
10086   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10087                              DAG.getIntPtrConstant(0));
10088   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10089                              DAG.getIntPtrConstant(SplitNumElements));
10090   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10091                              DAG.getIntPtrConstant(0));
10092   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10093                              DAG.getIntPtrConstant(SplitNumElements));
10094
10095   // Now create two 4-way blends of these half-width vectors.
10096   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10097     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10098     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10099     for (int i = 0; i < SplitNumElements; ++i) {
10100       int M = HalfMask[i];
10101       if (M >= NumElements) {
10102         if (M >= NumElements + SplitNumElements)
10103           UseHiV2 = true;
10104         else
10105           UseLoV2 = true;
10106         V2BlendMask.push_back(M - NumElements);
10107         V1BlendMask.push_back(-1);
10108         BlendMask.push_back(SplitNumElements + i);
10109       } else if (M >= 0) {
10110         if (M >= SplitNumElements)
10111           UseHiV1 = true;
10112         else
10113           UseLoV1 = true;
10114         V2BlendMask.push_back(-1);
10115         V1BlendMask.push_back(M);
10116         BlendMask.push_back(i);
10117       } else {
10118         V2BlendMask.push_back(-1);
10119         V1BlendMask.push_back(-1);
10120         BlendMask.push_back(-1);
10121       }
10122     }
10123
10124     // Because the lowering happens after all combining takes place, we need to
10125     // manually combine these blend masks as much as possible so that we create
10126     // a minimal number of high-level vector shuffle nodes.
10127
10128     // First try just blending the halves of V1 or V2.
10129     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10130       return DAG.getUNDEF(SplitVT);
10131     if (!UseLoV2 && !UseHiV2)
10132       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10133     if (!UseLoV1 && !UseHiV1)
10134       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10135
10136     SDValue V1Blend, V2Blend;
10137     if (UseLoV1 && UseHiV1) {
10138       V1Blend =
10139         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10140     } else {
10141       // We only use half of V1 so map the usage down into the final blend mask.
10142       V1Blend = UseLoV1 ? LoV1 : HiV1;
10143       for (int i = 0; i < SplitNumElements; ++i)
10144         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10145           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10146     }
10147     if (UseLoV2 && UseHiV2) {
10148       V2Blend =
10149         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10150     } else {
10151       // We only use half of V2 so map the usage down into the final blend mask.
10152       V2Blend = UseLoV2 ? LoV2 : HiV2;
10153       for (int i = 0; i < SplitNumElements; ++i)
10154         if (BlendMask[i] >= SplitNumElements)
10155           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10156     }
10157     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10158   };
10159   SDValue Lo = HalfBlend(LoMask);
10160   SDValue Hi = HalfBlend(HiMask);
10161   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10162 }
10163
10164 /// \brief Either split a vector in halves or decompose the shuffles and the
10165 /// blend.
10166 ///
10167 /// This is provided as a good fallback for many lowerings of non-single-input
10168 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10169 /// between splitting the shuffle into 128-bit components and stitching those
10170 /// back together vs. extracting the single-input shuffles and blending those
10171 /// results.
10172 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10173                                                 SDValue V2, ArrayRef<int> Mask,
10174                                                 SelectionDAG &DAG) {
10175   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10176                                             "lower single-input shuffles as it "
10177                                             "could then recurse on itself.");
10178   int Size = Mask.size();
10179
10180   // If this can be modeled as a broadcast of two elements followed by a blend,
10181   // prefer that lowering. This is especially important because broadcasts can
10182   // often fold with memory operands.
10183   auto DoBothBroadcast = [&] {
10184     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10185     for (int M : Mask)
10186       if (M >= Size) {
10187         if (V2BroadcastIdx == -1)
10188           V2BroadcastIdx = M - Size;
10189         else if (M - Size != V2BroadcastIdx)
10190           return false;
10191       } else if (M >= 0) {
10192         if (V1BroadcastIdx == -1)
10193           V1BroadcastIdx = M;
10194         else if (M != V1BroadcastIdx)
10195           return false;
10196       }
10197     return true;
10198   };
10199   if (DoBothBroadcast())
10200     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10201                                                       DAG);
10202
10203   // If the inputs all stem from a single 128-bit lane of each input, then we
10204   // split them rather than blending because the split will decompose to
10205   // unusually few instructions.
10206   int LaneCount = VT.getSizeInBits() / 128;
10207   int LaneSize = Size / LaneCount;
10208   SmallBitVector LaneInputs[2];
10209   LaneInputs[0].resize(LaneCount, false);
10210   LaneInputs[1].resize(LaneCount, false);
10211   for (int i = 0; i < Size; ++i)
10212     if (Mask[i] >= 0)
10213       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10214   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10215     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10216
10217   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10218   // that the decomposed single-input shuffles don't end up here.
10219   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10220 }
10221
10222 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10223 /// a permutation and blend of those lanes.
10224 ///
10225 /// This essentially blends the out-of-lane inputs to each lane into the lane
10226 /// from a permuted copy of the vector. This lowering strategy results in four
10227 /// instructions in the worst case for a single-input cross lane shuffle which
10228 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10229 /// of. Special cases for each particular shuffle pattern should be handled
10230 /// prior to trying this lowering.
10231 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10232                                                        SDValue V1, SDValue V2,
10233                                                        ArrayRef<int> Mask,
10234                                                        SelectionDAG &DAG) {
10235   // FIXME: This should probably be generalized for 512-bit vectors as well.
10236   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10237   int LaneSize = Mask.size() / 2;
10238
10239   // If there are only inputs from one 128-bit lane, splitting will in fact be
10240   // less expensive. The flags track wether the given lane contains an element
10241   // that crosses to another lane.
10242   bool LaneCrossing[2] = {false, false};
10243   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10244     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10245       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10246   if (!LaneCrossing[0] || !LaneCrossing[1])
10247     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10248
10249   if (isSingleInputShuffleMask(Mask)) {
10250     SmallVector<int, 32> FlippedBlendMask;
10251     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10252       FlippedBlendMask.push_back(
10253           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10254                                   ? Mask[i]
10255                                   : Mask[i] % LaneSize +
10256                                         (i / LaneSize) * LaneSize + Size));
10257
10258     // Flip the vector, and blend the results which should now be in-lane. The
10259     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10260     // 5 for the high source. The value 3 selects the high half of source 2 and
10261     // the value 2 selects the low half of source 2. We only use source 2 to
10262     // allow folding it into a memory operand.
10263     unsigned PERMMask = 3 | 2 << 4;
10264     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10265                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10266     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10267   }
10268
10269   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10270   // will be handled by the above logic and a blend of the results, much like
10271   // other patterns in AVX.
10272   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10273 }
10274
10275 /// \brief Handle lowering 2-lane 128-bit shuffles.
10276 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10277                                         SDValue V2, ArrayRef<int> Mask,
10278                                         const X86Subtarget *Subtarget,
10279                                         SelectionDAG &DAG) {
10280   // Blends are faster and handle all the non-lane-crossing cases.
10281   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10282                                                 Subtarget, DAG))
10283     return Blend;
10284
10285   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10286                                VT.getVectorNumElements() / 2);
10287   // Check for patterns which can be matched with a single insert of a 128-bit
10288   // subvector.
10289   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10290       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10291     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10292                               DAG.getIntPtrConstant(0));
10293     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10294                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10295     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10296   }
10297   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10298     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10299                               DAG.getIntPtrConstant(0));
10300     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10301                               DAG.getIntPtrConstant(2));
10302     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10303   }
10304
10305   // Otherwise form a 128-bit permutation.
10306   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10307   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10308   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10309                      DAG.getConstant(PermMask, MVT::i8));
10310 }
10311
10312 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10313 /// shuffling each lane.
10314 ///
10315 /// This will only succeed when the result of fixing the 128-bit lanes results
10316 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10317 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10318 /// the lane crosses early and then use simpler shuffles within each lane.
10319 ///
10320 /// FIXME: It might be worthwhile at some point to support this without
10321 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10322 /// in x86 only floating point has interesting non-repeating shuffles, and even
10323 /// those are still *marginally* more expensive.
10324 static SDValue lowerVectorShuffleByMerging128BitLanes(
10325     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10326     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10327   assert(!isSingleInputShuffleMask(Mask) &&
10328          "This is only useful with multiple inputs.");
10329
10330   int Size = Mask.size();
10331   int LaneSize = 128 / VT.getScalarSizeInBits();
10332   int NumLanes = Size / LaneSize;
10333   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10334
10335   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10336   // check whether the in-128-bit lane shuffles share a repeating pattern.
10337   SmallVector<int, 4> Lanes;
10338   Lanes.resize(NumLanes, -1);
10339   SmallVector<int, 4> InLaneMask;
10340   InLaneMask.resize(LaneSize, -1);
10341   for (int i = 0; i < Size; ++i) {
10342     if (Mask[i] < 0)
10343       continue;
10344
10345     int j = i / LaneSize;
10346
10347     if (Lanes[j] < 0) {
10348       // First entry we've seen for this lane.
10349       Lanes[j] = Mask[i] / LaneSize;
10350     } else if (Lanes[j] != Mask[i] / LaneSize) {
10351       // This doesn't match the lane selected previously!
10352       return SDValue();
10353     }
10354
10355     // Check that within each lane we have a consistent shuffle mask.
10356     int k = i % LaneSize;
10357     if (InLaneMask[k] < 0) {
10358       InLaneMask[k] = Mask[i] % LaneSize;
10359     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10360       // This doesn't fit a repeating in-lane mask.
10361       return SDValue();
10362     }
10363   }
10364
10365   // First shuffle the lanes into place.
10366   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10367                                 VT.getSizeInBits() / 64);
10368   SmallVector<int, 8> LaneMask;
10369   LaneMask.resize(NumLanes * 2, -1);
10370   for (int i = 0; i < NumLanes; ++i)
10371     if (Lanes[i] >= 0) {
10372       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10373       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10374     }
10375
10376   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10377   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10378   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10379
10380   // Cast it back to the type we actually want.
10381   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10382
10383   // Now do a simple shuffle that isn't lane crossing.
10384   SmallVector<int, 8> NewMask;
10385   NewMask.resize(Size, -1);
10386   for (int i = 0; i < Size; ++i)
10387     if (Mask[i] >= 0)
10388       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10389   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10390          "Must not introduce lane crosses at this point!");
10391
10392   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10393 }
10394
10395 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10396 /// given mask.
10397 ///
10398 /// This returns true if the elements from a particular input are already in the
10399 /// slot required by the given mask and require no permutation.
10400 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10401   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10402   int Size = Mask.size();
10403   for (int i = 0; i < Size; ++i)
10404     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10405       return false;
10406
10407   return true;
10408 }
10409
10410 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10411 ///
10412 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10413 /// isn't available.
10414 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10415                                        const X86Subtarget *Subtarget,
10416                                        SelectionDAG &DAG) {
10417   SDLoc DL(Op);
10418   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10419   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10420   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10421   ArrayRef<int> Mask = SVOp->getMask();
10422   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10423
10424   SmallVector<int, 4> WidenedMask;
10425   if (canWidenShuffleElements(Mask, WidenedMask))
10426     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10427                                     DAG);
10428
10429   if (isSingleInputShuffleMask(Mask)) {
10430     // Check for being able to broadcast a single element.
10431     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10432                                                           Mask, Subtarget, DAG))
10433       return Broadcast;
10434
10435     // Use low duplicate instructions for masks that match their pattern.
10436     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10437       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10438
10439     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10440       // Non-half-crossing single input shuffles can be lowerid with an
10441       // interleaved permutation.
10442       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10443                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10444       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10445                          DAG.getConstant(VPERMILPMask, MVT::i8));
10446     }
10447
10448     // With AVX2 we have direct support for this permutation.
10449     if (Subtarget->hasAVX2())
10450       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10451                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10452
10453     // Otherwise, fall back.
10454     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10455                                                    DAG);
10456   }
10457
10458   // X86 has dedicated unpack instructions that can handle specific blend
10459   // operations: UNPCKH and UNPCKL.
10460   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10461     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10462   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10463     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10464
10465   // If we have a single input to the zero element, insert that into V1 if we
10466   // can do so cheaply.
10467   int NumV2Elements =
10468       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10469   if (NumV2Elements == 1 && Mask[0] >= 4)
10470     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10471             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10472       return Insertion;
10473
10474   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10475                                                 Subtarget, DAG))
10476     return Blend;
10477
10478   // Check if the blend happens to exactly fit that of SHUFPD.
10479   if ((Mask[0] == -1 || Mask[0] < 2) &&
10480       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10481       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10482       (Mask[3] == -1 || Mask[3] >= 6)) {
10483     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10484                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10485     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10486                        DAG.getConstant(SHUFPDMask, MVT::i8));
10487   }
10488   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10489       (Mask[1] == -1 || Mask[1] < 2) &&
10490       (Mask[2] == -1 || Mask[2] >= 6) &&
10491       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10492     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10493                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10494     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10495                        DAG.getConstant(SHUFPDMask, MVT::i8));
10496   }
10497
10498   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10499   // shuffle. However, if we have AVX2 and either input is already in place,
10500   // we will be able to shuffle the other input, even across lanes, in a
10501   // single instruction, so skip this pattern.
10502   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10503                                  isShuffleMaskInputInPlace(1, Mask))))
10504     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10505             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10506       return Result;
10507
10508   // If we have AVX2 then we always want to lower with a blend because at v4 we
10509   // can fully permute the elements.
10510   if (Subtarget->hasAVX2())
10511     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10512                                                       Mask, DAG);
10513
10514   // Otherwise fall back on generic lowering.
10515   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10516 }
10517
10518 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10519 ///
10520 /// This routine is only called when we have AVX2 and thus a reasonable
10521 /// instruction set for v4i64 shuffling.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the 4-entry mask; \p V1
      /// and \p V2 are its v4i64 inputs. The strategies below are tried in order
      /// and the first one that produces a node is returned, ending with the
      /// decomposed shuffle-and-blend fallback.
10522 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10523                                        const X86Subtarget *Subtarget,
10524                                        SelectionDAG &DAG) {
10525   SDLoc DL(Op);
10526   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10527   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10528   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10529   ArrayRef<int> Mask = SVOp->getMask();
10530   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10531   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10532
        // WidenedMask only receives the helper's output; the original Mask is what
        // gets forwarded to the 2 x 128-bit lowering.
        // NOTE(review): presumably a widenable mask moves whole 128-bit halves --
        // confirm against canWidenShuffleElements.
10533   SmallVector<int, 4> WidenedMask;
10534   if (canWidenShuffleElements(Mask, WidenedMask))
10535     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10536                                     DAG);
10537
        // Next preference: let lowerVectorShuffleAsBlend handle the mask if it can.
10538   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10539                                                 Subtarget, DAG))
10540     return Blend;
10541
10542   // Check for being able to broadcast a single element.
10543   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10544                                                         Mask, Subtarget, DAG))
10545     return Broadcast;
10546
10547   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10548   // use lower latency instructions that will operate on both 128-bit lanes.
10549   SmallVector<int, 2> RepeatedMask;
10550   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10551     if (isSingleInputShuffleMask(Mask)) {
            // PSHUFD is issued on the v8i32 bitcast of V1, so each 64-bit index in
            // the repeated mask is expanded into the two adjacent 32-bit indices
            // that cover it. Undefined (negative) entries stay -1.
10552       int PSHUFDMask[] = {-1, -1, -1, -1};
10553       for (int i = 0; i < 2; ++i)
10554         if (RepeatedMask[i] >= 0) {
10555           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10556           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10557         }
10558       return DAG.getNode(
10559           ISD::BITCAST, DL, MVT::v4i64,
10560           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10561                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10562                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10563     }
10564
10565     // Use dedicated unpack instructions for masks that match their pattern.
10566     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10567       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10568     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10569       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10570   }
10571
10572   // AVX2 provides a direct instruction for permuting a single input across
10573   // lanes.
10574   if (isSingleInputShuffleMask(Mask))
10575     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10576                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10577
10578   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10579   // shuffle. However, if we have AVX2 and either input is already in place,
10580   // we will be able to shuffle the other input, even across lanes, in a
10581   // single instruction, so skip this pattern. (AVX2 is already asserted at the
        // top of this routine, so in practice only the in-place checks decide.)
10582   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10583                                  isShuffleMaskInputInPlace(1, Mask))))
10584     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10585             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10586       return Result;
10587
10588   // Otherwise fall back on generic blend lowering.
10589   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10590                                                     Mask, DAG);
10591 }
10592
10593 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10594 ///
10595 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10596 /// isn't available.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the 8-entry mask; \p V1
      /// and \p V2 are its v8f32 inputs. Strategies are tried in order and the
      /// first that produces a node is returned. AVX2 is not required here; it is
      /// queried explicitly where an AVX2-only node is used.
10597 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10598                                        const X86Subtarget *Subtarget,
10599                                        SelectionDAG &DAG) {
10600   SDLoc DL(Op);
10601   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10602   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10603   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10604   ArrayRef<int> Mask = SVOp->getMask();
10605   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10606
        // First preference: let lowerVectorShuffleAsBlend handle the mask if it can.
10607   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10608                                                 Subtarget, DAG))
10609     return Blend;
10610
10611   // Check for being able to broadcast a single element.
10612   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10613                                                         Mask, Subtarget, DAG))
10614     return Broadcast;
10615
10616   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10617   // options to efficiently lower the shuffle.
10618   SmallVector<int, 4> RepeatedMask;
10619   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10620     assert(RepeatedMask.size() == 4 &&
10621            "Repeated masks must be half the mask width!");
10622
10623     // Use even/odd duplicate instructions for masks that match their pattern.
10624     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10625       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10626     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10627       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10628
          // A single-input repeated mask maps directly onto the immediate form,
          // with the 4-entry repeated mask encoded as the imm8.
10629     if (isSingleInputShuffleMask(Mask))
10630       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10631                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10632
10633     // Use dedicated unpack instructions for masks that match their pattern.
10634     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10635       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10636     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10637       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10638
10639     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10640     // have already handled any direct blends. We also need to squash the
10641     // repeated mask into a simulated v4f32 mask: entries of 8 or more are
          // rebased by 4 so that they land in the 4..7 range.
10642     for (int i = 0; i < 4; ++i)
10643       if (RepeatedMask[i] >= 8)
10644         RepeatedMask[i] -= 4;
10645     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10646   }
10647
10648   // If we have a single input shuffle with different shuffle patterns in the
10649   // two 128-bit lanes use the variable mask to VPERMILPS.
10650   if (isSingleInputShuffleMask(Mask)) {
          // Build one i32 index operand per element, mirroring Mask; negative
          // (undefined) entries become undef. The same operands feed both the
          // in-lane and the cross-lane node below.
10651     SDValue VPermMask[8];
10652     for (int i = 0; i < 8; ++i)
10653       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10654                                  : DAG.getConstant(Mask[i], MVT::i32);
          // In-lane case: operands are (source, index vector).
10655     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10656       return DAG.getNode(
10657           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10658           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10659
          // Cross-lane case with AVX2: note the operand order is reversed here
          // (index vector first, then source), and the v8i32 index vector is
          // bitcast to v8f32.
10660     if (Subtarget->hasAVX2())
10661       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10662                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10663                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10664                                                  MVT::v8i32, VPermMask)),
10665                          V1);
10666
10667     // Otherwise, fall back.
10668     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10669                                                    DAG);
10670   }
10671
10672   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10673   // shuffle.
10674   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10675           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10676     return Result;
10677
10678   // If we have AVX2 then we always want to lower with a blend because at v8 we
10679   // can fully permute the elements.
10680   if (Subtarget->hasAVX2())
10681     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10682                                                       Mask, DAG);
10683
10684   // Otherwise fall back on generic lowering.
10685   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10686 }
10687
10688 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10689 ///
10690 /// This routine is only called when we have AVX2 and thus a reasonable
10691 /// instruction set for v8i32 shuffling.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the 8-entry mask; \p V1
      /// and \p V2 are its v8i32 inputs. Strategies are tried in order and the
      /// first that produces a node is returned, ending with the decomposed
      /// shuffle-and-blend fallback.
10692 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10693                                        const X86Subtarget *Subtarget,
10694                                        SelectionDAG &DAG) {
10695   SDLoc DL(Op);
10696   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10697   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10698   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10699   ArrayRef<int> Mask = SVOp->getMask();
10700   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10701   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10702
10703   // Whenever we can lower this as a zext, that instruction is strictly faster
10704   // than any alternative. It also allows us to fold memory operands into the
10705   // shuffle in many cases.
10706   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10707                                                          Mask, Subtarget, DAG))
10708     return ZExt;
10709
        // Next preference: let lowerVectorShuffleAsBlend handle the mask if it can.
10710   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10711                                                 Subtarget, DAG))
10712     return Blend;
10713
10714   // Check for being able to broadcast a single element.
10715   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10716                                                         Mask, Subtarget, DAG))
10717     return Broadcast;
10718
10719   // If the shuffle mask is repeated in each 128-bit lane we can use more
10720   // efficient instructions that mirror the shuffles across the two 128-bit
10721   // lanes.
10722   SmallVector<int, 4> RepeatedMask;
10723   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10724     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
          // Single input: the 4-entry repeated mask is encoded as the PSHUFD imm8.
10725     if (isSingleInputShuffleMask(Mask))
10726       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10727                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10728
10729     // Use dedicated unpack instructions for masks that match their pattern.
10730     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10731       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10732     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10733       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10734   }
10735
10736   // If the shuffle patterns aren't repeated but it is a single input, directly
10737   // generate a cross-lane VPERMD instruction.
10738   if (isSingleInputShuffleMask(Mask)) {
          // One i32 index operand per element, mirroring Mask; negative
          // (undefined) entries become undef. The index vector is passed as the
          // first operand of the VPERMV node and the source as the second.
10739     SDValue VPermMask[8];
10740     for (int i = 0; i < 8; ++i)
10741       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10742                                  : DAG.getConstant(Mask[i], MVT::i32);
10743     return DAG.getNode(
10744         X86ISD::VPERMV, DL, MVT::v8i32,
10745         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10746   }
10747
10748   // Try to use bit shift instructions.
10749   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10750           DL, MVT::v8i32, V1, V2, Mask, DAG))
10751     return Shift;
10752
10753   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10754   // shuffle.
10755   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10756           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10757     return Result;
10758
10759   // Otherwise fall back on generic blend lowering.
10760   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10761                                                     Mask, DAG);
10762 }
10763
10764 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10765 ///
10766 /// This routine is only called when we have AVX2 and thus a reasonable
10767 /// instruction set for v16i16 shuffling.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the 16-entry mask; \p V1
      /// and \p V2 are its v16i16 inputs. Strategies are tried in order and the
      /// first that produces a node is returned, ending with the split-or-blend
      /// fallback.
10768 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10769                                         const X86Subtarget *Subtarget,
10770                                         SelectionDAG &DAG) {
10771   SDLoc DL(Op);
10772   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10773   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10774   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10775   ArrayRef<int> Mask = SVOp->getMask();
10776   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10777   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10778
10779   // Whenever we can lower this as a zext, that instruction is strictly faster
10780   // than any alternative. It also allows us to fold memory operands into the
10781   // shuffle in many cases.
10782   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10783                                                          Mask, Subtarget, DAG))
10784     return ZExt;
10785
10786   // Check for being able to broadcast a single element.
10787   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10788                                                         Mask, Subtarget, DAG))
10789     return Broadcast;
10790
        // Next preference: let lowerVectorShuffleAsBlend handle the mask if it can.
10791   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10792                                                 Subtarget, DAG))
10793     return Blend;
10794
10795   // Use dedicated unpack instructions for masks that match their pattern.
10796   if (isShuffleEquivalent(Mask,
10797                           // First 128-bit lane:
10798                           0, 16, 1, 17, 2, 18, 3, 19,
10799                           // Second 128-bit lane:
10800                           8, 24, 9, 25, 10, 26, 11, 27))
10801     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10802   if (isShuffleEquivalent(Mask,
10803                           // First 128-bit lane:
10804                           4, 20, 5, 21, 6, 22, 7, 23,
10805                           // Second 128-bit lane:
10806                           12, 28, 13, 29, 14, 30, 15, 31))
10807     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10808
10809   if (isSingleInputShuffleMask(Mask)) {
10810     // There are no generalized cross-lane shuffle operations available on i16
10811     // element types.
10812     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10813       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10814                                                      Mask, DAG);
10815
          // Lane-crossing masks were diverted above, so what remains is lowered
          // with PSHUFB on the v32i8 bitcast of V1. Each 16-bit element therefore
          // needs two byte selectors.
10816     SDValue PSHUFBMask[32];
10817     for (int i = 0; i < 16; ++i) {
10818       if (Mask[i] == -1) {
10819         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10820         continue;
10821       }
10822
            // For the upper eight elements the index is rebased by 8, i.e. made
            // relative to the start of the second 128-bit lane; the assert checks
            // that the result is a valid in-lane index. Element M then maps to
            // bytes 2*M and 2*M+1.
10823       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10824       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10825       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10826       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10827     }
10828     return DAG.getNode(
10829         ISD::BITCAST, DL, MVT::v16i16,
10830         DAG.getNode(
10831             X86ISD::PSHUFB, DL, MVT::v32i8,
10832             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10833             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10834   }
10835
10836   // Try to use bit shift instructions.
10837   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10838           DL, MVT::v16i16, V1, V2, Mask, DAG))
10839     return Shift;
10840
10841   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10842   // shuffle.
10843   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10844           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10845     return Result;
10846
10847   // Otherwise fall back on generic lowering.
10848   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10849 }
10850
10851 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10852 ///
10853 /// This routine is only called when we have AVX2 and thus a reasonable
10854 /// instruction set for v32i8 shuffling.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the 32-entry mask; \p V1
      /// and \p V2 are its v32i8 inputs. Strategies are tried in order and the
      /// first that produces a node is returned, ending with the split-or-blend
      /// fallback.
10855 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10856                                        const X86Subtarget *Subtarget,
10857                                        SelectionDAG &DAG) {
10858   SDLoc DL(Op);
10859   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10860   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10861   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10862   ArrayRef<int> Mask = SVOp->getMask();
10863   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10864   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10865
10866   // Whenever we can lower this as a zext, that instruction is strictly faster
10867   // than any alternative. It also allows us to fold memory operands into the
10868   // shuffle in many cases.
10869   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10870                                                          Mask, Subtarget, DAG))
10871     return ZExt;
10872
10873   // Check for being able to broadcast a single element.
10874   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10875                                                         Mask, Subtarget, DAG))
10876     return Broadcast;
10877
        // Next preference: let lowerVectorShuffleAsBlend handle the mask if it can.
10878   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10879                                                 Subtarget, DAG))
10880     return Blend;
10881
10882   // Use dedicated unpack instructions for masks that match their pattern.
10883   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10884   // 256-bit lanes.
10885   if (isShuffleEquivalent(
10886           Mask,
10887           // First 128-bit lane:
10888           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10889           // Second 128-bit lane:
10890           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10891     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10892   if (isShuffleEquivalent(
10893           Mask,
10894           // First 128-bit lane:
10895           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10896           // Second 128-bit lane:
10897           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10898     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10899
10900   if (isSingleInputShuffleMask(Mask)) {
10901     // There are no generalized cross-lane shuffle operations available on i8
10902     // element types.
10903     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10904       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10905                                                      Mask, DAG);
10906
          // Lane-crossing masks were diverted above, so what remains is lowered
          // with PSHUFB directly on V1. One i8 selector is built per byte:
          // negative (undefined) entries become undef, and indices of 16 or more
          // are rebased by 16 so every selector is in the 0..15 range.
10907     SDValue PSHUFBMask[32];
10908     for (int i = 0; i < 32; ++i)
10909       PSHUFBMask[i] =
10910           Mask[i] < 0
10911               ? DAG.getUNDEF(MVT::i8)
10912               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10913
10914     return DAG.getNode(
10915         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10916         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10917   }
10918
10919   // Try to use bit shift instructions.
10920   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10921           DL, MVT::v32i8, V1, V2, Mask, DAG))
10922     return Shift;
10923
10924   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10925   // shuffle.
10926   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10927           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10928     return Result;
10929
10930   // Otherwise fall back on generic lowering.
10931   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10932 }
10933
10934 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10935 ///
10936 /// This routine either breaks down the specific type of a 256-bit x86 vector
10937 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10938 /// together based on the available instructions.
      ///
      /// \p Op is cast to a ShuffleVectorSDNode to obtain the mask, \p V1 and
      /// \p V2 are the shuffle inputs, and \p VT selects the per-type routine.
      /// Any type not listed in the switch below hits llvm_unreachable.
10939 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10940                                         MVT VT, const X86Subtarget *Subtarget,
10941                                         SelectionDAG &DAG) {
10942   SDLoc DL(Op);
10943   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10944   ArrayRef<int> Mask = SVOp->getMask();
10945
10946   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10947   // check for those subtargets here and avoid much of the subtarget querying in
10948   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10949   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10950   // floating point types there eventually, just immediately cast everything to
10951   // a float and operate entirely in that domain.
10952   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10953     int ElementBits = VT.getScalarSizeInBits();
10954     if (ElementBits < 32)
10955       // No floating point type available, decompose into 128-bit vectors.
10956       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10957
          // Build the floating point vector type with the same element width and
          // element count, re-issue the shuffle with the same mask on the bitcast
          // inputs, and bitcast the result back to the integer type.
10958     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10959                                 VT.getVectorNumElements());
10960     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10961     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10962     return DAG.getNode(ISD::BITCAST, DL, VT,
10963                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10964   }
10965
        // Dispatch on the concrete vector type. Integer types only reach this
        // point when AVX2 is available, because of the early return above.
10966   switch (VT.SimpleTy) {
10967   case MVT::v4f64:
10968     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10969   case MVT::v4i64:
10970     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10971   case MVT::v8f32:
10972     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10973   case MVT::v8i32:
10974     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10975   case MVT::v16i16:
10976     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10977   case MVT::v32i8:
10978     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10979
10980   default:
10981     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10982   }
10983 }
10984
10985 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
      ///
      /// Only the two unpack patterns are matched directly; every other mask is
      /// handed to splitAndLowerVectorShuffle. \p Op is cast to a
      /// ShuffleVectorSDNode to obtain the 8-entry mask, and \p V1 / \p V2 are its
      /// v8f64 inputs. \p Subtarget is not consulted in this routine.
10986 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10987                                        const X86Subtarget *Subtarget,
10988                                        SelectionDAG &DAG) {
10989   SDLoc DL(Op);
10990   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10991   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10992   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10993   ArrayRef<int> Mask = SVOp->getMask();
10994   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10995
10996   // X86 has dedicated unpack instructions that can handle specific blend
10997   // operations: UNPCKH and UNPCKL.
10998   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10999     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11000   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11001     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11002
11003   // FIXME: Implement direct support for this type!
11004   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11005 }
11006
11007 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
      ///
      /// Only the two unpack patterns are matched directly; every other mask is
      /// handed to splitAndLowerVectorShuffle. \p Op is cast to a
      /// ShuffleVectorSDNode to obtain the 16-entry mask, and \p V1 / \p V2 are
      /// its v16f32 inputs. \p Subtarget is not consulted in this routine.
11008 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11009                                        const X86Subtarget *Subtarget,
11010                                        SelectionDAG &DAG) {
11011   SDLoc DL(Op);
11012   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11013   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11014   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11015   ArrayRef<int> Mask = SVOp->getMask();
11016   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11017
11018   // Use dedicated unpack instructions for masks that match their pattern.
        // Each group of four mask entries below covers one 128-bit lane.
11019   if (isShuffleEquivalent(Mask,
11020                           0, 16, 1, 17, 4, 20, 5, 21,
11021                           8, 24, 9, 25, 12, 28, 13, 29))
11022     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11023   if (isShuffleEquivalent(Mask,
11024                           2, 18, 3, 19, 6, 22, 7, 23,
11025                           10, 26, 11, 27, 14, 30, 15, 31))
11026     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11027
11028   // FIXME: Implement direct support for this type!
11029   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11030 }
11031
11032 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
      ///
      /// Only the two unpack patterns are matched directly; every other mask is
      /// handed to splitAndLowerVectorShuffle. \p Op is cast to a
      /// ShuffleVectorSDNode to obtain the 8-entry mask, and \p V1 / \p V2 are its
      /// v8i64 inputs. \p Subtarget is not consulted in this routine.
11033 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11034                                        const X86Subtarget *Subtarget,
11035                                        SelectionDAG &DAG) {
11036   SDLoc DL(Op);
11037   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11038   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11039   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11040   ArrayRef<int> Mask = SVOp->getMask();
11041   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11042
11043   // X86 has dedicated unpack instructions that can handle specific blend
11044   // operations: UNPCKH and UNPCKL.
11045   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11046     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11047   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11048     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11049
11050   // FIXME: Implement direct support for this type!
11051   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11052 }
11053
11054 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
      ///
      /// Only the two unpack patterns are matched directly; every other mask is
      /// handed to splitAndLowerVectorShuffle. \p Op is cast to a
      /// ShuffleVectorSDNode to obtain the 16-entry mask, and \p V1 / \p V2 are
      /// its v16i32 inputs. \p Subtarget is not consulted in this routine.
11055 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11056                                        const X86Subtarget *Subtarget,
11057                                        SelectionDAG &DAG) {
11058   SDLoc DL(Op);
11059   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11060   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11061   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11062   ArrayRef<int> Mask = SVOp->getMask();
11063   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11064
11065   // Use dedicated unpack instructions for masks that match their pattern.
        // Each group of four mask entries below covers one 128-bit lane.
11066   if (isShuffleEquivalent(Mask,
11067                           0, 16, 1, 17, 4, 20, 5, 21,
11068                           8, 24, 9, 25, 12, 28, 13, 29))
11069     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11070   if (isShuffleEquivalent(Mask,
11071                           2, 18, 3, 19, 6, 22, 7, 23,
11072                           10, 26, 11, 27, 14, 30, 15, 31))
11073     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11074
11075   // FIXME: Implement direct support for this type!
11076   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11077 }
11078
11079 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
      ///
      /// No pattern is matched directly yet: after validating its inputs the
      /// routine always hands the shuffle to splitAndLowerVectorShuffle. \p Op is
      /// cast to a ShuffleVectorSDNode to obtain the 32-entry mask, and \p V1 /
      /// \p V2 are its v32i16 inputs. \p Subtarget is only used by the hasBWI()
      /// assertion.
11080 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11081                                         const X86Subtarget *Subtarget,
11082                                         SelectionDAG &DAG) {
11083   SDLoc DL(Op);
11084   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11085   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11086   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11087   ArrayRef<int> Mask = SVOp->getMask();
11088   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11089   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11090
11091   // FIXME: Implement direct support for this type!
11092   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11093 }
11094
11095 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
      ///
      /// No pattern is matched directly yet: after validating its inputs the
      /// routine always hands the shuffle to splitAndLowerVectorShuffle. \p Op is
      /// cast to a ShuffleVectorSDNode to obtain the 64-entry mask, and \p V1 /
      /// \p V2 are its v64i8 inputs. \p Subtarget is only used by the hasBWI()
      /// assertion.
11096 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11097                                        const X86Subtarget *Subtarget,
11098                                        SelectionDAG &DAG) {
11099   SDLoc DL(Op);
11100   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11101   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11102   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11103   ArrayRef<int> Mask = SVOp->getMask();
11104   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11105   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11106
11107   // FIXME: Implement direct support for this type!
11108   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11109 }
11110
11111 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11112 ///
11113 /// This routine either breaks down the specific type of a 512-bit x86 vector
11114 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11115 /// together based on the available instructions.
11116 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11117                                         MVT VT, const X86Subtarget *Subtarget,
11118                                         SelectionDAG &DAG) {
11119   SDLoc DL(Op);
11120   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11121   ArrayRef<int> Mask = SVOp->getMask();
11122   assert(Subtarget->hasAVX512() &&
11123          "Cannot lower 512-bit vectors w/ basic ISA!");
11124
11125   // Check for being able to broadcast a single element.
11126   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11127                                                         Mask, Subtarget, DAG))
11128     return Broadcast;
11129
11130   // Dispatch to each element type for lowering. If we don't have supprot for
11131   // specific element type shuffles at 512 bits, immediately split them and
11132   // lower them. Each lowering routine of a given type is allowed to assume that
11133   // the requisite ISA extensions for that element type are available.
11134   switch (VT.SimpleTy) {
11135   case MVT::v8f64:
11136     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11137   case MVT::v16f32:
11138     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11139   case MVT::v8i64:
11140     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11141   case MVT::v16i32:
11142     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11143   case MVT::v32i16:
11144     if (Subtarget->hasBWI())
11145       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11146     break;
11147   case MVT::v64i8:
11148     if (Subtarget->hasBWI())
11149       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11150     break;
11151
11152   default:
11153     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11154   }
11155
11156   // Otherwise fall back on splitting.
11157   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11158 }
11159
11160 /// \brief Top-level lowering for x86 vector shuffles.
11161 ///
11162 /// This handles decomposition, canonicalization, and lowering of all x86
11163 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11164 /// above in helper routines. The canonicalization attempts to widen shuffles
11165 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11166 /// s.t. only one of the two inputs needs to be tested, etc.
11167 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11168                                   SelectionDAG &DAG) {
11169   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11170   ArrayRef<int> Mask = SVOp->getMask();
11171   SDValue V1 = Op.getOperand(0);
11172   SDValue V2 = Op.getOperand(1);
11173   MVT VT = Op.getSimpleValueType();
11174   int NumElements = VT.getVectorNumElements();
11175   SDLoc dl(Op);
11176
11177   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11178
11179   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11180   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11181   if (V1IsUndef && V2IsUndef)
11182     return DAG.getUNDEF(VT);
11183
11184   // When we create a shuffle node we put the UNDEF node to second operand,
11185   // but in some cases the first operand may be transformed to UNDEF.
11186   // In this case we should just commute the node.
11187   if (V1IsUndef)
11188     return DAG.getCommutedVectorShuffle(*SVOp);
11189
11190   // Check for non-undef masks pointing at an undef vector and make the masks
11191   // undef as well. This makes it easier to match the shuffle based solely on
11192   // the mask.
11193   if (V2IsUndef)
11194     for (int M : Mask)
11195       if (M >= NumElements) {
11196         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11197         for (int &M : NewMask)
11198           if (M >= NumElements)
11199             M = -1;
11200         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11201       }
11202
11203   // Try to collapse shuffles into using a vector type with fewer elements but
11204   // wider element types. We cap this to not form integers or floating point
11205   // elements wider than 64 bits, but it might be interesting to form i128
11206   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11207   SmallVector<int, 16> WidenedMask;
11208   if (VT.getScalarSizeInBits() < 64 &&
11209       canWidenShuffleElements(Mask, WidenedMask)) {
11210     MVT NewEltVT = VT.isFloatingPoint()
11211                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11212                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11213     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11214     // Make sure that the new vector type is legal. For example, v2f64 isn't
11215     // legal on SSE1.
11216     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11217       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11218       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11219       return DAG.getNode(ISD::BITCAST, dl, VT,
11220                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11221     }
11222   }
11223
11224   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11225   for (int M : SVOp->getMask())
11226     if (M < 0)
11227       ++NumUndefElements;
11228     else if (M < NumElements)
11229       ++NumV1Elements;
11230     else
11231       ++NumV2Elements;
11232
11233   // Commute the shuffle as needed such that more elements come from V1 than
11234   // V2. This allows us to match the shuffle pattern strictly on how many
11235   // elements come from V1 without handling the symmetric cases.
11236   if (NumV2Elements > NumV1Elements)
11237     return DAG.getCommutedVectorShuffle(*SVOp);
11238
11239   // When the number of V1 and V2 elements are the same, try to minimize the
11240   // number of uses of V2 in the low half of the vector. When that is tied,
11241   // ensure that the sum of indices for V1 is equal to or lower than the sum
11242   // indices for V2. When those are equal, try to ensure that the number of odd
11243   // indices for V1 is lower than the number of odd indices for V2.
11244   if (NumV1Elements == NumV2Elements) {
11245     int LowV1Elements = 0, LowV2Elements = 0;
11246     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11247       if (M >= NumElements)
11248         ++LowV2Elements;
11249       else if (M >= 0)
11250         ++LowV1Elements;
11251     if (LowV2Elements > LowV1Elements) {
11252       return DAG.getCommutedVectorShuffle(*SVOp);
11253     } else if (LowV2Elements == LowV1Elements) {
11254       int SumV1Indices = 0, SumV2Indices = 0;
11255       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11256         if (SVOp->getMask()[i] >= NumElements)
11257           SumV2Indices += i;
11258         else if (SVOp->getMask()[i] >= 0)
11259           SumV1Indices += i;
11260       if (SumV2Indices < SumV1Indices) {
11261         return DAG.getCommutedVectorShuffle(*SVOp);
11262       } else if (SumV2Indices == SumV1Indices) {
11263         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11264         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11265           if (SVOp->getMask()[i] >= NumElements)
11266             NumV2OddIndices += i % 2;
11267           else if (SVOp->getMask()[i] >= 0)
11268             NumV1OddIndices += i % 2;
11269         if (NumV2OddIndices < NumV1OddIndices)
11270           return DAG.getCommutedVectorShuffle(*SVOp);
11271       }
11272     }
11273   }
11274
11275   // For each vector width, delegate to a specialized lowering routine.
11276   if (VT.getSizeInBits() == 128)
11277     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11278
11279   if (VT.getSizeInBits() == 256)
11280     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11281
11282   // Force AVX-512 vectors to be scalarized for now.
11283   // FIXME: Implement AVX-512 support!
11284   if (VT.getSizeInBits() == 512)
11285     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11286
11287   llvm_unreachable("Unimplemented!");
11288 }
11289
11290
11291 //===----------------------------------------------------------------------===//
11292 // Legacy vector shuffle lowering
11293 //
11294 // This code is the legacy code handling vector shuffles until the above
11295 // replaces its functionality and performance.
11296 //===----------------------------------------------------------------------===//
11297
11298 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11299                         bool hasInt256, unsigned *MaskOut = nullptr) {
11300   MVT EltVT = VT.getVectorElementType();
11301
11302   // There is no blend with immediate in AVX-512.
11303   if (VT.is512BitVector())
11304     return false;
11305
11306   if (!hasSSE41 || EltVT == MVT::i8)
11307     return false;
11308   if (!hasInt256 && VT == MVT::v16i16)
11309     return false;
11310
11311   unsigned MaskValue = 0;
11312   unsigned NumElems = VT.getVectorNumElements();
11313   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11314   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11315   unsigned NumElemsInLane = NumElems / NumLanes;
11316
11317   // Blend for v16i16 should be symetric for the both lanes.
11318   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11319
11320     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11321     int EltIdx = MaskVals[i];
11322
11323     if ((EltIdx < 0 || EltIdx == (int)i) &&
11324         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11325       continue;
11326
11327     if (((unsigned)EltIdx == (i + NumElems)) &&
11328         (SndLaneEltIdx < 0 ||
11329          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11330       MaskValue |= (1 << i);
11331     else
11332       return false;
11333   }
11334
11335   if (MaskOut)
11336     *MaskOut = MaskValue;
11337   return true;
11338 }
11339
11340 // Try to lower a shuffle node into a simple blend instruction.
11341 // This function assumes isBlendMask returns true for this
11342 // SuffleVectorSDNode
11343 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11344                                           unsigned MaskValue,
11345                                           const X86Subtarget *Subtarget,
11346                                           SelectionDAG &DAG) {
11347   MVT VT = SVOp->getSimpleValueType(0);
11348   MVT EltVT = VT.getVectorElementType();
11349   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11350                      Subtarget->hasInt256() && "Trying to lower a "
11351                                                "VECTOR_SHUFFLE to a Blend but "
11352                                                "with the wrong mask"));
11353   SDValue V1 = SVOp->getOperand(0);
11354   SDValue V2 = SVOp->getOperand(1);
11355   SDLoc dl(SVOp);
11356   unsigned NumElems = VT.getVectorNumElements();
11357
11358   // Convert i32 vectors to floating point if it is not AVX2.
11359   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11360   MVT BlendVT = VT;
11361   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11362     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11363                                NumElems);
11364     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11365     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11366   }
11367
11368   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11369                             DAG.getConstant(MaskValue, MVT::i32));
11370   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11371 }
11372
11373 /// In vector type \p VT, return true if the element at index \p InputIdx
11374 /// falls on a different 128-bit lane than \p OutputIdx.
11375 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11376                                      unsigned OutputIdx) {
11377   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11378   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11379 }
11380
11381 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11382 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11383 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11384 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11385 /// zero.
11386 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11387                          SelectionDAG &DAG) {
11388   MVT VT = V1.getSimpleValueType();
11389   assert(VT.is128BitVector() || VT.is256BitVector());
11390
11391   MVT EltVT = VT.getVectorElementType();
11392   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11393   unsigned NumElts = VT.getVectorNumElements();
11394
11395   SmallVector<SDValue, 32> PshufbMask;
11396   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11397     int InputIdx = MaskVals[OutputIdx];
11398     unsigned InputByteIdx;
11399
11400     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11401       InputByteIdx = 0x80;
11402     else {
11403       // Cross lane is not allowed.
11404       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11405         return SDValue();
11406       InputByteIdx = InputIdx * EltSizeInBytes;
11407       // Index is an byte offset within the 128-bit lane.
11408       InputByteIdx &= 0xf;
11409     }
11410
11411     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11412       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11413       if (InputByteIdx != 0x80)
11414         ++InputByteIdx;
11415     }
11416   }
11417
11418   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11419   if (ShufVT != VT)
11420     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11421   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11422                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11423 }
11424
11425 // v8i16 shuffles - Prefer shuffles in the following order:
11426 // 1. [all]   pshuflw, pshufhw, optional move
11427 // 2. [ssse3] 1 x pshufb
11428 // 3. [ssse3] 2 x pshufb + 1 x por
11429 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11430 static SDValue
11431 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11432                          SelectionDAG &DAG) {
11433   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11434   SDValue V1 = SVOp->getOperand(0);
11435   SDValue V2 = SVOp->getOperand(1);
11436   SDLoc dl(SVOp);
11437   SmallVector<int, 8> MaskVals;
11438
11439   // Determine if more than 1 of the words in each of the low and high quadwords
11440   // of the result come from the same quadword of one of the two inputs.  Undef
11441   // mask values count as coming from any quadword, for better codegen.
11442   //
11443   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11444   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11445   unsigned LoQuad[] = { 0, 0, 0, 0 };
11446   unsigned HiQuad[] = { 0, 0, 0, 0 };
11447   // Indices of quads used.
11448   std::bitset<4> InputQuads;
11449   for (unsigned i = 0; i < 8; ++i) {
11450     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11451     int EltIdx = SVOp->getMaskElt(i);
11452     MaskVals.push_back(EltIdx);
11453     if (EltIdx < 0) {
11454       ++Quad[0];
11455       ++Quad[1];
11456       ++Quad[2];
11457       ++Quad[3];
11458       continue;
11459     }
11460     ++Quad[EltIdx / 4];
11461     InputQuads.set(EltIdx / 4);
11462   }
11463
11464   int BestLoQuad = -1;
11465   unsigned MaxQuad = 1;
11466   for (unsigned i = 0; i < 4; ++i) {
11467     if (LoQuad[i] > MaxQuad) {
11468       BestLoQuad = i;
11469       MaxQuad = LoQuad[i];
11470     }
11471   }
11472
11473   int BestHiQuad = -1;
11474   MaxQuad = 1;
11475   for (unsigned i = 0; i < 4; ++i) {
11476     if (HiQuad[i] > MaxQuad) {
11477       BestHiQuad = i;
11478       MaxQuad = HiQuad[i];
11479     }
11480   }
11481
11482   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11483   // of the two input vectors, shuffle them into one input vector so only a
11484   // single pshufb instruction is necessary. If there are more than 2 input
11485   // quads, disable the next transformation since it does not help SSSE3.
11486   bool V1Used = InputQuads[0] || InputQuads[1];
11487   bool V2Used = InputQuads[2] || InputQuads[3];
11488   if (Subtarget->hasSSSE3()) {
11489     if (InputQuads.count() == 2 && V1Used && V2Used) {
11490       BestLoQuad = InputQuads[0] ? 0 : 1;
11491       BestHiQuad = InputQuads[2] ? 2 : 3;
11492     }
11493     if (InputQuads.count() > 2) {
11494       BestLoQuad = -1;
11495       BestHiQuad = -1;
11496     }
11497   }
11498
11499   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11500   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11501   // words from all 4 input quadwords.
11502   SDValue NewV;
11503   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11504     int MaskV[] = {
11505       BestLoQuad < 0 ? 0 : BestLoQuad,
11506       BestHiQuad < 0 ? 1 : BestHiQuad
11507     };
11508     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11509                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11510                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11511     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11512
11513     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11514     // source words for the shuffle, to aid later transformations.
11515     bool AllWordsInNewV = true;
11516     bool InOrder[2] = { true, true };
11517     for (unsigned i = 0; i != 8; ++i) {
11518       int idx = MaskVals[i];
11519       if (idx != (int)i)
11520         InOrder[i/4] = false;
11521       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11522         continue;
11523       AllWordsInNewV = false;
11524       break;
11525     }
11526
11527     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11528     if (AllWordsInNewV) {
11529       for (int i = 0; i != 8; ++i) {
11530         int idx = MaskVals[i];
11531         if (idx < 0)
11532           continue;
11533         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11534         if ((idx != i) && idx < 4)
11535           pshufhw = false;
11536         if ((idx != i) && idx > 3)
11537           pshuflw = false;
11538       }
11539       V1 = NewV;
11540       V2Used = false;
11541       BestLoQuad = 0;
11542       BestHiQuad = 1;
11543     }
11544
11545     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11546     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11547     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11548       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11549       unsigned TargetMask = 0;
11550       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11551                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11552       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11553       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11554                              getShufflePSHUFLWImmediate(SVOp);
11555       V1 = NewV.getOperand(0);
11556       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11557     }
11558   }
11559
11560   // Promote splats to a larger type which usually leads to more efficient code.
11561   // FIXME: Is this true if pshufb is available?
11562   if (SVOp->isSplat())
11563     return PromoteSplat(SVOp, DAG);
11564
11565   // If we have SSSE3, and all words of the result are from 1 input vector,
11566   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11567   // is present, fall back to case 4.
11568   if (Subtarget->hasSSSE3()) {
11569     SmallVector<SDValue,16> pshufbMask;
11570
11571     // If we have elements from both input vectors, set the high bit of the
11572     // shuffle mask element to zero out elements that come from V2 in the V1
11573     // mask, and elements that come from V1 in the V2 mask, so that the two
11574     // results can be OR'd together.
11575     bool TwoInputs = V1Used && V2Used;
11576     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11577     if (!TwoInputs)
11578       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11579
11580     // Calculate the shuffle mask for the second input, shuffle it, and
11581     // OR it with the first shuffled input.
11582     CommuteVectorShuffleMask(MaskVals, 8);
11583     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11584     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11585     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11586   }
11587
11588   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11589   // and update MaskVals with new element order.
11590   std::bitset<8> InOrder;
11591   if (BestLoQuad >= 0) {
11592     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11593     for (int i = 0; i != 4; ++i) {
11594       int idx = MaskVals[i];
11595       if (idx < 0) {
11596         InOrder.set(i);
11597       } else if ((idx / 4) == BestLoQuad) {
11598         MaskV[i] = idx & 3;
11599         InOrder.set(i);
11600       }
11601     }
11602     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11603                                 &MaskV[0]);
11604
11605     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11606       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11607       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11608                                   NewV.getOperand(0),
11609                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11610     }
11611   }
11612
11613   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11614   // and update MaskVals with the new element order.
11615   if (BestHiQuad >= 0) {
11616     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11617     for (unsigned i = 4; i != 8; ++i) {
11618       int idx = MaskVals[i];
11619       if (idx < 0) {
11620         InOrder.set(i);
11621       } else if ((idx / 4) == BestHiQuad) {
11622         MaskV[i] = (idx & 3) + 4;
11623         InOrder.set(i);
11624       }
11625     }
11626     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11627                                 &MaskV[0]);
11628
11629     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11630       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11631       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11632                                   NewV.getOperand(0),
11633                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11634     }
11635   }
11636
11637   // In case BestHi & BestLo were both -1, which means each quadword has a word
11638   // from each of the four input quadwords, calculate the InOrder bitvector now
11639   // before falling through to the insert/extract cleanup.
11640   if (BestLoQuad == -1 && BestHiQuad == -1) {
11641     NewV = V1;
11642     for (int i = 0; i != 8; ++i)
11643       if (MaskVals[i] < 0 || MaskVals[i] == i)
11644         InOrder.set(i);
11645   }
11646
11647   // The other elements are put in the right place using pextrw and pinsrw.
11648   for (unsigned i = 0; i != 8; ++i) {
11649     if (InOrder[i])
11650       continue;
11651     int EltIdx = MaskVals[i];
11652     if (EltIdx < 0)
11653       continue;
11654     SDValue ExtOp = (EltIdx < 8) ?
11655       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11656                   DAG.getIntPtrConstant(EltIdx)) :
11657       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11658                   DAG.getIntPtrConstant(EltIdx - 8));
11659     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11660                        DAG.getIntPtrConstant(i));
11661   }
11662   return NewV;
11663 }
11664
11665 /// \brief v16i16 shuffles
11666 ///
11667 /// FIXME: We only support generation of a single pshufb currently.  We can
11668 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11669 /// well (e.g 2 x pshufb + 1 x por).
11670 static SDValue
11671 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11672   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11673   SDValue V1 = SVOp->getOperand(0);
11674   SDValue V2 = SVOp->getOperand(1);
11675   SDLoc dl(SVOp);
11676
11677   if (V2.getOpcode() != ISD::UNDEF)
11678     return SDValue();
11679
11680   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11681   return getPSHUFB(MaskVals, V1, dl, DAG);
11682 }
11683
11684 // v16i8 shuffles - Prefer shuffles in the following order:
11685 // 1. [ssse3] 1 x pshufb
11686 // 2. [ssse3] 2 x pshufb + 1 x por
11687 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11688 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11689                                         const X86Subtarget* Subtarget,
11690                                         SelectionDAG &DAG) {
11691   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11692   SDValue V1 = SVOp->getOperand(0);
11693   SDValue V2 = SVOp->getOperand(1);
11694   SDLoc dl(SVOp);
11695   ArrayRef<int> MaskVals = SVOp->getMask();
11696
11697   // Promote splats to a larger type which usually leads to more efficient code.
11698   // FIXME: Is this true if pshufb is available?
11699   if (SVOp->isSplat())
11700     return PromoteSplat(SVOp, DAG);
11701
11702   // If we have SSSE3, case 1 is generated when all result bytes come from
11703   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11704   // present, fall back to case 3.
11705
11706   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11707   if (Subtarget->hasSSSE3()) {
11708     SmallVector<SDValue,16> pshufbMask;
11709
11710     // If all result elements are from one input vector, then only translate
11711     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11712     //
11713     // Otherwise, we have elements from both input vectors, and must zero out
11714     // elements that come from V2 in the first mask, and V1 in the second mask
11715     // so that we can OR them together.
11716     for (unsigned i = 0; i != 16; ++i) {
11717       int EltIdx = MaskVals[i];
11718       if (EltIdx < 0 || EltIdx >= 16)
11719         EltIdx = 0x80;
11720       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11721     }
11722     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11723                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11724                                  MVT::v16i8, pshufbMask));
11725
11726     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11727     // the 2nd operand if it's undefined or zero.
11728     if (V2.getOpcode() == ISD::UNDEF ||
11729         ISD::isBuildVectorAllZeros(V2.getNode()))
11730       return V1;
11731
11732     // Calculate the shuffle mask for the second input, shuffle it, and
11733     // OR it with the first shuffled input.
11734     pshufbMask.clear();
11735     for (unsigned i = 0; i != 16; ++i) {
11736       int EltIdx = MaskVals[i];
11737       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11738       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11739     }
11740     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11741                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11742                                  MVT::v16i8, pshufbMask));
11743     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11744   }
11745
11746   // No SSSE3 - Calculate in place words and then fix all out of place words
11747   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11748   // the 16 different words that comprise the two doublequadword input vectors.
11749   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11750   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11751   SDValue NewV = V1;
11752   for (int i = 0; i != 8; ++i) {
11753     int Elt0 = MaskVals[i*2];
11754     int Elt1 = MaskVals[i*2+1];
11755
11756     // This word of the result is all undef, skip it.
11757     if (Elt0 < 0 && Elt1 < 0)
11758       continue;
11759
11760     // This word of the result is already in the correct place, skip it.
11761     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11762       continue;
11763
11764     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11765     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11766     SDValue InsElt;
11767
11768     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11769     // using a single extract together, load it and store it.
11770     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11771       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11772                            DAG.getIntPtrConstant(Elt1 / 2));
11773       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11774                         DAG.getIntPtrConstant(i));
11775       continue;
11776     }
11777
11778     // If Elt1 is defined, extract it from the appropriate source.  If the
11779     // source byte is not also odd, shift the extracted word left 8 bits
11780     // otherwise clear the bottom 8 bits if we need to do an or.
11781     if (Elt1 >= 0) {
11782       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11783                            DAG.getIntPtrConstant(Elt1 / 2));
11784       if ((Elt1 & 1) == 0)
11785         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11786                              DAG.getConstant(8,
11787                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11788       else if (Elt0 >= 0)
11789         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11790                              DAG.getConstant(0xFF00, MVT::i16));
11791     }
11792     // If Elt0 is defined, extract it from the appropriate source.  If the
11793     // source byte is not also even, shift the extracted word right 8 bits. If
11794     // Elt1 was also defined, OR the extracted values together before
11795     // inserting them in the result.
11796     if (Elt0 >= 0) {
11797       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11798                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11799       if ((Elt0 & 1) != 0)
11800         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11801                               DAG.getConstant(8,
11802                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11803       else if (Elt1 >= 0)
11804         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11805                              DAG.getConstant(0x00FF, MVT::i16));
11806       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11807                          : InsElt0;
11808     }
11809     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11810                        DAG.getIntPtrConstant(i));
11811   }
11812   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11813 }
11814
11815 // v32i8 shuffles - Translate to VPSHUFB if possible.
11816 static
11817 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11818                                  const X86Subtarget *Subtarget,
11819                                  SelectionDAG &DAG) {
11820   MVT VT = SVOp->getSimpleValueType(0);
11821   SDValue V1 = SVOp->getOperand(0);
11822   SDValue V2 = SVOp->getOperand(1);
11823   SDLoc dl(SVOp);
11824   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11825
11826   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11827   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11828   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11829
11830   // VPSHUFB may be generated if
11831   // (1) one of input vector is undefined or zeroinitializer.
11832   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11833   // And (2) the mask indexes don't cross the 128-bit lane.
11834   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11835       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11836     return SDValue();
11837
11838   if (V1IsAllZero && !V2IsAllZero) {
11839     CommuteVectorShuffleMask(MaskVals, 32);
11840     V1 = V2;
11841   }
11842   return getPSHUFB(MaskVals, V1, dl, DAG);
11843 }
11844
11845 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11846 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11847 /// done when every pair / quad of shuffle mask elements point to elements in
11848 /// the right sequence. e.g.
11849 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11850 static
11851 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11852                                  SelectionDAG &DAG) {
11853   MVT VT = SVOp->getSimpleValueType(0);
11854   SDLoc dl(SVOp);
11855   unsigned NumElems = VT.getVectorNumElements();
11856   MVT NewVT;
11857   unsigned Scale;
11858   switch (VT.SimpleTy) {
11859   default: llvm_unreachable("Unexpected!");
11860   case MVT::v2i64:
11861   case MVT::v2f64:
11862            return SDValue(SVOp, 0);
11863   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11864   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11865   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11866   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11867   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11868   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11869   }
11870
11871   SmallVector<int, 8> MaskVec;
11872   for (unsigned i = 0; i != NumElems; i += Scale) {
11873     int StartIdx = -1;
11874     for (unsigned j = 0; j != Scale; ++j) {
11875       int EltIdx = SVOp->getMaskElt(i+j);
11876       if (EltIdx < 0)
11877         continue;
11878       if (StartIdx < 0)
11879         StartIdx = (EltIdx / Scale);
11880       if (EltIdx != (int)(StartIdx*Scale + j))
11881         return SDValue();
11882     }
11883     MaskVec.push_back(StartIdx);
11884   }
11885
11886   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11887   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11888   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11889 }
11890
11891 /// getVZextMovL - Return a zero-extending vector move low node.
11892 ///
11893 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11894                             SDValue SrcOp, SelectionDAG &DAG,
11895                             const X86Subtarget *Subtarget, SDLoc dl) {
11896   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11897     LoadSDNode *LD = nullptr;
11898     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11899       LD = dyn_cast<LoadSDNode>(SrcOp);
11900     if (!LD) {
11901       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11902       // instead.
11903       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11904       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11905           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11906           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11907           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11908         // PR2108
11909         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11910         return DAG.getNode(ISD::BITCAST, dl, VT,
11911                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11912                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11913                                                    OpVT,
11914                                                    SrcOp.getOperand(0)
11915                                                           .getOperand(0))));
11916       }
11917     }
11918   }
11919
11920   return DAG.getNode(ISD::BITCAST, dl, VT,
11921                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11922                                  DAG.getNode(ISD::BITCAST, dl,
11923                                              OpVT, SrcOp)));
11924 }
11925
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
/// which could not be matched by any known target specific shuffle.
///
/// After first giving Compact8x32ShuffleNode a chance, the shuffle is split
/// into two half-width results (one per half of the output). Each half is
/// built either as a shuffle of at most two half-width pieces extracted from
/// the operands, or, when more than two pieces would be needed, as a
/// BUILD_VECTOR of individually extracted elements. The two halves are then
/// joined with CONCAT_VECTORS.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  // Number of elements in one half of the vector.
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  // Type of one half of the result.
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  // Scratch mask for the half currently being built; cleared after each half.
  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element.  This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // the mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into. Inputs are counted
      // in half-width units here: Input / 2 selects the shuffle operand and
      // Input % 2 selects which half of it (see the Extract128BitVector
      // calls below).
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      // Mask may be only partially filled at this point; it is not used on
      // this path and is cleared at the end of the iteration.
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into. Unlike above,
        // this is a full-width operand number, used directly with
        // SVOp->getOperand().
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }

      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
    } else if (InputUsed[0] < 0) {
      // No input vectors were used! The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      // InputUsed[k] / 2 is the shuffle operand, (InputUsed[k] % 2) *
      // NumLaneElems is the first element of the half taken from it.
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used. Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    // Start the next half with an empty mask.
    Mask.clear();
  }

  // Concatenate the result back
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}
12038
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
///
/// The shuffle is rewritten as a short sequence of generic 4-element
/// shuffles, chosen by how many defined mask elements come from the first
/// operand (NumLo, mask index < 4) and from the second (NumHi, index >= 4):
///  * at most two from each: a gather shuffle followed by a reorder shuffle;
///  * exactly three from one: two shuffles via an intermediate vector;
///  * otherwise: separate shuffles for the low and high result halves,
///    combined by a third shuffle.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);

  assert(VT.is128BitVector() && "Unsupported vector size");

  // Locs[i] records where result element i was placed by the first-stage
  // shuffle(s): (group, position within group), or (-1, -1) if undef.
  std::pair<int, int> Locs[4];
  // First-stage gather mask: V1 elements are packed from slot 0 upwards,
  // V2 elements from slot 2 upwards.
  int Mask1[] = { -1, -1, -1, -1 };
  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        // Only the first two V2 elements fit in slots 2 and 3; further ones
        // are just counted so the case analysis below sees the real NumHi.
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector. This can be
    // implemented with two shuffles. First shuffle gather the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, put the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    int Mask2[] = { -1, -1, -1, -1 };

    for (unsigned i = 0; i != 4; ++i)
      if (Locs[i].first != -1) {
        // Result elements 0-1 index the first operand of the second shuffle
        // (base 0), elements 2-3 index the second operand (base 4); both
        // operands are the gathered vector. Within it, group 0 starts at
        // slot 0 and group 1 at slot 2.
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  }

  if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y.  First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes don't
    // matter). Then, use a shufps to build the final vector, taking the half
    // containing the element from Y from the intermediate, and the other half
    // from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, 4);
      std::swap(V1, V2);
    }

    // Find the element from V2. The scan covers positions 0-2 only; if none
    // of them selects V2, HiIndex ends up as 3.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    // Intermediate vector: slot 0 holds the V2 element, slot 2 holds the
    // element that shares its result half (position HiIndex^1).
    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      // The V2 element belongs in the high half: the low half comes straight
      // from V1, the high half from slots 0 and 2 of the intermediate
      // (indices 4 and 6 as the second shuffle operand), ordered by the
      // parity of HiIndex.
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    }

    // The V2 element belongs in the low half: the intermediate is now the
    // first shuffle operand, so V1's elements are addressed with +4.
    Mask1[0] = HiIndex & 1 ? 2 : 0;
    Mask1[1] = HiIndex & 1 ? 0 : 2;
    Mask1[2] = PermMask[2];
    Mask1[3] = PermMask[3];
    if (Mask1[2] >= 0)
      Mask1[2] += 4;
    if (Mask1[3] >= 0)
      Mask1[3] += 4;
    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  // LoMask gathers the sources of result elements 0-1, HiMask those of
  // result elements 2-3; in each, V1 elements go to slots 0.. and V2
  // elements to slots 2...
  int LoMask[] = { -1, -1, -1, -1 };
  int HiMask[] = { -1, -1, -1, -1 };

  int *MaskPtr = LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      // Switch to filling the mask for the high half of the result.
      MaskPtr = HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      MaskPtr[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      MaskPtr[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  // Final mask: Locs[i].first picks LoShuffle (0) or HiShuffle (1) as the
  // operand, Locs[i].second is the slot within it.
  int MaskOps[] = { -1, -1, -1, -1 };
  for (unsigned i = 0; i != 4; ++i)
    if (Locs[i].first != -1)
      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}
12180
12181 static bool MayFoldVectorLoad(SDValue V) {
12182   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12183     V = V.getOperand(0);
12184
12185   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12186     V = V.getOperand(0);
12187   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12188       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12189     // BUILD_VECTOR (load), undef
12190     V = V.getOperand(0);
12191
12192   return MayFoldLoad(V);
12193 }
12194
12195 static
12196 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12197   MVT VT = Op.getSimpleValueType();
12198
12199   // Canonizalize to v2f64.
12200   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12201   return DAG.getNode(ISD::BITCAST, dl, VT,
12202                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12203                                           V1, DAG));
12204 }
12205
12206 static
12207 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12208                         bool HasSSE2) {
12209   SDValue V1 = Op.getOperand(0);
12210   SDValue V2 = Op.getOperand(1);
12211   MVT VT = Op.getSimpleValueType();
12212
12213   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12214
12215   if (HasSSE2 && VT == MVT::v2f64)
12216     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12217
12218   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
12219   return DAG.getNode(ISD::BITCAST, dl, VT,
12220                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12221                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12222                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12223 }
12224
12225 static
12226 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12227   SDValue V1 = Op.getOperand(0);
12228   SDValue V2 = Op.getOperand(1);
12229   MVT VT = Op.getSimpleValueType();
12230
12231   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12232          "unsupported shuffle type");
12233
12234   if (V2.getOpcode() == ISD::UNDEF)
12235     V2 = V1;
12236
12237   // v4i32 or v4f32
12238   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12239 }
12240
12241 static
12242 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12243   SDValue V1 = Op.getOperand(0);
12244   SDValue V2 = Op.getOperand(1);
12245   MVT VT = Op.getSimpleValueType();
12246   unsigned NumElems = VT.getVectorNumElements();
12247
12248   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12249   // operand of these instructions is only memory, so check if there's a
12250   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12251   // same masks.
12252   bool CanFoldLoad = false;
12253
12254   // Trivial case, when V2 comes from a load.
12255   if (MayFoldVectorLoad(V2))
12256     CanFoldLoad = true;
12257
12258   // When V1 is a load, it can be folded later into a store in isel, example:
12259   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12260   //    turns into:
12261   //  (MOVLPSmr addr:$src1, VR128:$src2)
12262   // So, recognize this potential and also use MOVLPS or MOVLPD
12263   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12264     CanFoldLoad = true;
12265
12266   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12267   if (CanFoldLoad) {
12268     if (HasSSE2 && NumElems == 2)
12269       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12270
12271     if (NumElems == 4)
12272       // If we don't care about the second element, proceed to use movss.
12273       if (SVOp->getMaskElt(1) != -1)
12274         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12275   }
12276
12277   // movl and movlp will both match v2i64, but v2i64 is never matched by
12278   // movl earlier because we make it strict to avoid messing with the movlp load
12279   // folding logic (see the code above getMOVLP call). Match it here then,
12280   // this is horrible, but will stay like this until we move all shuffle
12281   // matching to x86 specific nodes. Note that for the 1st condition all
12282   // types are matched with movsd.
12283   if (HasSSE2) {
12284     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12285     // as to remove this logic from here, as much as possible
12286     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12287       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12288     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12289   }
12290
12291   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12292
12293   // Invert the operand order and use SHUFPS to match it.
12294   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12295                               getShuffleSHUFImmediate(SVOp), DAG);
12296 }
12297
12298 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12299                                          SelectionDAG &DAG) {
12300   SDLoc dl(Load);
12301   MVT VT = Load->getSimpleValueType(0);
12302   MVT EVT = VT.getVectorElementType();
12303   SDValue Addr = Load->getOperand(1);
12304   SDValue NewAddr = DAG.getNode(
12305       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12306       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12307
12308   SDValue NewLoad =
12309       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12310                   DAG.getMachineFunction().getMachineMemOperand(
12311                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12312   return NewLoad;
12313 }
12314
12315 // It is only safe to call this function if isINSERTPSMask is true for
12316 // this shufflevector mask.
12317 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12318                            SelectionDAG &DAG) {
12319   // Generate an insertps instruction when inserting an f32 from memory onto a
12320   // v4f32 or when copying a member from one v4f32 to another.
12321   // We also use it for transferring i32 from one register to another,
12322   // since it simply copies the same bits.
12323   // If we're transferring an i32 from memory to a specific element in a
12324   // register, we output a generic DAG that will match the PINSRD
12325   // instruction.
12326   MVT VT = SVOp->getSimpleValueType(0);
12327   MVT EVT = VT.getVectorElementType();
12328   SDValue V1 = SVOp->getOperand(0);
12329   SDValue V2 = SVOp->getOperand(1);
12330   auto Mask = SVOp->getMask();
12331   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12332          "unsupported vector type for insertps/pinsrd");
12333
12334   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12335   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12336   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12337
12338   SDValue From;
12339   SDValue To;
12340   unsigned DestIndex;
12341   if (FromV1 == 1) {
12342     From = V1;
12343     To = V2;
12344     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12345                 Mask.begin();
12346
12347     // If we have 1 element from each vector, we have to check if we're
12348     // changing V1's element's place. If so, we're done. Otherwise, we
12349     // should assume we're changing V2's element's place and behave
12350     // accordingly.
12351     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12352     assert(DestIndex <= INT32_MAX && "truncated destination index");
12353     if (FromV1 == FromV2 &&
12354         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12355       From = V2;
12356       To = V1;
12357       DestIndex =
12358           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12359     }
12360   } else {
12361     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12362            "More than one element from V1 and from V2, or no elements from one "
12363            "of the vectors. This case should not have returned true from "
12364            "isINSERTPSMask");
12365     From = V2;
12366     To = V1;
12367     DestIndex =
12368         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12369   }
12370
12371   // Get an index into the source vector in the range [0,4) (the mask is
12372   // in the range [0,8) because it can address V1 and V2)
12373   unsigned SrcIndex = Mask[DestIndex] % 4;
12374   if (MayFoldLoad(From)) {
12375     // Trivial case, when From comes from a load and is only used by the
12376     // shuffle. Make it use insertps from the vector that we need from that
12377     // load.
12378     SDValue NewLoad =
12379         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12380     if (!NewLoad.getNode())
12381       return SDValue();
12382
12383     if (EVT == MVT::f32) {
12384       // Create this as a scalar to vector to match the instruction pattern.
12385       SDValue LoadScalarToVector =
12386           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12387       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12388       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12389                          InsertpsMask);
12390     } else { // EVT == MVT::i32
12391       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12392       // instruction, to match the PINSRD instruction, which loads an i32 to a
12393       // certain vector element.
12394       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12395                          DAG.getConstant(DestIndex, MVT::i32));
12396     }
12397   }
12398
12399   // Vector-element-to-vector
12400   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12401   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12402 }
12403
12404 // Reduce a vector shuffle to zext.
12405 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12406                                     SelectionDAG &DAG) {
12407   // PMOVZX is only available from SSE41.
12408   if (!Subtarget->hasSSE41())
12409     return SDValue();
12410
12411   MVT VT = Op.getSimpleValueType();
12412
12413   // Only AVX2 support 256-bit vector integer extending.
12414   if (!Subtarget->hasInt256() && VT.is256BitVector())
12415     return SDValue();
12416
12417   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12418   SDLoc DL(Op);
12419   SDValue V1 = Op.getOperand(0);
12420   SDValue V2 = Op.getOperand(1);
12421   unsigned NumElems = VT.getVectorNumElements();
12422
12423   // Extending is an unary operation and the element type of the source vector
12424   // won't be equal to or larger than i64.
12425   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12426       VT.getVectorElementType() == MVT::i64)
12427     return SDValue();
12428
12429   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12430   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12431   while ((1U << Shift) < NumElems) {
12432     if (SVOp->getMaskElt(1U << Shift) == 1)
12433       break;
12434     Shift += 1;
12435     // The maximal ratio is 8, i.e. from i8 to i64.
12436     if (Shift > 3)
12437       return SDValue();
12438   }
12439
12440   // Check the shuffle mask.
12441   unsigned Mask = (1U << Shift) - 1;
12442   for (unsigned i = 0; i != NumElems; ++i) {
12443     int EltIdx = SVOp->getMaskElt(i);
12444     if ((i & Mask) != 0 && EltIdx != -1)
12445       return SDValue();
12446     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12447       return SDValue();
12448   }
12449
12450   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12451   MVT NeVT = MVT::getIntegerVT(NBits);
12452   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12453
12454   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12455     return SDValue();
12456
12457   return DAG.getNode(ISD::BITCAST, DL, VT,
12458                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12459 }
12460
12461 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12462                                       SelectionDAG &DAG) {
12463   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12464   MVT VT = Op.getSimpleValueType();
12465   SDLoc dl(Op);
12466   SDValue V1 = Op.getOperand(0);
12467   SDValue V2 = Op.getOperand(1);
12468
12469   if (isZeroShuffle(SVOp))
12470     return getZeroVector(VT, Subtarget, DAG, dl);
12471
12472   // Handle splat operations
12473   if (SVOp->isSplat()) {
12474     // Use vbroadcast whenever the splat comes from a foldable load
12475     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12476     if (Broadcast.getNode())
12477       return Broadcast;
12478   }
12479
12480   // Check integer expanding shuffles.
12481   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12482   if (NewOp.getNode())
12483     return NewOp;
12484
12485   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12486   // do it!
12487   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12488       VT == MVT::v32i8) {
12489     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12490     if (NewOp.getNode())
12491       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12492   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12493     // FIXME: Figure out a cleaner way to do this.
12494     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12495       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12496       if (NewOp.getNode()) {
12497         MVT NewVT = NewOp.getSimpleValueType();
12498         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12499                                NewVT, true, false))
12500           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12501                               dl);
12502       }
12503     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12504       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12505       if (NewOp.getNode()) {
12506         MVT NewVT = NewOp.getSimpleValueType();
12507         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12508           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12509                               dl);
12510       }
12511     }
12512   }
12513   return SDValue();
12514 }
12515
/// Custom lowering entry point for ISD::VECTOR_SHUFFLE nodes.
///
/// When ExperimentalVectorShuffleLowering is set, the whole job is delegated
/// to lowerVectorShuffle. Otherwise the shuffle mask is run through a fixed
/// sequence of X86 mask predicates and the first one that matches decides
/// which target node (or rewritten generic shuffle) is returned. The order of
/// the tests is significant: as the comments below point out, several
/// predicates can accept the same mask.
///
/// \param Op   the shuffle node; operands 0 and 1 are the two input vectors.
/// \param DAG  the DAG in which replacement nodes are created.
/// \returns the lowered value, or an empty SDValue if nothing below matched.
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  unsigned NumElems = VT.getVectorNumElements();
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasFp256    = Subtarget->hasFp256();
  bool HasInt256   = Subtarget->hasInt256();
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  // Check if we should use the experimental vector shuffle lowering. If so,
  // delegate completely to that code path.
  if (ExperimentalVectorShuffleLowering)
    return lowerVectorShuffle(Op, Subtarget, DAG);

  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");

  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Vector shuffle lowering takes 3 steps:
  //
  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
  //    narrowing and commutation of operands should be handled.
  // 2) Matching of shuffles with known shuffle masks to x86 target specific
  //    shuffle nodes.
  // 3) Rewriting of unmatched masks into new generic shuffle operations,
  //    so the shuffle can be broken into other shuffles and the legalizer can
  //    try the lowering again.
  //
  // The general idea is that no vector_shuffle operation should be left to
  // be matched during isel, all of them must be converted to a target specific
  // node here.

  // Normalize the input vectors. Here splats, zeroed vectors, profitable
  // narrowing and commutation of operands should be handled. The actual code
  // doesn't include all of those, work in progress...
  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
  if (NewOp.getNode())
    return NewOp;

  // Local, mutable copy of the mask. It may be commuted further down, while
  // SVOp itself is left untouched.
  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());

  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
  // unpckh_undef). Only use pshufd if speed is more important than size.
  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
      V2IsUndef && MayFoldVectorLoad(V1))
    return getMOVDDup(Op, dl, V1, DAG);

  if (isMOVHLPS_v_undef_Mask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  // Use to match splats
  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
      (VT == MVT::v2f64 || VT == MVT::v2i64))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  if (isPSHUFDMask(M, VT)) {
    // The actual implementation will match the mask in the if above and then
    // during isel it can match several different instructions, not only pshufd
    // as its name says, sad but true, emulate the behavior for now...
    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);

    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);

    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);

    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
      return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
                                  DAG);

    // Fallback: SHUFP with V1 supplied as both source operands.
    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
                                TargetMask, DAG);
  }

  if (isPALIGNRMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
                                getShufflePALIGNRImmediate(SVOp),
                                DAG);

  if (isVALIGNMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
                                getShuffleVALIGNImmediate(SVOp),
                                DAG);

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EltVT = VT.getVectorElementType();
    // Scale ShAmt by the element width in bits before passing it to getVShift.
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (isMOVLMask(M, VT)) {
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMOVLPMask(M, VT)) {
      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);

      if (VT == MVT::v4i32 || VT == MVT::v4f32)
        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  }

  // FIXME: fold these into legal mask.
  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);

  if (isMOVHLPSMask(M, VT))
    return getMOVHighToLow(Op, dl, DAG);

  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);

  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);

  if (isMOVLPMask(M, VT))
    return getMOVLP(Op, dl, DAG, HasSSE2);

  if (ShouldXformToMOVHLPS(M, VT) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Reached with isShift set only when the earlier shift attempt was skipped
  // because ShVal has more than one use.
  if (isShift) {
    // No better options. Use a vshldq / vsrldq.
    MVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  // An operand only counts as a splat here if it is a BUILD_VECTOR with a
  // constant splat node and no undef elements.
  BitVector UndefElements;
  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
      V1IsSplat = true;
  if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
    if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
      V2IsSplat = true;

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);
    Commuted = true;
  }

  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (isUNPCKLMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

  if (isUNPCKHMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SmallVector<int, 8> NewMask(M.begin(), M.end());
    NormalizeMask(NewMask, NumElems);
    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  if (Commuted) {
    // Commute is back and try unpck* again.
    // FIXME: this seems wrong.
    CommuteVectorShuffleMask(M, NumElems);
    std::swap(V1, V2);
    std::swap(V1IsSplat, V2IsSplat);

    if (isUNPCKLMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);

    if (isUNPCKHMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
    return DAG.getCommutedVectorShuffle(*SVOp);

  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove one by one until they don't return Op anymore.

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
      SVOp->getSplatIndex() == 0 && V2IsUndef) {
    if (VT == MVT::v2f64 || VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  }

  if (isPSHUFHWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                getShufflePSHUFHWImmediate(SVOp),
                                DAG);

  if (isPSHUFLWMask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                getShufflePSHUFLWImmediate(SVOp),
                                DAG);

  // MaskValue is filled in by isBlendMask and only read when it returns true.
  unsigned MaskValue;
  if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
                  &MaskValue))
    return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);

  if (isSHUFPMask(M, VT))
    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                getShuffleSHUFImmediate(SVOp), DAG);

  // Same unpck*_v_undef tests as at the top of the function, this time
  // without the OptForSize requirement.
  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);

  //===--------------------------------------------------------------------===//
  // Generate target specific nodes for 128 or 256-bit shuffles only
  // supported in the AVX instruction set.
  //

  // Handle VMOVDDUPY permutations
  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

  // Handle VPERMILPS/D* permutations
  if (isVPERMILPMask(M, VT)) {
    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                  getShuffleSHUFImmediate(SVOp), DAG);
    return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  // Idx is filled in by isINSERT64x4Mask and only read when it returns true.
  unsigned Idx;
  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
                              Idx*(NumElems/2), DAG, dl);

  // Handle VPERM2F128/VPERM2I128 permutations
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
    return getINSERTPS(SVOp, dl, DAG);

  // Imm8 is filled in by isPermImmMask and only read when it returns true.
  unsigned Imm8;
  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);

  // Variable permute: materialize the mask as a constant integer vector and
  // emit VPERMV (one input) or VPERMV3 (two inputs).
  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
      VT.is512BitVector()) {
    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
    SmallVector<SDValue, 16> permclMask;
    for (unsigned i = 0; i != NumElems; ++i) {
      // Undef (negative) mask entries are encoded as index 0.
      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
    }

    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
    // Note the differing operand orders: VPERMV is built as (Mask, V1) while
    // VPERMV3 is built as (V1, Mask, V2).
    if (V2IsUndef)
      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
      return DAG.getNode(X86ISD::VPERMV, dl, VT,
                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
    return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
  }

  //===--------------------------------------------------------------------===//
  // Since no target specific shuffle was selected for this generic one,
  // lower it into other known shuffles. FIXME: this isn't true yet, but
  // this is the plan.
  //

  // Each type-specific helper below may decline by returning an empty
  // SDValue, in which case the following checks are still tried.

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v32i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 128-bit wide vectors with 4 elements, and match them with
  // several different shuffle types.
  if (NumElems == 4 && VT.is128BitVector())
    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);

  // Handle general 256-bit shuffles
  if (VT.is256BitVector())
    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);

  return SDValue();
}
12869
12870 // This function assumes its argument is a BUILD_VECTOR of constants or
12871 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12872 // true.
12873 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12874                                     unsigned &MaskValue) {
12875   MaskValue = 0;
12876   unsigned NumElems = BuildVector->getNumOperands();
12877   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12878   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12879   unsigned NumElemsInLane = NumElems / NumLanes;
12880
12881   // Blend for v16i16 should be symetric for the both lanes.
12882   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12883     SDValue EltCond = BuildVector->getOperand(i);
12884     SDValue SndLaneEltCond =
12885         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12886
12887     int Lane1Cond = -1, Lane2Cond = -1;
12888     if (isa<ConstantSDNode>(EltCond))
12889       Lane1Cond = !isZero(EltCond);
12890     if (isa<ConstantSDNode>(SndLaneEltCond))
12891       Lane2Cond = !isZero(SndLaneEltCond);
12892
12893     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12894       // Lane1Cond != 0, means we want the first argument.
12895       // Lane1Cond == 0, means we want the second argument.
12896       // The encoding of this argument is 0 for the first argument, 1
12897       // for the second. Therefore, invert the condition.
12898       MaskValue |= !Lane1Cond << i;
12899     else if (Lane1Cond < 0)
12900       MaskValue |= !Lane2Cond << i;
12901     else
12902       return false;
12903   }
12904   return true;
12905 }
12906
12907 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12908 /// instruction.
12909 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12910                                     SelectionDAG &DAG) {
12911   SDValue Cond = Op.getOperand(0);
12912   SDValue LHS = Op.getOperand(1);
12913   SDValue RHS = Op.getOperand(2);
12914   SDLoc dl(Op);
12915   MVT VT = Op.getSimpleValueType();
12916   MVT EltVT = VT.getVectorElementType();
12917   unsigned NumElems = VT.getVectorNumElements();
12918
12919   // There is no blend with immediate in AVX-512.
12920   if (VT.is512BitVector())
12921     return SDValue();
12922
12923   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12924     return SDValue();
12925   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12926     return SDValue();
12927
12928   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12929     return SDValue();
12930
12931   // Check the mask for BLEND and build the value.
12932   unsigned MaskValue = 0;
12933   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12934     return SDValue();
12935
12936   // Convert i32 vectors to floating point if it is not AVX2.
12937   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12938   MVT BlendVT = VT;
12939   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12940     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12941                                NumElems);
12942     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12943     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12944   }
12945
12946   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12947                             DAG.getConstant(MaskValue, MVT::i32));
12948   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12949 }
12950
12951 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12952   // A vselect where all conditions and data are constants can be optimized into
12953   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12954   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12955       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12956       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12957     return SDValue();
12958
12959   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12960   if (BlendOp.getNode())
12961     return BlendOp;
12962
12963   // Some types for vselect were previously set to Expand, not Legal or
12964   // Custom. Return an empty SDValue so we fall-through to Expand, after
12965   // the Custom lowering phase.
12966   MVT VT = Op.getSimpleValueType();
12967   switch (VT.SimpleTy) {
12968   default:
12969     break;
12970   case MVT::v8i16:
12971   case MVT::v16i16:
12972     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12973       break;
12974     return SDValue();
12975   }
12976
12977   // We couldn't create a "Blend with immediate" node.
12978   // This node should still be legal, but we'll have to emit a blendv*
12979   // instruction.
12980   return Op;
12981 }
12982
12983 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12984   MVT VT = Op.getSimpleValueType();
12985   SDLoc dl(Op);
12986
12987   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12988     return SDValue();
12989
12990   if (VT.getSizeInBits() == 8) {
12991     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12992                                   Op.getOperand(0), Op.getOperand(1));
12993     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12994                                   DAG.getValueType(VT));
12995     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12996   }
12997
12998   if (VT.getSizeInBits() == 16) {
12999     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13000     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
13001     if (Idx == 0)
13002       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13003                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13004                                      DAG.getNode(ISD::BITCAST, dl,
13005                                                  MVT::v4i32,
13006                                                  Op.getOperand(0)),
13007                                      Op.getOperand(1)));
13008     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13009                                   Op.getOperand(0), Op.getOperand(1));
13010     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13011                                   DAG.getValueType(VT));
13012     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13013   }
13014
13015   if (VT == MVT::f32) {
13016     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13017     // the result back to FR32 register. It's only worth matching if the
13018     // result has a single use which is a store or a bitcast to i32.  And in
13019     // the case of a store, it's not worth it if the index is a constant 0,
13020     // because a MOVSSmr can be used instead, which is smaller and faster.
13021     if (!Op.hasOneUse())
13022       return SDValue();
13023     SDNode *User = *Op.getNode()->use_begin();
13024     if ((User->getOpcode() != ISD::STORE ||
13025          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13026           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13027         (User->getOpcode() != ISD::BITCAST ||
13028          User->getValueType(0) != MVT::i32))
13029       return SDValue();
13030     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13031                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13032                                               Op.getOperand(0)),
13033                                               Op.getOperand(1));
13034     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13035   }
13036
13037   if (VT == MVT::i32 || VT == MVT::i64) {
13038     // ExtractPS/pextrq works with constant index.
13039     if (isa<ConstantSDNode>(Op.getOperand(1)))
13040       return Op;
13041   }
13042   return SDValue();
13043 }
13044
13045 /// Extract one bit from mask vector, like v16i1 or v8i1.
13046 /// AVX-512 feature.
13047 SDValue
13048 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13049   SDValue Vec = Op.getOperand(0);
13050   SDLoc dl(Vec);
13051   MVT VecVT = Vec.getSimpleValueType();
13052   SDValue Idx = Op.getOperand(1);
13053   MVT EltVT = Op.getSimpleValueType();
13054
13055   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13056   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13057          "Unexpected vector type in ExtractBitFromMaskVector");
13058
13059   // variable index can't be handled in mask registers,
13060   // extend vector to VR512
13061   if (!isa<ConstantSDNode>(Idx)) {
13062     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13063     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13064     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13065                               ExtVT.getVectorElementType(), Ext, Idx);
13066     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13067   }
13068
13069   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13070   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13071   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13072     rc = getRegClassFor(MVT::v16i1);
13073   unsigned MaxSift = rc->getSize()*8 - 1;
13074   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13075                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13076   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13077                     DAG.getConstant(MaxSift, MVT::i8));
13078   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13079                        DAG.getIntPtrConstant(0));
13080 }
13081
13082 SDValue
13083 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13084                                            SelectionDAG &DAG) const {
13085   SDLoc dl(Op);
13086   SDValue Vec = Op.getOperand(0);
13087   MVT VecVT = Vec.getSimpleValueType();
13088   SDValue Idx = Op.getOperand(1);
13089
13090   if (Op.getSimpleValueType() == MVT::i1)
13091     return ExtractBitFromMaskVector(Op, DAG);
13092
13093   if (!isa<ConstantSDNode>(Idx)) {
13094     if (VecVT.is512BitVector() ||
13095         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13096          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13097
13098       MVT MaskEltVT =
13099         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13100       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13101                                     MaskEltVT.getSizeInBits());
13102
13103       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13104       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13105                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13106                                 Idx, DAG.getConstant(0, getPointerTy()));
13107       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13108       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13109                         Perm, DAG.getConstant(0, getPointerTy()));
13110     }
13111     return SDValue();
13112   }
13113
13114   // If this is a 256-bit vector result, first extract the 128-bit vector and
13115   // then extract the element from the 128-bit vector.
13116   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13117
13118     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13119     // Get the 128-bit vector.
13120     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13121     MVT EltVT = VecVT.getVectorElementType();
13122
13123     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13124
13125     //if (IdxVal >= NumElems/2)
13126     //  IdxVal -= NumElems/2;
13127     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13128     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13129                        DAG.getConstant(IdxVal, MVT::i32));
13130   }
13131
13132   assert(VecVT.is128BitVector() && "Unexpected vector length");
13133
13134   if (Subtarget->hasSSE41()) {
13135     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13136     if (Res.getNode())
13137       return Res;
13138   }
13139
13140   MVT VT = Op.getSimpleValueType();
13141   // TODO: handle v16i8.
13142   if (VT.getSizeInBits() == 16) {
13143     SDValue Vec = Op.getOperand(0);
13144     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13145     if (Idx == 0)
13146       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13147                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13148                                      DAG.getNode(ISD::BITCAST, dl,
13149                                                  MVT::v4i32, Vec),
13150                                      Op.getOperand(1)));
13151     // Transform it so it match pextrw which produces a 32-bit result.
13152     MVT EltVT = MVT::i32;
13153     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13154                                   Op.getOperand(0), Op.getOperand(1));
13155     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13156                                   DAG.getValueType(VT));
13157     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13158   }
13159
13160   if (VT.getSizeInBits() == 32) {
13161     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13162     if (Idx == 0)
13163       return Op;
13164
13165     // SHUFPS the element to the lowest double word, then movss.
13166     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13167     MVT VVT = Op.getOperand(0).getSimpleValueType();
13168     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13169                                        DAG.getUNDEF(VVT), Mask);
13170     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13171                        DAG.getIntPtrConstant(0));
13172   }
13173
13174   if (VT.getSizeInBits() == 64) {
13175     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13176     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13177     //        to match extract_elt for f64.
13178     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13179     if (Idx == 0)
13180       return Op;
13181
13182     // UNPCKHPD the element to the lowest double word, then movsd.
13183     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13184     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13185     int Mask[2] = { 1, -1 };
13186     MVT VVT = Op.getOperand(0).getSimpleValueType();
13187     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13188                                        DAG.getUNDEF(VVT), Mask);
13189     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13190                        DAG.getIntPtrConstant(0));
13191   }
13192
13193   return SDValue();
13194 }
13195
13196 /// Insert one bit to mask vector, like v16i1 or v8i1.
13197 /// AVX-512 feature.
13198 SDValue
13199 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13200   SDLoc dl(Op);
13201   SDValue Vec = Op.getOperand(0);
13202   SDValue Elt = Op.getOperand(1);
13203   SDValue Idx = Op.getOperand(2);
13204   MVT VecVT = Vec.getSimpleValueType();
13205
13206   if (!isa<ConstantSDNode>(Idx)) {
13207     // Non constant index. Extend source and destination,
13208     // insert element and then truncate the result.
13209     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13210     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13211     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13212       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13213       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13214     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13215   }
13216
13217   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13218   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13219   if (Vec.getOpcode() == ISD::UNDEF)
13220     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13221                        DAG.getConstant(IdxVal, MVT::i8));
13222   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13223   unsigned MaxSift = rc->getSize()*8 - 1;
13224   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13225                     DAG.getConstant(MaxSift, MVT::i8));
13226   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13227                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13228   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13229 }
13230
13231 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13232                                                   SelectionDAG &DAG) const {
13233   MVT VT = Op.getSimpleValueType();
13234   MVT EltVT = VT.getVectorElementType();
13235
13236   if (EltVT == MVT::i1)
13237     return InsertBitToMaskVector(Op, DAG);
13238
13239   SDLoc dl(Op);
13240   SDValue N0 = Op.getOperand(0);
13241   SDValue N1 = Op.getOperand(1);
13242   SDValue N2 = Op.getOperand(2);
13243   if (!isa<ConstantSDNode>(N2))
13244     return SDValue();
13245   auto *N2C = cast<ConstantSDNode>(N2);
13246   unsigned IdxVal = N2C->getZExtValue();
13247
13248   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13249   // into that, and then insert the subvector back into the result.
13250   if (VT.is256BitVector() || VT.is512BitVector()) {
13251     // Get the desired 128-bit vector half.
13252     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13253
13254     // Insert the element into the desired half.
13255     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13256     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13257
13258     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13259                     DAG.getConstant(IdxIn128, MVT::i32));
13260
13261     // Insert the changed part back to the 256-bit vector
13262     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13263   }
13264   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13265
13266   if (Subtarget->hasSSE41()) {
13267     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13268       unsigned Opc;
13269       if (VT == MVT::v8i16) {
13270         Opc = X86ISD::PINSRW;
13271       } else {
13272         assert(VT == MVT::v16i8);
13273         Opc = X86ISD::PINSRB;
13274       }
13275
13276       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13277       // argument.
13278       if (N1.getValueType() != MVT::i32)
13279         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13280       if (N2.getValueType() != MVT::i32)
13281         N2 = DAG.getIntPtrConstant(IdxVal);
13282       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13283     }
13284
13285     if (EltVT == MVT::f32) {
13286       // Bits [7:6] of the constant are the source select.  This will always be
13287       //  zero here.  The DAG Combiner may combine an extract_elt index into
13288       //  these
13289       //  bits.  For example (insert (extract, 3), 2) could be matched by
13290       //  putting
13291       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13292       // Bits [5:4] of the constant are the destination select.  This is the
13293       //  value of the incoming immediate.
13294       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13295       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13296       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13297       // Create this as a scalar to vector..
13298       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13299       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13300     }
13301
13302     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13303       // PINSR* works with constant index.
13304       return Op;
13305     }
13306   }
13307
13308   if (EltVT == MVT::i8)
13309     return SDValue();
13310
13311   if (EltVT.getSizeInBits() == 16) {
13312     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13313     // as its second argument.
13314     if (N1.getValueType() != MVT::i32)
13315       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13316     if (N2.getValueType() != MVT::i32)
13317       N2 = DAG.getIntPtrConstant(IdxVal);
13318     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13319   }
13320   return SDValue();
13321 }
13322
13323 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13324   SDLoc dl(Op);
13325   MVT OpVT = Op.getSimpleValueType();
13326
13327   // If this is a 256-bit vector result, first insert into a 128-bit
13328   // vector and then insert into the 256-bit vector.
13329   if (!OpVT.is128BitVector()) {
13330     // Insert into a 128-bit vector.
13331     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13332     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13333                                  OpVT.getVectorNumElements() / SizeFactor);
13334
13335     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13336
13337     // Insert the 128-bit vector.
13338     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13339   }
13340
13341   if (OpVT == MVT::v1i64 &&
13342       Op.getOperand(0).getValueType() == MVT::i64)
13343     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13344
13345   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13346   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13347   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13348                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13349 }
13350
13351 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13352 // a simple subregister reference or explicit instructions to grab
13353 // upper bits of a vector.
13354 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13355                                       SelectionDAG &DAG) {
13356   SDLoc dl(Op);
13357   SDValue In =  Op.getOperand(0);
13358   SDValue Idx = Op.getOperand(1);
13359   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13360   MVT ResVT   = Op.getSimpleValueType();
13361   MVT InVT    = In.getSimpleValueType();
13362
13363   if (Subtarget->hasFp256()) {
13364     if (ResVT.is128BitVector() &&
13365         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13366         isa<ConstantSDNode>(Idx)) {
13367       return Extract128BitVector(In, IdxVal, DAG, dl);
13368     }
13369     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13370         isa<ConstantSDNode>(Idx)) {
13371       return Extract256BitVector(In, IdxVal, DAG, dl);
13372     }
13373   }
13374   return SDValue();
13375 }
13376
13377 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13378 // simple superregister reference or explicit instructions to insert
13379 // the upper bits of a vector.
13380 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13381                                      SelectionDAG &DAG) {
13382   if (!Subtarget->hasAVX())
13383     return SDValue();
13384
13385   SDLoc dl(Op);
13386   SDValue Vec = Op.getOperand(0);
13387   SDValue SubVec = Op.getOperand(1);
13388   SDValue Idx = Op.getOperand(2);
13389
13390   if (!isa<ConstantSDNode>(Idx))
13391     return SDValue();
13392
13393   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13394   MVT OpVT = Op.getSimpleValueType();
13395   MVT SubVecVT = SubVec.getSimpleValueType();
13396
13397   // Fold two 16-byte subvector loads into one 32-byte load:
13398   // (insert_subvector (insert_subvector undef, (load addr), 0),
13399   //                   (load addr + 16), Elts/2)
13400   // --> load32 addr
13401   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13402       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13403       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13404       !Subtarget->isUnalignedMem32Slow()) {
13405     SDValue SubVec2 = Vec.getOperand(1);
13406     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13407       if (Idx2->getZExtValue() == 0) {
13408         SDValue Ops[] = { SubVec2, SubVec };
13409         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13410         if (LD.getNode())
13411           return LD;
13412       }
13413     }
13414   }
13415
13416   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13417       SubVecVT.is128BitVector())
13418     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13419
13420   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13421     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13422
13423   return SDValue();
13424 }
13425
13426 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13427 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13428 // one of the above mentioned nodes. It has to be wrapped because otherwise
13429 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13430 // be used to form addressing mode. These wrapped nodes will be selected
13431 // into MOV32ri.
13432 SDValue
13433 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13434   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13435
13436   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13437   // global base reg.
13438   unsigned char OpFlag = 0;
13439   unsigned WrapperKind = X86ISD::Wrapper;
13440   CodeModel::Model M = DAG.getTarget().getCodeModel();
13441
13442   if (Subtarget->isPICStyleRIPRel() &&
13443       (M == CodeModel::Small || M == CodeModel::Kernel))
13444     WrapperKind = X86ISD::WrapperRIP;
13445   else if (Subtarget->isPICStyleGOT())
13446     OpFlag = X86II::MO_GOTOFF;
13447   else if (Subtarget->isPICStyleStubPIC())
13448     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13449
13450   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13451                                              CP->getAlignment(),
13452                                              CP->getOffset(), OpFlag);
13453   SDLoc DL(CP);
13454   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13455   // With PIC, the address is actually $g + Offset.
13456   if (OpFlag) {
13457     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13458                          DAG.getNode(X86ISD::GlobalBaseReg,
13459                                      SDLoc(), getPointerTy()),
13460                          Result);
13461   }
13462
13463   return Result;
13464 }
13465
13466 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13467   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13468
13469   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13470   // global base reg.
13471   unsigned char OpFlag = 0;
13472   unsigned WrapperKind = X86ISD::Wrapper;
13473   CodeModel::Model M = DAG.getTarget().getCodeModel();
13474
13475   if (Subtarget->isPICStyleRIPRel() &&
13476       (M == CodeModel::Small || M == CodeModel::Kernel))
13477     WrapperKind = X86ISD::WrapperRIP;
13478   else if (Subtarget->isPICStyleGOT())
13479     OpFlag = X86II::MO_GOTOFF;
13480   else if (Subtarget->isPICStyleStubPIC())
13481     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13482
13483   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13484                                           OpFlag);
13485   SDLoc DL(JT);
13486   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13487
13488   // With PIC, the address is actually $g + Offset.
13489   if (OpFlag)
13490     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13491                          DAG.getNode(X86ISD::GlobalBaseReg,
13492                                      SDLoc(), getPointerTy()),
13493                          Result);
13494
13495   return Result;
13496 }
13497
13498 SDValue
13499 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13500   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13501
13502   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13503   // global base reg.
13504   unsigned char OpFlag = 0;
13505   unsigned WrapperKind = X86ISD::Wrapper;
13506   CodeModel::Model M = DAG.getTarget().getCodeModel();
13507
13508   if (Subtarget->isPICStyleRIPRel() &&
13509       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13510     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13511       OpFlag = X86II::MO_GOTPCREL;
13512     WrapperKind = X86ISD::WrapperRIP;
13513   } else if (Subtarget->isPICStyleGOT()) {
13514     OpFlag = X86II::MO_GOT;
13515   } else if (Subtarget->isPICStyleStubPIC()) {
13516     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13517   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13518     OpFlag = X86II::MO_DARWIN_NONLAZY;
13519   }
13520
13521   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13522
13523   SDLoc DL(Op);
13524   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13525
13526   // With PIC, the address is actually $g + Offset.
13527   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13528       !Subtarget->is64Bit()) {
13529     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13530                          DAG.getNode(X86ISD::GlobalBaseReg,
13531                                      SDLoc(), getPointerTy()),
13532                          Result);
13533   }
13534
13535   // For symbols that require a load from a stub to get the address, emit the
13536   // load.
13537   if (isGlobalStubReference(OpFlag))
13538     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13539                          MachinePointerInfo::getGOT(), false, false, false, 0);
13540
13541   return Result;
13542 }
13543
13544 SDValue
13545 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13546   // Create the TargetBlockAddressAddress node.
13547   unsigned char OpFlags =
13548     Subtarget->ClassifyBlockAddressReference();
13549   CodeModel::Model M = DAG.getTarget().getCodeModel();
13550   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13551   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13552   SDLoc dl(Op);
13553   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13554                                              OpFlags);
13555
13556   if (Subtarget->isPICStyleRIPRel() &&
13557       (M == CodeModel::Small || M == CodeModel::Kernel))
13558     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13559   else
13560     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13561
13562   // With PIC, the address is actually $g + Offset.
13563   if (isGlobalRelativeToPICBase(OpFlags)) {
13564     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13565                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13566                          Result);
13567   }
13568
13569   return Result;
13570 }
13571
13572 SDValue
13573 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13574                                       int64_t Offset, SelectionDAG &DAG) const {
13575   // Create the TargetGlobalAddress node, folding in the constant
13576   // offset if it is legal.
13577   unsigned char OpFlags =
13578       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13579   CodeModel::Model M = DAG.getTarget().getCodeModel();
13580   SDValue Result;
13581   if (OpFlags == X86II::MO_NO_FLAG &&
13582       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13583     // A direct static reference to a global.
13584     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13585     Offset = 0;
13586   } else {
13587     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13588   }
13589
13590   if (Subtarget->isPICStyleRIPRel() &&
13591       (M == CodeModel::Small || M == CodeModel::Kernel))
13592     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13593   else
13594     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13595
13596   // With PIC, the address is actually $g + Offset.
13597   if (isGlobalRelativeToPICBase(OpFlags)) {
13598     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13599                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13600                          Result);
13601   }
13602
13603   // For globals that require a load from a stub to get the address, emit the
13604   // load.
13605   if (isGlobalStubReference(OpFlags))
13606     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13607                          MachinePointerInfo::getGOT(), false, false, false, 0);
13608
13609   // If there was a non-zero offset that we didn't fold, create an explicit
13610   // addition for it.
13611   if (Offset != 0)
13612     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13613                          DAG.getConstant(Offset, getPointerTy()));
13614
13615   return Result;
13616 }
13617
13618 SDValue
13619 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13620   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13621   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13622   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13623 }
13624
13625 static SDValue
13626 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13627            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13628            unsigned char OperandFlags, bool LocalDynamic = false) {
13629   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13630   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13631   SDLoc dl(GA);
13632   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13633                                            GA->getValueType(0),
13634                                            GA->getOffset(),
13635                                            OperandFlags);
13636
13637   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13638                                            : X86ISD::TLSADDR;
13639
13640   if (InFlag) {
13641     SDValue Ops[] = { Chain,  TGA, *InFlag };
13642     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13643   } else {
13644     SDValue Ops[]  = { Chain, TGA };
13645     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13646   }
13647
13648   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13649   MFI->setAdjustsStack(true);
13650   MFI->setHasCalls(true);
13651
13652   SDValue Flag = Chain.getValue(1);
13653   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13654 }
13655
13656 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13657 static SDValue
13658 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13659                                 const EVT PtrVT) {
13660   SDValue InFlag;
13661   SDLoc dl(GA);  // ? function entry point might be better
13662   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13663                                    DAG.getNode(X86ISD::GlobalBaseReg,
13664                                                SDLoc(), PtrVT), InFlag);
13665   InFlag = Chain.getValue(1);
13666
13667   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13668 }
13669
13670 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13671 static SDValue
13672 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13673                                 const EVT PtrVT) {
13674   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13675                     X86::RAX, X86II::MO_TLSGD);
13676 }
13677
13678 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13679                                            SelectionDAG &DAG,
13680                                            const EVT PtrVT,
13681                                            bool is64Bit) {
13682   SDLoc dl(GA);
13683
13684   // Get the start address of the TLS block for this module.
13685   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13686       .getInfo<X86MachineFunctionInfo>();
13687   MFI->incNumLocalDynamicTLSAccesses();
13688
13689   SDValue Base;
13690   if (is64Bit) {
13691     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13692                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13693   } else {
13694     SDValue InFlag;
13695     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13696         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13697     InFlag = Chain.getValue(1);
13698     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13699                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13700   }
13701
13702   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13703   // of Base.
13704
13705   // Build x@dtpoff.
13706   unsigned char OperandFlags = X86II::MO_DTPOFF;
13707   unsigned WrapperKind = X86ISD::Wrapper;
13708   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13709                                            GA->getValueType(0),
13710                                            GA->getOffset(), OperandFlags);
13711   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13712
13713   // Add x@dtpoff with the base.
13714   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13715 }
13716
13717 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13718 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13719                                    const EVT PtrVT, TLSModel::Model model,
13720                                    bool is64Bit, bool isPIC) {
13721   SDLoc dl(GA);
13722
13723   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13724   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13725                                                          is64Bit ? 257 : 256));
13726
13727   SDValue ThreadPointer =
13728       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13729                   MachinePointerInfo(Ptr), false, false, false, 0);
13730
13731   unsigned char OperandFlags = 0;
13732   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13733   // initialexec.
13734   unsigned WrapperKind = X86ISD::Wrapper;
13735   if (model == TLSModel::LocalExec) {
13736     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13737   } else if (model == TLSModel::InitialExec) {
13738     if (is64Bit) {
13739       OperandFlags = X86II::MO_GOTTPOFF;
13740       WrapperKind = X86ISD::WrapperRIP;
13741     } else {
13742       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13743     }
13744   } else {
13745     llvm_unreachable("Unexpected model");
13746   }
13747
13748   // emit "addl x@ntpoff,%eax" (local exec)
13749   // or "addl x@indntpoff,%eax" (initial exec)
13750   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13751   SDValue TGA =
13752       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13753                                  GA->getOffset(), OperandFlags);
13754   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13755
13756   if (model == TLSModel::InitialExec) {
13757     if (isPIC && !is64Bit) {
13758       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13759                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13760                            Offset);
13761     }
13762
13763     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13764                          MachinePointerInfo::getGOT(), false, false, false, 0);
13765   }
13766
13767   // The address of the thread local variable is the add of the thread
13768   // pointer with the offset of the variable.
13769   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13770 }
13771
13772 SDValue
13773 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13774
13775   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13776   const GlobalValue *GV = GA->getGlobal();
13777
13778   if (Subtarget->isTargetELF()) {
13779     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13780
13781     switch (model) {
13782       case TLSModel::GeneralDynamic:
13783         if (Subtarget->is64Bit())
13784           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13785         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13786       case TLSModel::LocalDynamic:
13787         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13788                                            Subtarget->is64Bit());
13789       case TLSModel::InitialExec:
13790       case TLSModel::LocalExec:
13791         return LowerToTLSExecModel(
13792             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13793             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13794     }
13795     llvm_unreachable("Unknown TLS model.");
13796   }
13797
13798   if (Subtarget->isTargetDarwin()) {
13799     // Darwin only has one model of TLS.  Lower to that.
13800     unsigned char OpFlag = 0;
13801     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13802                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13803
13804     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13805     // global base reg.
13806     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13807                  !Subtarget->is64Bit();
13808     if (PIC32)
13809       OpFlag = X86II::MO_TLVP_PIC_BASE;
13810     else
13811       OpFlag = X86II::MO_TLVP;
13812     SDLoc DL(Op);
13813     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13814                                                 GA->getValueType(0),
13815                                                 GA->getOffset(), OpFlag);
13816     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13817
13818     // With PIC32, the address is actually $g + Offset.
13819     if (PIC32)
13820       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13821                            DAG.getNode(X86ISD::GlobalBaseReg,
13822                                        SDLoc(), getPointerTy()),
13823                            Offset);
13824
13825     // Lowering the machine isd will make sure everything is in the right
13826     // location.
13827     SDValue Chain = DAG.getEntryNode();
13828     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13829     SDValue Args[] = { Chain, Offset };
13830     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13831
13832     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13833     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13834     MFI->setAdjustsStack(true);
13835
13836     // And our return value (tls address) is in the standard call return value
13837     // location.
13838     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13839     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13840                               Chain.getValue(1));
13841   }
13842
13843   if (Subtarget->isTargetKnownWindowsMSVC() ||
13844       Subtarget->isTargetWindowsGNU()) {
13845     // Just use the implicit TLS architecture
13846     // Need to generate someting similar to:
13847     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13848     //                                  ; from TEB
13849     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13850     //   mov     rcx, qword [rdx+rcx*8]
13851     //   mov     eax, .tls$:tlsvar
13852     //   [rax+rcx] contains the address
13853     // Windows 64bit: gs:0x58
13854     // Windows 32bit: fs:__tls_array
13855
13856     SDLoc dl(GA);
13857     SDValue Chain = DAG.getEntryNode();
13858
13859     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13860     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13861     // use its literal value of 0x2C.
13862     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13863                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13864                                                              256)
13865                                         : Type::getInt32PtrTy(*DAG.getContext(),
13866                                                               257));
13867
13868     SDValue TlsArray =
13869         Subtarget->is64Bit()
13870             ? DAG.getIntPtrConstant(0x58)
13871             : (Subtarget->isTargetWindowsGNU()
13872                    ? DAG.getIntPtrConstant(0x2C)
13873                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13874
13875     SDValue ThreadPointer =
13876         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13877                     MachinePointerInfo(Ptr), false, false, false, 0);
13878
13879     // Load the _tls_index variable
13880     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13881     if (Subtarget->is64Bit())
13882       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13883                            IDX, MachinePointerInfo(), MVT::i32,
13884                            false, false, false, 0);
13885     else
13886       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13887                         false, false, false, 0);
13888
13889     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13890                                     getPointerTy());
13891     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13892
13893     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13894     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13895                       false, false, false, 0);
13896
13897     // Get the offset of start of .tls section
13898     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13899                                              GA->getValueType(0),
13900                                              GA->getOffset(), X86II::MO_SECREL);
13901     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13902
13903     // The address of the thread local variable is the add of the thread
13904     // pointer with the offset of the variable.
13905     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13906   }
13907
13908   llvm_unreachable("TLS not implemented for this target.");
13909 }
13910
13911 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13912 /// and take a 2 x i32 value to shift plus a shift amount.
13913 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13914   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13915   MVT VT = Op.getSimpleValueType();
13916   unsigned VTBits = VT.getSizeInBits();
13917   SDLoc dl(Op);
13918   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13919   SDValue ShOpLo = Op.getOperand(0);
13920   SDValue ShOpHi = Op.getOperand(1);
13921   SDValue ShAmt  = Op.getOperand(2);
13922   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13923   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13924   // during isel.
13925   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13926                                   DAG.getConstant(VTBits - 1, MVT::i8));
13927   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13928                                      DAG.getConstant(VTBits - 1, MVT::i8))
13929                        : DAG.getConstant(0, VT);
13930
13931   SDValue Tmp2, Tmp3;
13932   if (Op.getOpcode() == ISD::SHL_PARTS) {
13933     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13934     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13935   } else {
13936     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13937     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13938   }
13939
13940   // If the shift amount is larger or equal than the width of a part we can't
13941   // rely on the results of shld/shrd. Insert a test and select the appropriate
13942   // values for large shift amounts.
13943   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13944                                 DAG.getConstant(VTBits, MVT::i8));
13945   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13946                              AndNode, DAG.getConstant(0, MVT::i8));
13947
13948   SDValue Hi, Lo;
13949   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13950   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13951   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13952
13953   if (Op.getOpcode() == ISD::SHL_PARTS) {
13954     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13955     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13956   } else {
13957     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13958     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13959   }
13960
13961   SDValue Ops[2] = { Lo, Hi };
13962   return DAG.getMergeValues(Ops, dl);
13963 }
13964
13965 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13966                                            SelectionDAG &DAG) const {
13967   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13968   SDLoc dl(Op);
13969
13970   if (SrcVT.isVector()) {
13971     if (SrcVT.getVectorElementType() == MVT::i1) {
13972       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13973       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13974                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13975                                      Op.getOperand(0)));
13976     }
13977     return SDValue();
13978   }
13979
13980   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13981          "Unknown SINT_TO_FP to lower!");
13982
13983   // These are really Legal; return the operand so the caller accepts it as
13984   // Legal.
13985   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13986     return Op;
13987   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13988       Subtarget->is64Bit()) {
13989     return Op;
13990   }
13991
13992   unsigned Size = SrcVT.getSizeInBits()/8;
13993   MachineFunction &MF = DAG.getMachineFunction();
13994   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13995   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13996   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13997                                StackSlot,
13998                                MachinePointerInfo::getFixedStack(SSFI),
13999                                false, false, 0);
14000   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14001 }
14002
14003 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14004                                      SDValue StackSlot,
14005                                      SelectionDAG &DAG) const {
14006   // Build the FILD
14007   SDLoc DL(Op);
14008   SDVTList Tys;
14009   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14010   if (useSSE)
14011     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14012   else
14013     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14014
14015   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14016
14017   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14018   MachineMemOperand *MMO;
14019   if (FI) {
14020     int SSFI = FI->getIndex();
14021     MMO =
14022       DAG.getMachineFunction()
14023       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14024                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14025   } else {
14026     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14027     StackSlot = StackSlot.getOperand(1);
14028   }
14029   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14030   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14031                                            X86ISD::FILD, DL,
14032                                            Tys, Ops, SrcVT, MMO);
14033
14034   if (useSSE) {
14035     Chain = Result.getValue(1);
14036     SDValue InFlag = Result.getValue(2);
14037
14038     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14039     // shouldn't be necessary except that RFP cannot be live across
14040     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14041     MachineFunction &MF = DAG.getMachineFunction();
14042     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14043     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14044     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14045     Tys = DAG.getVTList(MVT::Other);
14046     SDValue Ops[] = {
14047       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14048     };
14049     MachineMemOperand *MMO =
14050       DAG.getMachineFunction()
14051       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14052                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14053
14054     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14055                                     Ops, Op.getValueType(), MMO);
14056     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14057                          MachinePointerInfo::getFixedStack(SSFI),
14058                          false, false, false, 0);
14059   }
14060
14061   return Result;
14062 }
14063
14064 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14065 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14066                                                SelectionDAG &DAG) const {
14067   // This algorithm is not obvious. Here it is what we're trying to output:
14068   /*
14069      movq       %rax,  %xmm0
14070      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14071      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14072      #ifdef __SSE3__
14073        haddpd   %xmm0, %xmm0
14074      #else
14075        pshufd   $0x4e, %xmm0, %xmm1
14076        addpd    %xmm1, %xmm0
14077      #endif
14078   */
14079
14080   SDLoc dl(Op);
14081   LLVMContext *Context = DAG.getContext();
14082
14083   // Build some magic constants.
14084   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14085   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14086   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14087
14088   SmallVector<Constant*,2> CV1;
14089   CV1.push_back(
14090     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14091                                       APInt(64, 0x4330000000000000ULL))));
14092   CV1.push_back(
14093     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14094                                       APInt(64, 0x4530000000000000ULL))));
14095   Constant *C1 = ConstantVector::get(CV1);
14096   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14097
14098   // Load the 64-bit value into an XMM register.
14099   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14100                             Op.getOperand(0));
14101   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14102                               MachinePointerInfo::getConstantPool(),
14103                               false, false, false, 16);
14104   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14105                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14106                               CLod0);
14107
14108   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14109                               MachinePointerInfo::getConstantPool(),
14110                               false, false, false, 16);
14111   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14112   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14113   SDValue Result;
14114
14115   if (Subtarget->hasSSE3()) {
14116     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14117     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14118   } else {
14119     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14120     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14121                                            S2F, 0x4E, DAG);
14122     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14123                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14124                          Sub);
14125   }
14126
14127   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14128                      DAG.getIntPtrConstant(0));
14129 }
14130
14131 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14132 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14133                                                SelectionDAG &DAG) const {
14134   SDLoc dl(Op);
14135   // FP constant to bias correct the final result.
14136   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14137                                    MVT::f64);
14138
14139   // Load the 32-bit value into an XMM register.
14140   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14141                              Op.getOperand(0));
14142
14143   // Zero out the upper parts of the register.
14144   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14145
14146   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14147                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14148                      DAG.getIntPtrConstant(0));
14149
14150   // Or the load with the bias.
14151   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14152                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14153                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14154                                                    MVT::v2f64, Load)),
14155                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14156                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14157                                                    MVT::v2f64, Bias)));
14158   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14159                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14160                    DAG.getIntPtrConstant(0));
14161
14162   // Subtract the bias.
14163   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14164
14165   // Handle final rounding.
14166   EVT DestVT = Op.getValueType();
14167
14168   if (DestVT.bitsLT(MVT::f64))
14169     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14170                        DAG.getIntPtrConstant(0));
14171   if (DestVT.bitsGT(MVT::f64))
14172     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14173
14174   // Handle final rounding.
14175   return Sub;
14176 }
14177
14178 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14179                                      const X86Subtarget &Subtarget) {
14180   // The algorithm is the following:
14181   // #ifdef __SSE4_1__
14182   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14183   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14184   //                                 (uint4) 0x53000000, 0xaa);
14185   // #else
14186   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14187   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14188   // #endif
14189   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14190   //     return (float4) lo + fhi;
14191
14192   SDLoc DL(Op);
14193   SDValue V = Op->getOperand(0);
14194   EVT VecIntVT = V.getValueType();
14195   bool Is128 = VecIntVT == MVT::v4i32;
14196   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14197   // If we convert to something else than the supported type, e.g., to v4f64,
14198   // abort early.
14199   if (VecFloatVT != Op->getValueType(0))
14200     return SDValue();
14201
14202   unsigned NumElts = VecIntVT.getVectorNumElements();
14203   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14204          "Unsupported custom type");
14205   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14206
14207   // In the #idef/#else code, we have in common:
14208   // - The vector of constants:
14209   // -- 0x4b000000
14210   // -- 0x53000000
14211   // - A shift:
14212   // -- v >> 16
14213
14214   // Create the splat vector for 0x4b000000.
14215   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14216   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14217                            CstLow, CstLow, CstLow, CstLow};
14218   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14219                                   makeArrayRef(&CstLowArray[0], NumElts));
14220   // Create the splat vector for 0x53000000.
14221   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14222   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14223                             CstHigh, CstHigh, CstHigh, CstHigh};
14224   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14225                                    makeArrayRef(&CstHighArray[0], NumElts));
14226
14227   // Create the right shift.
14228   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14229   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14230                              CstShift, CstShift, CstShift, CstShift};
14231   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14232                                     makeArrayRef(&CstShiftArray[0], NumElts));
14233   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14234
14235   SDValue Low, High;
14236   if (Subtarget.hasSSE41()) {
14237     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14238     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14239     SDValue VecCstLowBitcast =
14240         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14241     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14242     // Low will be bitcasted right away, so do not bother bitcasting back to its
14243     // original type.
14244     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14245                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14246     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14247     //                                 (uint4) 0x53000000, 0xaa);
14248     SDValue VecCstHighBitcast =
14249         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14250     SDValue VecShiftBitcast =
14251         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14252     // High will be bitcasted right away, so do not bother bitcasting back to
14253     // its original type.
14254     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14255                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14256   } else {
14257     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14258     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14259                                      CstMask, CstMask, CstMask);
14260     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14261     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14262     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14263
14264     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14265     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14266   }
14267
14268   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14269   SDValue CstFAdd = DAG.getConstantFP(
14270       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14271   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14272                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14273   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14274                                    makeArrayRef(&CstFAddArray[0], NumElts));
14275
14276   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14277   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14278   SDValue FHigh =
14279       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14280   //     return (float4) lo + fhi;
14281   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14282   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14283 }
14284
14285 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14286                                                SelectionDAG &DAG) const {
14287   SDValue N0 = Op.getOperand(0);
14288   MVT SVT = N0.getSimpleValueType();
14289   SDLoc dl(Op);
14290
14291   switch (SVT.SimpleTy) {
14292   default:
14293     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14294   case MVT::v4i8:
14295   case MVT::v4i16:
14296   case MVT::v8i8:
14297   case MVT::v8i16: {
14298     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14299     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14300                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14301   }
14302   case MVT::v4i32:
14303   case MVT::v8i32:
14304     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14305   }
14306   llvm_unreachable(nullptr);
14307 }
14308
14309 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14310                                            SelectionDAG &DAG) const {
14311   SDValue N0 = Op.getOperand(0);
14312   SDLoc dl(Op);
14313
14314   if (Op.getValueType().isVector())
14315     return lowerUINT_TO_FP_vec(Op, DAG);
14316
14317   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14318   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14319   // the optimization here.
14320   if (DAG.SignBitIsZero(N0))
14321     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14322
14323   MVT SrcVT = N0.getSimpleValueType();
14324   MVT DstVT = Op.getSimpleValueType();
14325   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14326     return LowerUINT_TO_FP_i64(Op, DAG);
14327   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14328     return LowerUINT_TO_FP_i32(Op, DAG);
14329   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14330     return SDValue();
14331
14332   // Make a 64-bit buffer, and use it to build an FILD.
14333   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14334   if (SrcVT == MVT::i32) {
14335     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14336     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14337                                      getPointerTy(), StackSlot, WordOff);
14338     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14339                                   StackSlot, MachinePointerInfo(),
14340                                   false, false, 0);
14341     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14342                                   OffsetSlot, MachinePointerInfo(),
14343                                   false, false, 0);
14344     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14345     return Fild;
14346   }
14347
14348   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14349   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14350                                StackSlot, MachinePointerInfo(),
14351                                false, false, 0);
14352   // For i64 source, we need to add the appropriate power of 2 if the input
14353   // was negative.  This is the same as the optimization in
14354   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14355   // we must be careful to do the computation in x87 extended precision, not
14356   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14357   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14358   MachineMemOperand *MMO =
14359     DAG.getMachineFunction()
14360     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14361                           MachineMemOperand::MOLoad, 8, 8);
14362
14363   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14364   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14365   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14366                                          MVT::i64, MMO);
14367
14368   APInt FF(32, 0x5F800000ULL);
14369
14370   // Check whether the sign bit is set.
14371   SDValue SignSet = DAG.getSetCC(dl,
14372                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14373                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14374                                  ISD::SETLT);
14375
14376   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14377   SDValue FudgePtr = DAG.getConstantPool(
14378                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14379                                          getPointerTy());
14380
14381   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14382   SDValue Zero = DAG.getIntPtrConstant(0);
14383   SDValue Four = DAG.getIntPtrConstant(4);
14384   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14385                                Zero, Four);
14386   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14387
14388   // Load the value out, extending it from f32 to f80.
14389   // FIXME: Avoid the extend by constructing the right constant pool?
14390   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14391                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14392                                  MVT::f32, false, false, false, 4);
14393   // Extend everything to 80 bits to force it to be done on x87.
14394   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14395   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14396 }
14397
14398 std::pair<SDValue,SDValue>
14399 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14400                                     bool IsSigned, bool IsReplace) const {
14401   SDLoc DL(Op);
14402
14403   EVT DstTy = Op.getValueType();
14404
14405   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14406     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14407     DstTy = MVT::i64;
14408   }
14409
14410   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14411          DstTy.getSimpleVT() >= MVT::i16 &&
14412          "Unknown FP_TO_INT to lower!");
14413
14414   // These are really Legal.
14415   if (DstTy == MVT::i32 &&
14416       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14417     return std::make_pair(SDValue(), SDValue());
14418   if (Subtarget->is64Bit() &&
14419       DstTy == MVT::i64 &&
14420       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14421     return std::make_pair(SDValue(), SDValue());
14422
14423   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14424   // stack slot, or into the FTOL runtime function.
14425   MachineFunction &MF = DAG.getMachineFunction();
14426   unsigned MemSize = DstTy.getSizeInBits()/8;
14427   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14428   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14429
14430   unsigned Opc;
14431   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14432     Opc = X86ISD::WIN_FTOL;
14433   else
14434     switch (DstTy.getSimpleVT().SimpleTy) {
14435     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14436     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14437     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14438     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14439     }
14440
14441   SDValue Chain = DAG.getEntryNode();
14442   SDValue Value = Op.getOperand(0);
14443   EVT TheVT = Op.getOperand(0).getValueType();
14444   // FIXME This causes a redundant load/store if the SSE-class value is already
14445   // in memory, such as if it is on the callstack.
14446   if (isScalarFPTypeInSSEReg(TheVT)) {
14447     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14448     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14449                          MachinePointerInfo::getFixedStack(SSFI),
14450                          false, false, 0);
14451     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14452     SDValue Ops[] = {
14453       Chain, StackSlot, DAG.getValueType(TheVT)
14454     };
14455
14456     MachineMemOperand *MMO =
14457       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14458                               MachineMemOperand::MOLoad, MemSize, MemSize);
14459     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14460     Chain = Value.getValue(1);
14461     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14462     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14463   }
14464
14465   MachineMemOperand *MMO =
14466     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14467                             MachineMemOperand::MOStore, MemSize, MemSize);
14468
14469   if (Opc != X86ISD::WIN_FTOL) {
14470     // Build the FP_TO_INT*_IN_MEM
14471     SDValue Ops[] = { Chain, Value, StackSlot };
14472     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14473                                            Ops, DstTy, MMO);
14474     return std::make_pair(FIST, StackSlot);
14475   } else {
14476     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14477       DAG.getVTList(MVT::Other, MVT::Glue),
14478       Chain, Value);
14479     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14480       MVT::i32, ftol.getValue(1));
14481     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14482       MVT::i32, eax.getValue(2));
14483     SDValue Ops[] = { eax, edx };
14484     SDValue pair = IsReplace
14485       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14486       : DAG.getMergeValues(Ops, DL);
14487     return std::make_pair(pair, SDValue());
14488   }
14489 }
14490
14491 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14492                               const X86Subtarget *Subtarget) {
14493   MVT VT = Op->getSimpleValueType(0);
14494   SDValue In = Op->getOperand(0);
14495   MVT InVT = In.getSimpleValueType();
14496   SDLoc dl(Op);
14497
14498   // Optimize vectors in AVX mode:
14499   //
14500   //   v8i16 -> v8i32
14501   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14502   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14503   //   Concat upper and lower parts.
14504   //
14505   //   v4i32 -> v4i64
14506   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14507   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14508   //   Concat upper and lower parts.
14509   //
14510
14511   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14512       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14513       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14514     return SDValue();
14515
14516   if (Subtarget->hasInt256())
14517     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14518
14519   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14520   SDValue Undef = DAG.getUNDEF(InVT);
14521   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14522   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14523   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14524
14525   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14526                              VT.getVectorNumElements()/2);
14527
14528   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14529   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14530
14531   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14532 }
14533
14534 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14535                                         SelectionDAG &DAG) {
14536   MVT VT = Op->getSimpleValueType(0);
14537   SDValue In = Op->getOperand(0);
14538   MVT InVT = In.getSimpleValueType();
14539   SDLoc DL(Op);
14540   unsigned int NumElts = VT.getVectorNumElements();
14541   if (NumElts != 8 && NumElts != 16)
14542     return SDValue();
14543
14544   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14545     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14546
14547   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14548   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14549   // Now we have only mask extension
14550   assert(InVT.getVectorElementType() == MVT::i1);
14551   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14552   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14553   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14554   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14555   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14556                            MachinePointerInfo::getConstantPool(),
14557                            false, false, false, Alignment);
14558
14559   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14560   if (VT.is512BitVector())
14561     return Brcst;
14562   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14563 }
14564
14565 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14566                                SelectionDAG &DAG) {
14567   if (Subtarget->hasFp256()) {
14568     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14569     if (Res.getNode())
14570       return Res;
14571   }
14572
14573   return SDValue();
14574 }
14575
14576 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14577                                 SelectionDAG &DAG) {
14578   SDLoc DL(Op);
14579   MVT VT = Op.getSimpleValueType();
14580   SDValue In = Op.getOperand(0);
14581   MVT SVT = In.getSimpleValueType();
14582
14583   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14584     return LowerZERO_EXTEND_AVX512(Op, DAG);
14585
14586   if (Subtarget->hasFp256()) {
14587     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14588     if (Res.getNode())
14589       return Res;
14590   }
14591
14592   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14593          VT.getVectorNumElements() != SVT.getVectorNumElements());
14594   return SDValue();
14595 }
14596
14597 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14598   SDLoc DL(Op);
14599   MVT VT = Op.getSimpleValueType();
14600   SDValue In = Op.getOperand(0);
14601   MVT InVT = In.getSimpleValueType();
14602
14603   if (VT == MVT::i1) {
14604     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14605            "Invalid scalar TRUNCATE operation");
14606     if (InVT.getSizeInBits() >= 32)
14607       return SDValue();
14608     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14609     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14610   }
14611   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14612          "Invalid TRUNCATE operation");
14613
14614   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14615     if (VT.getVectorElementType().getSizeInBits() >=8)
14616       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14617
14618     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14619     unsigned NumElts = InVT.getVectorNumElements();
14620     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14621     if (InVT.getSizeInBits() < 512) {
14622       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14623       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14624       InVT = ExtVT;
14625     }
14626
14627     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14628     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14629     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14630     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14631     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14632                            MachinePointerInfo::getConstantPool(),
14633                            false, false, false, Alignment);
14634     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14635     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14636     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14637   }
14638
14639   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14640     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14641     if (Subtarget->hasInt256()) {
14642       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14643       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14644       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14645                                 ShufMask);
14646       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14647                          DAG.getIntPtrConstant(0));
14648     }
14649
14650     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14651                                DAG.getIntPtrConstant(0));
14652     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14653                                DAG.getIntPtrConstant(2));
14654     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14655     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14656     static const int ShufMask[] = {0, 2, 4, 6};
14657     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14658   }
14659
14660   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14661     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14662     if (Subtarget->hasInt256()) {
14663       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14664
14665       SmallVector<SDValue,32> pshufbMask;
14666       for (unsigned i = 0; i < 2; ++i) {
14667         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14668         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14669         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14673         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14674         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14675         for (unsigned j = 0; j < 8; ++j)
14676           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14677       }
14678       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14679       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14680       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14681
14682       static const int ShufMask[] = {0,  2,  -1,  -1};
14683       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14684                                 &ShufMask[0]);
14685       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14686                        DAG.getIntPtrConstant(0));
14687       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14688     }
14689
14690     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14691                                DAG.getIntPtrConstant(0));
14692
14693     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14694                                DAG.getIntPtrConstant(4));
14695
14696     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14697     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14698
14699     // The PSHUFB mask:
14700     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14701                                    -1, -1, -1, -1, -1, -1, -1, -1};
14702
14703     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14704     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14705     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14706
14707     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14708     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14709
14710     // The MOVLHPS Mask:
14711     static const int ShufMask2[] = {0, 1, 4, 5};
14712     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14713     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14714   }
14715
14716   // Handle truncation of V256 to V128 using shuffles.
14717   if (!VT.is128BitVector() || !InVT.is256BitVector())
14718     return SDValue();
14719
14720   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14721
14722   unsigned NumElems = VT.getVectorNumElements();
14723   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14724
14725   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14726   // Prepare truncation shuffle mask
14727   for (unsigned i = 0; i != NumElems; ++i)
14728     MaskVec[i] = i * 2;
14729   SDValue V = DAG.getVectorShuffle(NVT, DL,
14730                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14731                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14732   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14733                      DAG.getIntPtrConstant(0));
14734 }
14735
14736 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14737                                            SelectionDAG &DAG) const {
14738   assert(!Op.getSimpleValueType().isVector());
14739
14740   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14741     /*IsSigned=*/ true, /*IsReplace=*/ false);
14742   SDValue FIST = Vals.first, StackSlot = Vals.second;
14743   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14744   if (!FIST.getNode()) return Op;
14745
14746   if (StackSlot.getNode())
14747     // Load the result.
14748     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14749                        FIST, StackSlot, MachinePointerInfo(),
14750                        false, false, false, 0);
14751
14752   // The node is the result.
14753   return FIST;
14754 }
14755
14756 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14757                                            SelectionDAG &DAG) const {
14758   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14759     /*IsSigned=*/ false, /*IsReplace=*/ false);
14760   SDValue FIST = Vals.first, StackSlot = Vals.second;
14761   assert(FIST.getNode() && "Unexpected failure");
14762
14763   if (StackSlot.getNode())
14764     // Load the result.
14765     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14766                        FIST, StackSlot, MachinePointerInfo(),
14767                        false, false, false, 0);
14768
14769   // The node is the result.
14770   return FIST;
14771 }
14772
14773 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14774   SDLoc DL(Op);
14775   MVT VT = Op.getSimpleValueType();
14776   SDValue In = Op.getOperand(0);
14777   MVT SVT = In.getSimpleValueType();
14778
14779   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14780
14781   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14782                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14783                                  In, DAG.getUNDEF(SVT)));
14784 }
14785
14786 /// The only differences between FABS and FNEG are the mask and the logic op.
14787 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14788 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14789   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14790          "Wrong opcode for lowering FABS or FNEG.");
14791
14792   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14793
14794   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14795   // into an FNABS. We'll lower the FABS after that if it is still in use.
14796   if (IsFABS)
14797     for (SDNode *User : Op->uses())
14798       if (User->getOpcode() == ISD::FNEG)
14799         return Op;
14800
14801   SDValue Op0 = Op.getOperand(0);
14802   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14803
14804   SDLoc dl(Op);
14805   MVT VT = Op.getSimpleValueType();
14806   // Assume scalar op for initialization; update for vector if needed.
14807   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14808   // generate a 16-byte vector constant and logic op even for the scalar case.
14809   // Using a 16-byte mask allows folding the load of the mask with
14810   // the logic op, so it can save (~4 bytes) on code size.
14811   MVT EltVT = VT;
14812   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14813   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14814   // decide if we should generate a 16-byte constant mask when we only need 4 or
14815   // 8 bytes for the scalar case.
14816   if (VT.isVector()) {
14817     EltVT = VT.getVectorElementType();
14818     NumElts = VT.getVectorNumElements();
14819   }
14820
14821   unsigned EltBits = EltVT.getSizeInBits();
14822   LLVMContext *Context = DAG.getContext();
14823   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14824   APInt MaskElt =
14825     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14826   Constant *C = ConstantInt::get(*Context, MaskElt);
14827   C = ConstantVector::getSplat(NumElts, C);
14828   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14829   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14830   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14831   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14832                              MachinePointerInfo::getConstantPool(),
14833                              false, false, false, Alignment);
14834
14835   if (VT.isVector()) {
14836     // For a vector, cast operands to a vector type, perform the logic op,
14837     // and cast the result back to the original value type.
14838     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14839     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14840     SDValue Operand = IsFNABS ?
14841       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14842       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14843     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14844     return DAG.getNode(ISD::BITCAST, dl, VT,
14845                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14846   }
14847
14848   // If not vector, then scalar.
14849   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14850   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14851   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14852 }
14853
14854 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14855   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14856   LLVMContext *Context = DAG.getContext();
14857   SDValue Op0 = Op.getOperand(0);
14858   SDValue Op1 = Op.getOperand(1);
14859   SDLoc dl(Op);
14860   MVT VT = Op.getSimpleValueType();
14861   MVT SrcVT = Op1.getSimpleValueType();
14862
14863   // If second operand is smaller, extend it first.
14864   if (SrcVT.bitsLT(VT)) {
14865     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14866     SrcVT = VT;
14867   }
14868   // And if it is bigger, shrink it first.
14869   if (SrcVT.bitsGT(VT)) {
14870     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14871     SrcVT = VT;
14872   }
14873
14874   // At this point the operands and the result should have the same
14875   // type, and that won't be f80 since that is not custom lowered.
14876
14877   const fltSemantics &Sem =
14878       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14879   const unsigned SizeInBits = VT.getSizeInBits();
14880
14881   SmallVector<Constant *, 4> CV(
14882       VT == MVT::f64 ? 2 : 4,
14883       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14884
14885   // First, clear all bits but the sign bit from the second operand (sign).
14886   CV[0] = ConstantFP::get(*Context,
14887                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14888   Constant *C = ConstantVector::get(CV);
14889   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14890   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14891                               MachinePointerInfo::getConstantPool(),
14892                               false, false, false, 16);
14893   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14894
14895   // Next, clear the sign bit from the first operand (magnitude).
14896   // If it's a constant, we can clear it here.
14897   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14898     APFloat APF = Op0CN->getValueAPF();
14899     // If the magnitude is a positive zero, the sign bit alone is enough.
14900     if (APF.isPosZero())
14901       return SignBit;
14902     APF.clearSign();
14903     CV[0] = ConstantFP::get(*Context, APF);
14904   } else {
14905     CV[0] = ConstantFP::get(
14906         *Context,
14907         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14908   }
14909   C = ConstantVector::get(CV);
14910   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14911   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14912                             MachinePointerInfo::getConstantPool(),
14913                             false, false, false, 16);
14914   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14915   if (!isa<ConstantFPSDNode>(Op0))
14916     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14917
14918   // OR the magnitude value with the sign bit.
14919   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14920 }
14921
14922 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14923   SDValue N0 = Op.getOperand(0);
14924   SDLoc dl(Op);
14925   MVT VT = Op.getSimpleValueType();
14926
14927   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14928   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14929                                   DAG.getConstant(1, VT));
14930   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14931 }
14932
14933 // Check whether an OR'd tree is PTEST-able.
14934 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14935                                       SelectionDAG &DAG) {
14936   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14937
14938   if (!Subtarget->hasSSE41())
14939     return SDValue();
14940
14941   if (!Op->hasOneUse())
14942     return SDValue();
14943
14944   SDNode *N = Op.getNode();
14945   SDLoc DL(N);
14946
14947   SmallVector<SDValue, 8> Opnds;
14948   DenseMap<SDValue, unsigned> VecInMap;
14949   SmallVector<SDValue, 8> VecIns;
14950   EVT VT = MVT::Other;
14951
14952   // Recognize a special case where a vector is casted into wide integer to
14953   // test all 0s.
14954   Opnds.push_back(N->getOperand(0));
14955   Opnds.push_back(N->getOperand(1));
14956
14957   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14958     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14959     // BFS traverse all OR'd operands.
14960     if (I->getOpcode() == ISD::OR) {
14961       Opnds.push_back(I->getOperand(0));
14962       Opnds.push_back(I->getOperand(1));
14963       // Re-evaluate the number of nodes to be traversed.
14964       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14965       continue;
14966     }
14967
14968     // Quit if a non-EXTRACT_VECTOR_ELT
14969     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14970       return SDValue();
14971
14972     // Quit if without a constant index.
14973     SDValue Idx = I->getOperand(1);
14974     if (!isa<ConstantSDNode>(Idx))
14975       return SDValue();
14976
14977     SDValue ExtractedFromVec = I->getOperand(0);
14978     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14979     if (M == VecInMap.end()) {
14980       VT = ExtractedFromVec.getValueType();
14981       // Quit if not 128/256-bit vector.
14982       if (!VT.is128BitVector() && !VT.is256BitVector())
14983         return SDValue();
14984       // Quit if not the same type.
14985       if (VecInMap.begin() != VecInMap.end() &&
14986           VT != VecInMap.begin()->first.getValueType())
14987         return SDValue();
14988       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14989       VecIns.push_back(ExtractedFromVec);
14990     }
14991     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14992   }
14993
14994   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14995          "Not extracted from 128-/256-bit vector.");
14996
14997   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14998
14999   for (DenseMap<SDValue, unsigned>::const_iterator
15000         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15001     // Quit if not all elements are used.
15002     if (I->second != FullMask)
15003       return SDValue();
15004   }
15005
15006   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15007
15008   // Cast all vectors into TestVT for PTEST.
15009   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15010     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15011
15012   // If more than one full vectors are evaluated, OR them first before PTEST.
15013   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15014     // Each iteration will OR 2 nodes and append the result until there is only
15015     // 1 node left, i.e. the final OR'd value of all vectors.
15016     SDValue LHS = VecIns[Slot];
15017     SDValue RHS = VecIns[Slot + 1];
15018     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15019   }
15020
15021   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15022                      VecIns.back(), VecIns.back());
15023 }
15024
15025 /// \brief return true if \c Op has a use that doesn't just read flags.
15026 static bool hasNonFlagsUse(SDValue Op) {
15027   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15028        ++UI) {
15029     SDNode *User = *UI;
15030     unsigned UOpNo = UI.getOperandNo();
15031     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15032       // Look pass truncate.
15033       UOpNo = User->use_begin().getOperandNo();
15034       User = *User->use_begin();
15035     }
15036
15037     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15038         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15039       return true;
15040   }
15041   return false;
15042 }
15043
15044 /// Emit nodes that will be selected as "test Op0,Op0", or something
15045 /// equivalent.
15046 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15047                                     SelectionDAG &DAG) const {
15048   if (Op.getValueType() == MVT::i1)
15049     // KORTEST instruction should be selected
15050     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15051                        DAG.getConstant(0, Op.getValueType()));
15052
15053   // CF and OF aren't always set the way we want. Determine which
15054   // of these we need.
15055   bool NeedCF = false;
15056   bool NeedOF = false;
15057   switch (X86CC) {
15058   default: break;
15059   case X86::COND_A: case X86::COND_AE:
15060   case X86::COND_B: case X86::COND_BE:
15061     NeedCF = true;
15062     break;
15063   case X86::COND_G: case X86::COND_GE:
15064   case X86::COND_L: case X86::COND_LE:
15065   case X86::COND_O: case X86::COND_NO: {
15066     // Check if we really need to set the
15067     // Overflow flag. If NoSignedWrap is present
15068     // that is not actually needed.
15069     switch (Op->getOpcode()) {
15070     case ISD::ADD:
15071     case ISD::SUB:
15072     case ISD::MUL:
15073     case ISD::SHL: {
15074       const BinaryWithFlagsSDNode *BinNode =
15075           cast<BinaryWithFlagsSDNode>(Op.getNode());
15076       if (BinNode->hasNoSignedWrap())
15077         break;
15078     }
15079     default:
15080       NeedOF = true;
15081       break;
15082     }
15083     break;
15084   }
15085   }
15086   // See if we can use the EFLAGS value from the operand instead of
15087   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15088   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15089   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15090     // Emit a CMP with 0, which is the TEST pattern.
15091     //if (Op.getValueType() == MVT::i1)
15092     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15093     //                     DAG.getConstant(0, MVT::i1));
15094     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15095                        DAG.getConstant(0, Op.getValueType()));
15096   }
15097   unsigned Opcode = 0;
15098   unsigned NumOperands = 0;
15099
15100   // Truncate operations may prevent the merge of the SETCC instruction
15101   // and the arithmetic instruction before it. Attempt to truncate the operands
15102   // of the arithmetic instruction and use a reduced bit-width instruction.
15103   bool NeedTruncation = false;
15104   SDValue ArithOp = Op;
15105   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15106     SDValue Arith = Op->getOperand(0);
15107     // Both the trunc and the arithmetic op need to have one user each.
15108     if (Arith->hasOneUse())
15109       switch (Arith.getOpcode()) {
15110         default: break;
15111         case ISD::ADD:
15112         case ISD::SUB:
15113         case ISD::AND:
15114         case ISD::OR:
15115         case ISD::XOR: {
15116           NeedTruncation = true;
15117           ArithOp = Arith;
15118         }
15119       }
15120   }
15121
15122   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15123   // which may be the result of a CAST.  We use the variable 'Op', which is the
15124   // non-casted variable when we check for possible users.
15125   switch (ArithOp.getOpcode()) {
15126   case ISD::ADD:
15127     // Due to an isel shortcoming, be conservative if this add is likely to be
15128     // selected as part of a load-modify-store instruction. When the root node
15129     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15130     // uses of other nodes in the match, such as the ADD in this case. This
15131     // leads to the ADD being left around and reselected, with the result being
15132     // two adds in the output.  Alas, even if none our users are stores, that
15133     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15134     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15135     // climbing the DAG back to the root, and it doesn't seem to be worth the
15136     // effort.
15137     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15138          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15139       if (UI->getOpcode() != ISD::CopyToReg &&
15140           UI->getOpcode() != ISD::SETCC &&
15141           UI->getOpcode() != ISD::STORE)
15142         goto default_case;
15143
15144     if (ConstantSDNode *C =
15145         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15146       // An add of one will be selected as an INC.
15147       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15148         Opcode = X86ISD::INC;
15149         NumOperands = 1;
15150         break;
15151       }
15152
15153       // An add of negative one (subtract of one) will be selected as a DEC.
15154       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15155         Opcode = X86ISD::DEC;
15156         NumOperands = 1;
15157         break;
15158       }
15159     }
15160
15161     // Otherwise use a regular EFLAGS-setting add.
15162     Opcode = X86ISD::ADD;
15163     NumOperands = 2;
15164     break;
15165   case ISD::SHL:
15166   case ISD::SRL:
15167     // If we have a constant logical shift that's only used in a comparison
15168     // against zero turn it into an equivalent AND. This allows turning it into
15169     // a TEST instruction later.
15170     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15171         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15172       EVT VT = Op.getValueType();
15173       unsigned BitWidth = VT.getSizeInBits();
15174       unsigned ShAmt = Op->getConstantOperandVal(1);
15175       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15176         break;
15177       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15178                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15179                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15180       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15181         break;
15182       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15183                                 DAG.getConstant(Mask, VT));
15184       DAG.ReplaceAllUsesWith(Op, New);
15185       Op = New;
15186     }
15187     break;
15188
15189   case ISD::AND:
15190     // If the primary and result isn't used, don't bother using X86ISD::AND,
15191     // because a TEST instruction will be better.
15192     if (!hasNonFlagsUse(Op))
15193       break;
15194     // FALL THROUGH
15195   case ISD::SUB:
15196   case ISD::OR:
15197   case ISD::XOR:
15198     // Due to the ISEL shortcoming noted above, be conservative if this op is
15199     // likely to be selected as part of a load-modify-store instruction.
15200     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15201            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15202       if (UI->getOpcode() == ISD::STORE)
15203         goto default_case;
15204
15205     // Otherwise use a regular EFLAGS-setting instruction.
15206     switch (ArithOp.getOpcode()) {
15207     default: llvm_unreachable("unexpected operator!");
15208     case ISD::SUB: Opcode = X86ISD::SUB; break;
15209     case ISD::XOR: Opcode = X86ISD::XOR; break;
15210     case ISD::AND: Opcode = X86ISD::AND; break;
15211     case ISD::OR: {
15212       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15213         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15214         if (EFLAGS.getNode())
15215           return EFLAGS;
15216       }
15217       Opcode = X86ISD::OR;
15218       break;
15219     }
15220     }
15221
15222     NumOperands = 2;
15223     break;
15224   case X86ISD::ADD:
15225   case X86ISD::SUB:
15226   case X86ISD::INC:
15227   case X86ISD::DEC:
15228   case X86ISD::OR:
15229   case X86ISD::XOR:
15230   case X86ISD::AND:
15231     return SDValue(Op.getNode(), 1);
15232   default:
15233   default_case:
15234     break;
15235   }
15236
15237   // If we found that truncation is beneficial, perform the truncation and
15238   // update 'Op'.
15239   if (NeedTruncation) {
15240     EVT VT = Op.getValueType();
15241     SDValue WideVal = Op->getOperand(0);
15242     EVT WideVT = WideVal.getValueType();
15243     unsigned ConvertedOp = 0;
15244     // Use a target machine opcode to prevent further DAGCombine
15245     // optimizations that may separate the arithmetic operations
15246     // from the setcc node.
15247     switch (WideVal.getOpcode()) {
15248       default: break;
15249       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15250       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15251       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15252       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15253       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15254     }
15255
15256     if (ConvertedOp) {
15257       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15258       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15259         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15260         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15261         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15262       }
15263     }
15264   }
15265
15266   if (Opcode == 0)
15267     // Emit a CMP with 0, which is the TEST pattern.
15268     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15269                        DAG.getConstant(0, Op.getValueType()));
15270
15271   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15272   SmallVector<SDValue, 4> Ops;
15273   for (unsigned i = 0; i != NumOperands; ++i)
15274     Ops.push_back(Op.getOperand(i));
15275
15276   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15277   DAG.ReplaceAllUsesWith(Op, New);
15278   return SDValue(New.getNode(), 1);
15279 }
15280
15281 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15282 /// equivalent.
15283 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15284                                    SDLoc dl, SelectionDAG &DAG) const {
15285   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15286     if (C->getAPIntValue() == 0)
15287       return EmitTest(Op0, X86CC, dl, DAG);
15288
15289      if (Op0.getValueType() == MVT::i1)
15290        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15291   }
15292
15293   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15294        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15295     // Do the comparison at i32 if it's smaller, besides the Atom case.
15296     // This avoids subregister aliasing issues. Keep the smaller reference
15297     // if we're optimizing for size, however, as that'll allow better folding
15298     // of memory operations.
15299     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15300         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15301              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15302         !Subtarget->isAtom()) {
15303       unsigned ExtendOp =
15304           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15305       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15306       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15307     }
15308     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15309     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15310     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15311                               Op0, Op1);
15312     return SDValue(Sub.getNode(), 1);
15313   }
15314   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15315 }
15316
15317 /// Convert a comparison if required by the subtarget.
15318 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15319                                                  SelectionDAG &DAG) const {
15320   // If the subtarget does not support the FUCOMI instruction, floating-point
15321   // comparisons have to be converted.
15322   if (Subtarget->hasCMov() ||
15323       Cmp.getOpcode() != X86ISD::CMP ||
15324       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15325       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15326     return Cmp;
15327
15328   // The instruction selector will select an FUCOM instruction instead of
15329   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15330   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15331   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15332   SDLoc dl(Cmp);
15333   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15334   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15335   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15336                             DAG.getConstant(8, MVT::i8));
15337   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15338   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15339 }
15340
15341 /// The minimum architected relative accuracy is 2^-12. We need one
15342 /// Newton-Raphson step to have a good float result (24 bits of precision).
15343 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15344                                             DAGCombinerInfo &DCI,
15345                                             unsigned &RefinementSteps,
15346                                             bool &UseOneConstNR) const {
15347   // FIXME: We should use instruction latency models to calculate the cost of
15348   // each potential sequence, but this is very hard to do reliably because
15349   // at least Intel's Core* chips have variable timing based on the number of
15350   // significant digits in the divisor and/or sqrt operand.
15351   if (!Subtarget->useSqrtEst())
15352     return SDValue();
15353
15354   EVT VT = Op.getValueType();
15355
15356   // SSE1 has rsqrtss and rsqrtps.
15357   // TODO: Add support for AVX512 (v16f32).
15358   // It is likely not profitable to do this for f64 because a double-precision
15359   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15360   // instructions: convert to single, rsqrtss, convert back to double, refine
15361   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15362   // along with FMA, this could be a throughput win.
15363   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15364       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15365     RefinementSteps = 1;
15366     UseOneConstNR = false;
15367     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15368   }
15369   return SDValue();
15370 }
15371
15372 /// The minimum architected relative accuracy is 2^-12. We need one
15373 /// Newton-Raphson step to have a good float result (24 bits of precision).
15374 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15375                                             DAGCombinerInfo &DCI,
15376                                             unsigned &RefinementSteps) const {
15377   // FIXME: We should use instruction latency models to calculate the cost of
15378   // each potential sequence, but this is very hard to do reliably because
15379   // at least Intel's Core* chips have variable timing based on the number of
15380   // significant digits in the divisor.
15381   if (!Subtarget->useReciprocalEst())
15382     return SDValue();
15383
15384   EVT VT = Op.getValueType();
15385
15386   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15387   // TODO: Add support for AVX512 (v16f32).
15388   // It is likely not profitable to do this for f64 because a double-precision
15389   // reciprocal estimate with refinement on x86 prior to FMA requires
15390   // 15 instructions: convert to single, rcpss, convert back to double, refine
15391   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15392   // along with FMA, this could be a throughput win.
15393   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15394       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15395     RefinementSteps = ReciprocalEstimateRefinementSteps;
15396     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15397   }
15398   return SDValue();
15399 }
15400
15401 static bool isAllOnes(SDValue V) {
15402   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15403   return C && C->isAllOnesValue();
15404 }
15405
15406 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15407 /// if it's possible.
15408 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15409                                      SDLoc dl, SelectionDAG &DAG) const {
15410   SDValue Op0 = And.getOperand(0);
15411   SDValue Op1 = And.getOperand(1);
15412   if (Op0.getOpcode() == ISD::TRUNCATE)
15413     Op0 = Op0.getOperand(0);
15414   if (Op1.getOpcode() == ISD::TRUNCATE)
15415     Op1 = Op1.getOperand(0);
15416
15417   SDValue LHS, RHS;
15418   if (Op1.getOpcode() == ISD::SHL)
15419     std::swap(Op0, Op1);
15420   if (Op0.getOpcode() == ISD::SHL) {
15421     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15422       if (And00C->getZExtValue() == 1) {
15423         // If we looked past a truncate, check that it's only truncating away
15424         // known zeros.
15425         unsigned BitWidth = Op0.getValueSizeInBits();
15426         unsigned AndBitWidth = And.getValueSizeInBits();
15427         if (BitWidth > AndBitWidth) {
15428           APInt Zeros, Ones;
15429           DAG.computeKnownBits(Op0, Zeros, Ones);
15430           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15431             return SDValue();
15432         }
15433         LHS = Op1;
15434         RHS = Op0.getOperand(1);
15435       }
15436   } else if (Op1.getOpcode() == ISD::Constant) {
15437     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15438     uint64_t AndRHSVal = AndRHS->getZExtValue();
15439     SDValue AndLHS = Op0;
15440
15441     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15442       LHS = AndLHS.getOperand(0);
15443       RHS = AndLHS.getOperand(1);
15444     }
15445
15446     // Use BT if the immediate can't be encoded in a TEST instruction.
15447     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15448       LHS = AndLHS;
15449       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15450     }
15451   }
15452
15453   if (LHS.getNode()) {
15454     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15455     // instruction.  Since the shift amount is in-range-or-undefined, we know
15456     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15457     // the encoding for the i16 version is larger than the i32 version.
15458     // Also promote i16 to i32 for performance / code size reason.
15459     if (LHS.getValueType() == MVT::i8 ||
15460         LHS.getValueType() == MVT::i16)
15461       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15462
15463     // If the operand types disagree, extend the shift amount to match.  Since
15464     // BT ignores high bits (like shifts) we can use anyextend.
15465     if (LHS.getValueType() != RHS.getValueType())
15466       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15467
15468     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15469     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15470     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15471                        DAG.getConstant(Cond, MVT::i8), BT);
15472   }
15473
15474   return SDValue();
15475 }
15476
15477 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15478 /// mask CMPs.
15479 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15480                               SDValue &Op1) {
15481   unsigned SSECC;
15482   bool Swap = false;
15483
15484   // SSE Condition code mapping:
15485   //  0 - EQ
15486   //  1 - LT
15487   //  2 - LE
15488   //  3 - UNORD
15489   //  4 - NEQ
15490   //  5 - NLT
15491   //  6 - NLE
15492   //  7 - ORD
15493   switch (SetCCOpcode) {
15494   default: llvm_unreachable("Unexpected SETCC condition");
15495   case ISD::SETOEQ:
15496   case ISD::SETEQ:  SSECC = 0; break;
15497   case ISD::SETOGT:
15498   case ISD::SETGT:  Swap = true; // Fallthrough
15499   case ISD::SETLT:
15500   case ISD::SETOLT: SSECC = 1; break;
15501   case ISD::SETOGE:
15502   case ISD::SETGE:  Swap = true; // Fallthrough
15503   case ISD::SETLE:
15504   case ISD::SETOLE: SSECC = 2; break;
15505   case ISD::SETUO:  SSECC = 3; break;
15506   case ISD::SETUNE:
15507   case ISD::SETNE:  SSECC = 4; break;
15508   case ISD::SETULE: Swap = true; // Fallthrough
15509   case ISD::SETUGE: SSECC = 5; break;
15510   case ISD::SETULT: Swap = true; // Fallthrough
15511   case ISD::SETUGT: SSECC = 6; break;
15512   case ISD::SETO:   SSECC = 7; break;
15513   case ISD::SETUEQ:
15514   case ISD::SETONE: SSECC = 8; break;
15515   }
15516   if (Swap)
15517     std::swap(Op0, Op1);
15518
15519   return SSECC;
15520 }
15521
15522 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15523 // ones, and then concatenate the result back.
15524 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15525   MVT VT = Op.getSimpleValueType();
15526
15527   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15528          "Unsupported value type for operation");
15529
15530   unsigned NumElems = VT.getVectorNumElements();
15531   SDLoc dl(Op);
15532   SDValue CC = Op.getOperand(2);
15533
15534   // Extract the LHS vectors
15535   SDValue LHS = Op.getOperand(0);
15536   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15537   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15538
15539   // Extract the RHS vectors
15540   SDValue RHS = Op.getOperand(1);
15541   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15542   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15543
15544   // Issue the operation on the smaller types and concatenate the result back
15545   MVT EltVT = VT.getVectorElementType();
15546   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15547   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15548                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15549                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15550 }
15551
15552 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15553                                      const X86Subtarget *Subtarget) {
15554   SDValue Op0 = Op.getOperand(0);
15555   SDValue Op1 = Op.getOperand(1);
15556   SDValue CC = Op.getOperand(2);
15557   MVT VT = Op.getSimpleValueType();
15558   SDLoc dl(Op);
15559
15560   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15561          Op.getValueType().getScalarType() == MVT::i1 &&
15562          "Cannot set masked compare for this operation");
15563
15564   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15565   unsigned  Opc = 0;
15566   bool Unsigned = false;
15567   bool Swap = false;
15568   unsigned SSECC;
15569   switch (SetCCOpcode) {
15570   default: llvm_unreachable("Unexpected SETCC condition");
15571   case ISD::SETNE:  SSECC = 4; break;
15572   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15573   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15574   case ISD::SETLT:  Swap = true; //fall-through
15575   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15576   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15577   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15578   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15579   case ISD::SETULE: Unsigned = true; //fall-through
15580   case ISD::SETLE:  SSECC = 2; break;
15581   }
15582
15583   if (Swap)
15584     std::swap(Op0, Op1);
15585   if (Opc)
15586     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15587   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15588   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15589                      DAG.getConstant(SSECC, MVT::i8));
15590 }
15591
15592 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15593 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15594 /// return an empty value.
15595 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15596 {
15597   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15598   if (!BV)
15599     return SDValue();
15600
15601   MVT VT = Op1.getSimpleValueType();
15602   MVT EVT = VT.getVectorElementType();
15603   unsigned n = VT.getVectorNumElements();
15604   SmallVector<SDValue, 8> ULTOp1;
15605
15606   for (unsigned i = 0; i < n; ++i) {
15607     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15608     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15609       return SDValue();
15610
15611     // Avoid underflow.
15612     APInt Val = Elt->getAPIntValue();
15613     if (Val == 0)
15614       return SDValue();
15615
15616     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15617   }
15618
15619   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15620 }
15621
/// Lower a vector ISD::SETCC node.
///
/// Floating-point compares become X86ISD::CMPP (or X86ISD::CMPM when AVX-512
/// is available and the result has i1 elements); SETUEQ and SETONE are emitted
/// as two compares joined by OR / AND.
///
/// Integer compares are handled in this order:
///  * 256-bit results without Int256 support are split by Lower256IntVSETCC;
///  * with AVX-512, suitable cases go to LowerIntVSETCC_AVX512, and i1-mask
///    results on 8/16-bit elements are computed in the operand type and then
///    truncated;
///  * otherwise the compare is built from PCMPEQ / PCMPGT using operand
///    swapping, sign-bit flipping and result inversion as required, or from
///    UMIN / UMAX / SUBUS followed by a PCMPEQ.
15622 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15623                            SelectionDAG &DAG) {
15624   SDValue Op0 = Op.getOperand(0);
15625   SDValue Op1 = Op.getOperand(1);
15626   SDValue CC = Op.getOperand(2);
15627   MVT VT = Op.getSimpleValueType();
15628   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15629   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15630   SDLoc dl(Op);
15631
15632   if (isFP) {
15633 #ifndef NDEBUG
15634     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15635     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15636 #endif
15637
          // Note: translateX86FSETCC may swap Op0 and Op1 in place.
15638     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15639     unsigned Opc = X86ISD::CMPP;
15640     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15641       assert(VT.getVectorNumElements() <= 16);
15642       Opc = X86ISD::CMPM;
15643     }
15644     // In the two special cases we can't handle, emit two comparisons.
15645     if (SSECC == 8) {
15646       unsigned CC0, CC1;
15647       unsigned CombineOpc;
            // Using the SSE predicate numbering listed in translateX86FSETCC:
            //   SETUEQ = UNORD (3) | EQ (0),  SETONE = ORD (7) & NEQ (4).
15648       if (SetCCOpcode == ISD::SETUEQ) {
15649         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15650       } else {
15651         assert(SetCCOpcode == ISD::SETONE);
15652         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15653       }
15654
15655       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15656                                  DAG.getConstant(CC0, MVT::i8));
15657       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15658                                  DAG.getConstant(CC1, MVT::i8));
15659       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15660     }
15661     // Handle all other FP comparisons here.
15662     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15663                        DAG.getConstant(SSECC, MVT::i8));
15664   }
15665
15666   // Break 256-bit integer vector compare into smaller ones.
15667   if (VT.is256BitVector() && !Subtarget->hasInt256())
15668     return Lower256IntVSETCC(Op, DAG);
15669
15670   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15671   EVT OpVT = Op1.getValueType();
15672   if (Subtarget->hasAVX512()) {
15673     if (Op1.getValueType().is512BitVector() ||
15674         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15675         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15676       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15677
15678     // In AVX-512 architecture setcc returns mask with i1 elements,
15679     // But there is no compare instruction for i8 and i16 elements in KNL.
15680     // We are not talking about 512-bit operands in this case, these
15681     // types are illegal.
15682     if (MaskResult &&
15683         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15684          OpVT.getVectorElementType().getSizeInBits() >= 8))
15685       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15686                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15687   }
15688
15689   // We are handling one of the integer comparisons here.  Since SSE only has
15690   // GT and EQ comparisons for integer, swapping operands and multiple
15691   // operations may be required for some comparisons.
15692   unsigned Opc;
15693   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15694   bool Subus = false;
15695
        // Each case below that has no 'break' deliberately falls through into
        // the following case, which supplies the opcode and remaining flags.
15696   switch (SetCCOpcode) {
15697   default: llvm_unreachable("Unexpected SETCC condition");
15698   case ISD::SETNE:  Invert = true;
15699   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15700   case ISD::SETLT:  Swap = true;
15701   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15702   case ISD::SETGE:  Swap = true;
15703   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15704                     Invert = true; break;
15705   case ISD::SETULT: Swap = true;
15706   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15707                     FlipSigns = true; break;
15708   case ISD::SETUGE: Swap = true;
15709   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15710                     FlipSigns = true; Invert = true; break;
15711   }
15712
15713   // Special case: Use min/max operations for SETULE/SETUGE
15714   MVT VET = VT.getVectorElementType();
        // NOTE(review): the i8..i32 range test compares MVT values with >= / <=,
        // so it presumably relies on the declaration order of the MVT
        // enumerators -- confirm.
15715   bool hasMinMax =
15716        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15717     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15718
15719   if (hasMinMax) {
15720     switch (SetCCOpcode) {
15721     default: break;
15722     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15723     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15724     }
15725
          // The min/max form needs none of the adjustments chosen by the first
          // switch, so clear them.
15726     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15727   }
15728
15729   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15730   if (!MinMax && hasSubus) {
15731     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15732     // Op0 u<= Op1:
15733     //   t = psubus Op0, Op1
15734     //   pcmpeq t, <0..0>
15735     switch (SetCCOpcode) {
15736     default: break;
15737     case ISD::SETULT: {
15738       // If the comparison is against a constant we can turn this into a
15739       // setule.  With psubus, setule does not require a swap.  This is
15740       // beneficial because the constant in the register is no longer
15741       // destructed as the destination so it can be hoisted out of a loop.
15742       // Only do this pre-AVX since vpcmp* is no longer destructive.
15743       if (Subtarget->hasAVX())
15744         break;
15745       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15746       if (ULEOp1.getNode()) {
15747         Op1 = ULEOp1;
15748         Subus = true; Invert = false; Swap = false;
15749       }
15750       break;
15751     }
15752     // Psubus is better than flip-sign because it requires no inversion.
15753     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15754     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15755     }
15756
15757     if (Subus) {
15758       Opc = X86ISD::SUBUS;
15759       FlipSigns = false;
15760     }
15761   }
15762
15763   if (Swap)
15764     std::swap(Op0, Op1);
15765
15766   // Check that the operation in question is available (most are plain SSE2,
15767   // but PCMPGTQ and PCMPEQQ have different requirements).
15768   if (VT == MVT::v2i64) {
15769     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15770       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15771
15772       // First cast everything to the right type.
15773       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15774       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15775
15776       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15777       // bits of the inputs before performing those operations. The lower
15778       // compare is always unsigned.
            // For a signed 64-bit compare only v4i32 elements 0 and 2 (the ones
            // MaskLo selects below) get their sign bit flipped; for an unsigned
            // compare all four elements do.
15779       SDValue SB;
15780       if (FlipSigns) {
15781         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15782       } else {
15783         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15784         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15785         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15786                          Sign, Zero, Sign, Zero);
15787       }
15788       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15789       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15790
15791       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15792       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15793       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15794
15795       // Create masks for only the low parts/high parts of the 64 bit integers.
15796       static const int MaskHi[] = { 1, 1, 3, 3 };
15797       static const int MaskLo[] = { 0, 0, 2, 2 };
15798       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15799       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15800       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15801
15802       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15803       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15804
15805       if (Invert)
15806         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15807
15808       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15809     }
15810
15811     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15812       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15813       // pcmpeqd + pshufd + pand.
15814       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15815
15816       // First cast everything to the right type.
15817       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15818       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15819
15820       // Do the compare.
15821       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15822
15823       // Make sure the lower and upper halves are both all-ones.
            // The shuffle swaps the two 32-bit halves of each 64-bit element, so
            // the AND is all-ones only where both halves compared equal.
15824       static const int Mask[] = { 1, 0, 3, 2 };
15825       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15826       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15827
15828       if (Invert)
15829         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15830
15831       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15832     }
15833   }
15834
15835   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15836   // bits of the inputs before performing those operations.
15837   if (FlipSigns) {
15838     EVT EltVT = VT.getVectorElementType();
15839     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15840     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15841     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15842   }
15843
15844   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15845
15846   // If the logical-not of the result is required, perform that now.
15847   if (Invert)
15848     Result = DAG.getNOT(dl, Result, VT);
15849
        // Min/max form: Op0 u<= Op1 iff umin(Op0, Op1) == Op0, and
        // Op0 u>= Op1 iff umax(Op0, Op1) == Op0.
15850   if (MinMax)
15851     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15852
        // Subus form: the saturating difference is compared against zero.
15853   if (Subus)
15854     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15855                          getZeroVector(VT, Subtarget, DAG, dl));
15856
15857   return Result;
15858 }
15859
/// Lower an ISD::SETCC node.
///
/// Vector results are delegated to LowerVSETCC. For scalar results the
/// following are tried in order:
///  1. an 'and' compared (==/!=) against zero is turned into a bit test via
///     LowerToBT;
///  2. an existing X86ISD::SETCC compared (==/!=) against 0 or 1 is reused, or
///     re-emitted with the opposite condition code;
///  3. an i1 value compared (==/!=) against 1 is rewritten as a compare
///     against 0 with the inverse condition;
///  4. otherwise the condition is translated with TranslateX86CC and emitted
///     as EmitCmp + X86ISD::SETCC.
/// An empty SDValue is returned when TranslateX86CC yields COND_INVALID.
/// Newly built i8 X86ISD::SETCC results are truncated to i1 when the SETCC
/// type is i1.
15860 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15861
15862   MVT VT = Op.getSimpleValueType();
15863
15864   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15865
15866   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15867          && "SetCC type must be 8-bit or 1-bit integer");
15868   SDValue Op0 = Op.getOperand(0);
15869   SDValue Op1 = Op.getOperand(1);
15870   SDLoc dl(Op);
15871   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15872
15873   // Optimize to BT if possible.
15874   // Lower (X & (1 << N)) == 0 to BT(X, N).
15875   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15876   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15877   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15878       Op1.getOpcode() == ISD::Constant &&
15879       cast<ConstantSDNode>(Op1)->isNullValue() &&
15880       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15881     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15882     if (NewSetCC.getNode()) {
15883       if (VT == MVT::i1)
15884         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15885       return NewSetCC;
15886     }
15887   }
15888
15889   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15890   // these.
15891   if (Op1.getOpcode() == ISD::Constant &&
15892       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15893        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15894       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15895
15896     // If the input is a setcc, then reuse the input setcc or use a new one with
15897     // the inverted condition.
15898     if (Op0.getOpcode() == X86ISD::SETCC) {
15899       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
            // (X != 0) and (X == 1) keep the original condition;
            // (X == 0) and (X != 1) need the opposite one.
15900       bool Invert = (CC == ISD::SETNE) ^
15901         cast<ConstantSDNode>(Op1)->isNullValue();
            // NOTE(review): Op0 is returned as-is here, without the i1 truncate
            // that the other return paths apply when VT == MVT::i1 -- confirm
            // this is intended.
15902       if (!Invert)
15903         return Op0;
15904
15905       CCode = X86::GetOppositeBranchCondition(CCode);
15906       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15907                                   DAG.getConstant(CCode, MVT::i8),
15908                                   Op0.getOperand(1));
15909       if (VT == MVT::i1)
15910         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15911       return SetCC;
15912     }
15913   }
        // An i1 value compared against 1 is re-expressed as a compare against 0
        // using the inverse condition code.
15914   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15915       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15916       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15917
15918     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15919     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15920   }
15921
        // General case. Op0 and Op1 are passed to TranslateX86CC before the
        // compare is emitted from them.
15922   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15923   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15924   if (X86CC == X86::COND_INVALID)
15925     return SDValue();
15926
15927   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15928   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15929   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15930                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15931   if (VT == MVT::i1)
15932     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15933   return SetCC;
15934 }
15935
15936 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15937 static bool isX86LogicalCmp(SDValue Op) {
15938   unsigned Opc = Op.getNode()->getOpcode();
15939   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15940       Opc == X86ISD::SAHF)
15941     return true;
15942   if (Op.getResNo() == 1 &&
15943       (Opc == X86ISD::ADD ||
15944        Opc == X86ISD::SUB ||
15945        Opc == X86ISD::ADC ||
15946        Opc == X86ISD::SBB ||
15947        Opc == X86ISD::SMUL ||
15948        Opc == X86ISD::UMUL ||
15949        Opc == X86ISD::INC ||
15950        Opc == X86ISD::DEC ||
15951        Opc == X86ISD::OR ||
15952        Opc == X86ISD::XOR ||
15953        Opc == X86ISD::AND))
15954     return true;
15955
15956   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15957     return true;
15958
15959   return false;
15960 }
15961
15962 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15963   if (V.getOpcode() != ISD::TRUNCATE)
15964     return false;
15965
15966   SDValue VOp0 = V.getOperand(0);
15967   unsigned InBits = VOp0.getValueSizeInBits();
15968   unsigned Bits = V.getValueSizeInBits();
15969   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15970 }
15971
15972 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15973   bool addTest = true;
15974   SDValue Cond  = Op.getOperand(0);
15975   SDValue Op1 = Op.getOperand(1);
15976   SDValue Op2 = Op.getOperand(2);
15977   SDLoc DL(Op);
15978   EVT VT = Op1.getValueType();
15979   SDValue CC;
15980
15981   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15982   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15983   // sequence later on.
15984   if (Cond.getOpcode() == ISD::SETCC &&
15985       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15986        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15987       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15988     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15989     int SSECC = translateX86FSETCC(
15990         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15991
15992     if (SSECC != 8) {
15993       if (Subtarget->hasAVX512()) {
15994         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15995                                   DAG.getConstant(SSECC, MVT::i8));
15996         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15997       }
15998       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15999                                 DAG.getConstant(SSECC, MVT::i8));
16000       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16001       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16002       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16003     }
16004   }
16005
16006   if (Cond.getOpcode() == ISD::SETCC) {
16007     SDValue NewCond = LowerSETCC(Cond, DAG);
16008     if (NewCond.getNode())
16009       Cond = NewCond;
16010   }
16011
16012   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16013   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16014   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16015   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16016   if (Cond.getOpcode() == X86ISD::SETCC &&
16017       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16018       isZero(Cond.getOperand(1).getOperand(1))) {
16019     SDValue Cmp = Cond.getOperand(1);
16020
16021     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16022
16023     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16024         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16025       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16026
16027       SDValue CmpOp0 = Cmp.getOperand(0);
16028       // Apply further optimizations for special cases
16029       // (select (x != 0), -1, 0) -> neg & sbb
16030       // (select (x == 0), 0, -1) -> neg & sbb
16031       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16032         if (YC->isNullValue() &&
16033             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16034           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16035           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16036                                     DAG.getConstant(0, CmpOp0.getValueType()),
16037                                     CmpOp0);
16038           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16039                                     DAG.getConstant(X86::COND_B, MVT::i8),
16040                                     SDValue(Neg.getNode(), 1));
16041           return Res;
16042         }
16043
16044       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16045                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16046       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16047
16048       SDValue Res =   // Res = 0 or -1.
16049         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16050                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16051
16052       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16053         Res = DAG.getNOT(DL, Res, Res.getValueType());
16054
16055       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16056       if (!N2C || !N2C->isNullValue())
16057         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16058       return Res;
16059     }
16060   }
16061
16062   // Look past (and (setcc_carry (cmp ...)), 1).
16063   if (Cond.getOpcode() == ISD::AND &&
16064       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16065     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16066     if (C && C->getAPIntValue() == 1)
16067       Cond = Cond.getOperand(0);
16068   }
16069
16070   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16071   // setting operand in place of the X86ISD::SETCC.
16072   unsigned CondOpcode = Cond.getOpcode();
16073   if (CondOpcode == X86ISD::SETCC ||
16074       CondOpcode == X86ISD::SETCC_CARRY) {
16075     CC = Cond.getOperand(0);
16076
16077     SDValue Cmp = Cond.getOperand(1);
16078     unsigned Opc = Cmp.getOpcode();
16079     MVT VT = Op.getSimpleValueType();
16080
16081     bool IllegalFPCMov = false;
16082     if (VT.isFloatingPoint() && !VT.isVector() &&
16083         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16084       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16085
16086     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16087         Opc == X86ISD::BT) { // FIXME
16088       Cond = Cmp;
16089       addTest = false;
16090     }
16091   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16092              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16093              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16094               Cond.getOperand(0).getValueType() != MVT::i8)) {
16095     SDValue LHS = Cond.getOperand(0);
16096     SDValue RHS = Cond.getOperand(1);
16097     unsigned X86Opcode;
16098     unsigned X86Cond;
16099     SDVTList VTs;
16100     switch (CondOpcode) {
16101     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16102     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16103     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16104     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16105     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16106     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16107     default: llvm_unreachable("unexpected overflowing operator");
16108     }
16109     if (CondOpcode == ISD::UMULO)
16110       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16111                           MVT::i32);
16112     else
16113       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16114
16115     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16116
16117     if (CondOpcode == ISD::UMULO)
16118       Cond = X86Op.getValue(2);
16119     else
16120       Cond = X86Op.getValue(1);
16121
16122     CC = DAG.getConstant(X86Cond, MVT::i8);
16123     addTest = false;
16124   }
16125
16126   if (addTest) {
16127     // Look pass the truncate if the high bits are known zero.
16128     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16129         Cond = Cond.getOperand(0);
16130
16131     // We know the result of AND is compared against zero. Try to match
16132     // it to BT.
16133     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16134       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16135       if (NewSetCC.getNode()) {
16136         CC = NewSetCC.getOperand(0);
16137         Cond = NewSetCC.getOperand(1);
16138         addTest = false;
16139       }
16140     }
16141   }
16142
16143   if (addTest) {
16144     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16145     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16146   }
16147
16148   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16149   // a <  b ?  0 : -1 -> RES = setcc_carry
16150   // a >= b ? -1 :  0 -> RES = setcc_carry
16151   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16152   if (Cond.getOpcode() == X86ISD::SUB) {
16153     Cond = ConvertCmpIfNecessary(Cond, DAG);
16154     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16155
16156     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16157         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16158       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16159                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16160       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16161         return DAG.getNOT(DL, Res, Res.getValueType());
16162       return Res;
16163     }
16164   }
16165
16166   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16167   // widen the cmov and push the truncate through. This avoids introducing a new
16168   // branch during isel and doesn't add any extensions.
16169   if (Op.getValueType() == MVT::i8 &&
16170       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16171     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16172     if (T1.getValueType() == T2.getValueType() &&
16173         // Blacklist CopyFromReg to avoid partial register stalls.
16174         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16175       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16176       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16177       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16178     }
16179   }
16180
16181   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16182   // condition is true.
16183   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16184   SDValue Ops[] = { Op2, Op1, CC, Cond };
16185   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16186 }
16187
16188 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16189                                        SelectionDAG &DAG) {
16190   MVT VT = Op->getSimpleValueType(0);
16191   SDValue In = Op->getOperand(0);
16192   MVT InVT = In.getSimpleValueType();
16193   MVT VTElt = VT.getVectorElementType();
16194   MVT InVTElt = InVT.getVectorElementType();
16195   SDLoc dl(Op);
16196
16197   // SKX processor
16198   if ((InVTElt == MVT::i1) &&
16199       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16200         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16201
16202        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16203         VTElt.getSizeInBits() <= 16)) ||
16204
16205        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16206         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16207
16208        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16209         VTElt.getSizeInBits() >= 32))))
16210     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16211
16212   unsigned int NumElts = VT.getVectorNumElements();
16213
16214   if (NumElts != 8 && NumElts != 16)
16215     return SDValue();
16216
16217   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16218     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16219       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16220     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16221   }
16222
16223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16224   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16225
16226   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16227   Constant *C = ConstantInt::get(*DAG.getContext(),
16228     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16229
16230   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16231   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16232   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16233                           MachinePointerInfo::getConstantPool(),
16234                           false, false, false, Alignment);
16235   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16236   if (VT.is512BitVector())
16237     return Brcst;
16238   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16239 }
16240
16241 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16242                                 SelectionDAG &DAG) {
16243   MVT VT = Op->getSimpleValueType(0);
16244   SDValue In = Op->getOperand(0);
16245   MVT InVT = In.getSimpleValueType();
16246   SDLoc dl(Op);
16247
16248   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16249     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16250
16251   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16252       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16253       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16254     return SDValue();
16255
16256   if (Subtarget->hasInt256())
16257     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16258
16259   // Optimize vectors in AVX mode
16260   // Sign extend  v8i16 to v8i32 and
16261   //              v4i32 to v4i64
16262   //
16263   // Divide input vector into two parts
16264   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16265   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16266   // concat the vectors to original VT
16267
16268   unsigned NumElems = InVT.getVectorNumElements();
16269   SDValue Undef = DAG.getUNDEF(InVT);
16270
16271   SmallVector<int,8> ShufMask1(NumElems, -1);
16272   for (unsigned i = 0; i != NumElems/2; ++i)
16273     ShufMask1[i] = i;
16274
16275   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16276
16277   SmallVector<int,8> ShufMask2(NumElems, -1);
16278   for (unsigned i = 0; i != NumElems/2; ++i)
16279     ShufMask2[i] = i + NumElems/2;
16280
16281   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16282
16283   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16284                                 VT.getVectorNumElements()/2);
16285
16286   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16287   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16288
16289   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16290 }
16291
16292 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16293 // may emit an illegal shuffle but the expansion is still better than scalar
16294 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16295 // we'll emit a shuffle and a arithmetic shift.
16296 // TODO: It is possible to support ZExt by zeroing the undef values during
16297 // the shuffle phase or after the shuffle.
16298 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16299                                  SelectionDAG &DAG) {
16300   MVT RegVT = Op.getSimpleValueType();
16301   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16302   assert(RegVT.isInteger() &&
16303          "We only custom lower integer vector sext loads.");
16304
16305   // Nothing useful we can do without SSE2 shuffles.
16306   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16307
16308   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16309   SDLoc dl(Ld);
16310   EVT MemVT = Ld->getMemoryVT();
16311   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16312   unsigned RegSz = RegVT.getSizeInBits();
16313
16314   ISD::LoadExtType Ext = Ld->getExtensionType();
16315
16316   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16317          && "Only anyext and sext are currently implemented.");
16318   assert(MemVT != RegVT && "Cannot extend to the same type");
16319   assert(MemVT.isVector() && "Must load a vector from memory");
16320
16321   unsigned NumElems = RegVT.getVectorNumElements();
16322   unsigned MemSz = MemVT.getSizeInBits();
16323   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16324
16325   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16326     // The only way in which we have a legal 256-bit vector result but not the
16327     // integer 256-bit operations needed to directly lower a sextload is if we
16328     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16329     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16330     // correctly legalized. We do this late to allow the canonical form of
16331     // sextload to persist throughout the rest of the DAG combiner -- it wants
16332     // to fold together any extensions it can, and so will fuse a sign_extend
16333     // of an sextload into a sextload targeting a wider value.
16334     SDValue Load;
16335     if (MemSz == 128) {
16336       // Just switch this to a normal load.
16337       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16338                                        "it must be a legal 128-bit vector "
16339                                        "type!");
16340       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16341                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16342                   Ld->isInvariant(), Ld->getAlignment());
16343     } else {
16344       assert(MemSz < 128 &&
16345              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16346       // Do an sext load to a 128-bit vector type. We want to use the same
16347       // number of elements, but elements half as wide. This will end up being
16348       // recursively lowered by this routine, but will succeed as we definitely
16349       // have all the necessary features if we're using AVX1.
16350       EVT HalfEltVT =
16351           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16352       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16353       Load =
16354           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16355                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16356                          Ld->isNonTemporal(), Ld->isInvariant(),
16357                          Ld->getAlignment());
16358     }
16359
16360     // Replace chain users with the new chain.
16361     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16362     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16363
16364     // Finally, do a normal sign-extend to the desired register.
16365     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16366   }
16367
16368   // All sizes must be a power of two.
16369   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16370          "Non-power-of-two elements are not custom lowered!");
16371
16372   // Attempt to load the original value using scalar loads.
16373   // Find the largest scalar type that divides the total loaded size.
16374   MVT SclrLoadTy = MVT::i8;
16375   for (MVT Tp : MVT::integer_valuetypes()) {
16376     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16377       SclrLoadTy = Tp;
16378     }
16379   }
16380
16381   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16382   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16383       (64 <= MemSz))
16384     SclrLoadTy = MVT::f64;
16385
16386   // Calculate the number of scalar loads that we need to perform
16387   // in order to load our vector from memory.
16388   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16389
16390   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16391          "Can only lower sext loads with a single scalar load!");
16392
16393   unsigned loadRegZize = RegSz;
16394   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16395     loadRegZize /= 2;
16396
16397   // Represent our vector as a sequence of elements which are the
16398   // largest scalar that we can load.
16399   EVT LoadUnitVecVT = EVT::getVectorVT(
16400       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16401
16402   // Represent the data using the same element type that is stored in
16403   // memory. In practice, we ''widen'' MemVT.
16404   EVT WideVecVT =
16405       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16406                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16407
16408   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16409          "Invalid vector type");
16410
16411   // We can't shuffle using an illegal type.
16412   assert(TLI.isTypeLegal(WideVecVT) &&
16413          "We only lower types that form legal widened vector types");
16414
16415   SmallVector<SDValue, 8> Chains;
16416   SDValue Ptr = Ld->getBasePtr();
16417   SDValue Increment =
16418       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16419   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16420
16421   for (unsigned i = 0; i < NumLoads; ++i) {
16422     // Perform a single load.
16423     SDValue ScalarLoad =
16424         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16425                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16426                     Ld->getAlignment());
16427     Chains.push_back(ScalarLoad.getValue(1));
16428     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16429     // another round of DAGCombining.
16430     if (i == 0)
16431       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16432     else
16433       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16434                         ScalarLoad, DAG.getIntPtrConstant(i));
16435
16436     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16437   }
16438
16439   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16440
16441   // Bitcast the loaded value to a vector of the original element type, in
16442   // the size of the target vector type.
16443   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16444   unsigned SizeRatio = RegSz / MemSz;
16445
16446   if (Ext == ISD::SEXTLOAD) {
16447     // If we have SSE4.1, we can directly emit a VSEXT node.
16448     if (Subtarget->hasSSE41()) {
16449       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16450       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16451       return Sext;
16452     }
16453
16454     // Otherwise we'll shuffle the small elements in the high bits of the
16455     // larger type and perform an arithmetic shift. If the shift is not legal
16456     // it's better to scalarize.
16457     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16458            "We can't implement a sext load without an arithmetic right shift!");
16459
16460     // Redistribute the loaded elements into the different locations.
16461     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16462     for (unsigned i = 0; i != NumElems; ++i)
16463       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16464
16465     SDValue Shuff = DAG.getVectorShuffle(
16466         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16467
16468     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16469
16470     // Build the arithmetic shift.
16471     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16472                    MemVT.getVectorElementType().getSizeInBits();
16473     Shuff =
16474         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16475
16476     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16477     return Shuff;
16478   }
16479
16480   // Redistribute the loaded elements into the different locations.
16481   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16482   for (unsigned i = 0; i != NumElems; ++i)
16483     ShuffleVec[i * SizeRatio] = i;
16484
16485   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16486                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16487
16488   // Bitcast to the requested type.
16489   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16490   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16491   return Shuff;
16492 }
16493
16494 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
16495 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16496 // from the AND / OR.
16497 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16498   Opc = Op.getOpcode();
16499   if (Opc != ISD::OR && Opc != ISD::AND)
16500     return false;
16501   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16502           Op.getOperand(0).hasOneUse() &&
16503           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16504           Op.getOperand(1).hasOneUse());
16505 }
16506
16507 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16508 // 1 and that the SETCC node has a single use.
16509 static bool isXor1OfSetCC(SDValue Op) {
16510   if (Op.getOpcode() != ISD::XOR)
16511     return false;
16512   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16513   if (N1C && N1C->getAPIntValue() == 1) {
16514     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16515       Op.getOperand(0).hasOneUse();
16516   }
16517   return false;
16518 }
16519
16520 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16521   bool addTest = true;
16522   SDValue Chain = Op.getOperand(0);
16523   SDValue Cond  = Op.getOperand(1);
16524   SDValue Dest  = Op.getOperand(2);
16525   SDLoc dl(Op);
16526   SDValue CC;
16527   bool Inverted = false;
16528
16529   if (Cond.getOpcode() == ISD::SETCC) {
16530     // Check for setcc([su]{add,sub,mul}o == 0).
16531     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16532         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16533         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16534         Cond.getOperand(0).getResNo() == 1 &&
16535         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16536          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16537          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16538          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16539          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16540          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16541       Inverted = true;
16542       Cond = Cond.getOperand(0);
16543     } else {
16544       SDValue NewCond = LowerSETCC(Cond, DAG);
16545       if (NewCond.getNode())
16546         Cond = NewCond;
16547     }
16548   }
16549 #if 0
16550   // FIXME: LowerXALUO doesn't handle these!!
16551   else if (Cond.getOpcode() == X86ISD::ADD  ||
16552            Cond.getOpcode() == X86ISD::SUB  ||
16553            Cond.getOpcode() == X86ISD::SMUL ||
16554            Cond.getOpcode() == X86ISD::UMUL)
16555     Cond = LowerXALUO(Cond, DAG);
16556 #endif
16557
16558   // Look pass (and (setcc_carry (cmp ...)), 1).
16559   if (Cond.getOpcode() == ISD::AND &&
16560       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16561     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16562     if (C && C->getAPIntValue() == 1)
16563       Cond = Cond.getOperand(0);
16564   }
16565
16566   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16567   // setting operand in place of the X86ISD::SETCC.
16568   unsigned CondOpcode = Cond.getOpcode();
16569   if (CondOpcode == X86ISD::SETCC ||
16570       CondOpcode == X86ISD::SETCC_CARRY) {
16571     CC = Cond.getOperand(0);
16572
16573     SDValue Cmp = Cond.getOperand(1);
16574     unsigned Opc = Cmp.getOpcode();
16575     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16576     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16577       Cond = Cmp;
16578       addTest = false;
16579     } else {
16580       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16581       default: break;
16582       case X86::COND_O:
16583       case X86::COND_B:
16584         // These can only come from an arithmetic instruction with overflow,
16585         // e.g. SADDO, UADDO.
16586         Cond = Cond.getNode()->getOperand(1);
16587         addTest = false;
16588         break;
16589       }
16590     }
16591   }
16592   CondOpcode = Cond.getOpcode();
16593   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16594       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16595       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16596        Cond.getOperand(0).getValueType() != MVT::i8)) {
16597     SDValue LHS = Cond.getOperand(0);
16598     SDValue RHS = Cond.getOperand(1);
16599     unsigned X86Opcode;
16600     unsigned X86Cond;
16601     SDVTList VTs;
16602     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16603     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16604     // X86ISD::INC).
16605     switch (CondOpcode) {
16606     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16607     case ISD::SADDO:
16608       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16609         if (C->isOne()) {
16610           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16611           break;
16612         }
16613       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16614     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16615     case ISD::SSUBO:
16616       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16617         if (C->isOne()) {
16618           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16619           break;
16620         }
16621       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16622     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16623     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16624     default: llvm_unreachable("unexpected overflowing operator");
16625     }
16626     if (Inverted)
16627       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16628     if (CondOpcode == ISD::UMULO)
16629       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16630                           MVT::i32);
16631     else
16632       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16633
16634     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16635
16636     if (CondOpcode == ISD::UMULO)
16637       Cond = X86Op.getValue(2);
16638     else
16639       Cond = X86Op.getValue(1);
16640
16641     CC = DAG.getConstant(X86Cond, MVT::i8);
16642     addTest = false;
16643   } else {
16644     unsigned CondOpc;
16645     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16646       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16647       if (CondOpc == ISD::OR) {
16648         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16649         // two branches instead of an explicit OR instruction with a
16650         // separate test.
16651         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16652             isX86LogicalCmp(Cmp)) {
16653           CC = Cond.getOperand(0).getOperand(0);
16654           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16655                               Chain, Dest, CC, Cmp);
16656           CC = Cond.getOperand(1).getOperand(0);
16657           Cond = Cmp;
16658           addTest = false;
16659         }
16660       } else { // ISD::AND
16661         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16662         // two branches instead of an explicit AND instruction with a
16663         // separate test. However, we only do this if this block doesn't
16664         // have a fall-through edge, because this requires an explicit
16665         // jmp when the condition is false.
16666         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16667             isX86LogicalCmp(Cmp) &&
16668             Op.getNode()->hasOneUse()) {
16669           X86::CondCode CCode =
16670             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16671           CCode = X86::GetOppositeBranchCondition(CCode);
16672           CC = DAG.getConstant(CCode, MVT::i8);
16673           SDNode *User = *Op.getNode()->use_begin();
16674           // Look for an unconditional branch following this conditional branch.
16675           // We need this because we need to reverse the successors in order
16676           // to implement FCMP_OEQ.
16677           if (User->getOpcode() == ISD::BR) {
16678             SDValue FalseBB = User->getOperand(1);
16679             SDNode *NewBR =
16680               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16681             assert(NewBR == User);
16682             (void)NewBR;
16683             Dest = FalseBB;
16684
16685             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16686                                 Chain, Dest, CC, Cmp);
16687             X86::CondCode CCode =
16688               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16689             CCode = X86::GetOppositeBranchCondition(CCode);
16690             CC = DAG.getConstant(CCode, MVT::i8);
16691             Cond = Cmp;
16692             addTest = false;
16693           }
16694         }
16695       }
16696     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16697       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16698       // It should be transformed during dag combiner except when the condition
16699       // is set by a arithmetics with overflow node.
16700       X86::CondCode CCode =
16701         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16702       CCode = X86::GetOppositeBranchCondition(CCode);
16703       CC = DAG.getConstant(CCode, MVT::i8);
16704       Cond = Cond.getOperand(0).getOperand(1);
16705       addTest = false;
16706     } else if (Cond.getOpcode() == ISD::SETCC &&
16707                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16708       // For FCMP_OEQ, we can emit
16709       // two branches instead of an explicit AND instruction with a
16710       // separate test. However, we only do this if this block doesn't
16711       // have a fall-through edge, because this requires an explicit
16712       // jmp when the condition is false.
16713       if (Op.getNode()->hasOneUse()) {
16714         SDNode *User = *Op.getNode()->use_begin();
16715         // Look for an unconditional branch following this conditional branch.
16716         // We need this because we need to reverse the successors in order
16717         // to implement FCMP_OEQ.
16718         if (User->getOpcode() == ISD::BR) {
16719           SDValue FalseBB = User->getOperand(1);
16720           SDNode *NewBR =
16721             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16722           assert(NewBR == User);
16723           (void)NewBR;
16724           Dest = FalseBB;
16725
16726           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16727                                     Cond.getOperand(0), Cond.getOperand(1));
16728           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16729           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16730           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16731                               Chain, Dest, CC, Cmp);
16732           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16733           Cond = Cmp;
16734           addTest = false;
16735         }
16736       }
16737     } else if (Cond.getOpcode() == ISD::SETCC &&
16738                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16739       // For FCMP_UNE, we can emit
16740       // two branches instead of an explicit AND instruction with a
16741       // separate test. However, we only do this if this block doesn't
16742       // have a fall-through edge, because this requires an explicit
16743       // jmp when the condition is false.
16744       if (Op.getNode()->hasOneUse()) {
16745         SDNode *User = *Op.getNode()->use_begin();
16746         // Look for an unconditional branch following this conditional branch.
16747         // We need this because we need to reverse the successors in order
16748         // to implement FCMP_UNE.
16749         if (User->getOpcode() == ISD::BR) {
16750           SDValue FalseBB = User->getOperand(1);
16751           SDNode *NewBR =
16752             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16753           assert(NewBR == User);
16754           (void)NewBR;
16755
16756           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16757                                     Cond.getOperand(0), Cond.getOperand(1));
16758           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16759           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16760           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16761                               Chain, Dest, CC, Cmp);
16762           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16763           Cond = Cmp;
16764           addTest = false;
16765           Dest = FalseBB;
16766         }
16767       }
16768     }
16769   }
16770
16771   if (addTest) {
16772     // Look pass the truncate if the high bits are known zero.
16773     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16774         Cond = Cond.getOperand(0);
16775
16776     // We know the result of AND is compared against zero. Try to match
16777     // it to BT.
16778     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16779       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16780       if (NewSetCC.getNode()) {
16781         CC = NewSetCC.getOperand(0);
16782         Cond = NewSetCC.getOperand(1);
16783         addTest = false;
16784       }
16785     }
16786   }
16787
16788   if (addTest) {
16789     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16790     CC = DAG.getConstant(X86Cond, MVT::i8);
16791     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16792   }
16793   Cond = ConvertCmpIfNecessary(Cond, DAG);
16794   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16795                      Chain, Dest, CC, Cond);
16796 }
16797
16798 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16799 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16800 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16801 // that the guard pages used by the OS virtual memory manager are allocated in
16802 // correct sequence.
16803 SDValue
16804 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16805                                            SelectionDAG &DAG) const {
16806   MachineFunction &MF = DAG.getMachineFunction();
16807   bool SplitStack = MF.shouldSplitStack();
16808   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16809                SplitStack;
16810   SDLoc dl(Op);
16811
16812   if (!Lower) {
16813     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16814     SDNode* Node = Op.getNode();
16815
16816     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16817     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16818         " not tell us which reg is the stack pointer!");
16819     EVT VT = Node->getValueType(0);
16820     SDValue Tmp1 = SDValue(Node, 0);
16821     SDValue Tmp2 = SDValue(Node, 1);
16822     SDValue Tmp3 = Node->getOperand(2);
16823     SDValue Chain = Tmp1.getOperand(0);
16824
16825     // Chain the dynamic stack allocation so that it doesn't modify the stack
16826     // pointer when other instructions are using the stack.
16827     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16828         SDLoc(Node));
16829
16830     SDValue Size = Tmp2.getOperand(1);
16831     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16832     Chain = SP.getValue(1);
16833     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16834     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16835     unsigned StackAlign = TFI.getStackAlignment();
16836     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16837     if (Align > StackAlign)
16838       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16839           DAG.getConstant(-(uint64_t)Align, VT));
16840     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16841
16842     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16843         DAG.getIntPtrConstant(0, true), SDValue(),
16844         SDLoc(Node));
16845
16846     SDValue Ops[2] = { Tmp1, Tmp2 };
16847     return DAG.getMergeValues(Ops, dl);
16848   }
16849
16850   // Get the inputs.
16851   SDValue Chain = Op.getOperand(0);
16852   SDValue Size  = Op.getOperand(1);
16853   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16854   EVT VT = Op.getNode()->getValueType(0);
16855
16856   bool Is64Bit = Subtarget->is64Bit();
16857   EVT SPTy = getPointerTy();
16858
16859   if (SplitStack) {
16860     MachineRegisterInfo &MRI = MF.getRegInfo();
16861
16862     if (Is64Bit) {
16863       // The 64 bit implementation of segmented stacks needs to clobber both r10
16864       // r11. This makes it impossible to use it along with nested parameters.
16865       const Function *F = MF.getFunction();
16866
16867       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16868            I != E; ++I)
16869         if (I->hasNestAttr())
16870           report_fatal_error("Cannot use segmented stacks with functions that "
16871                              "have nested arguments.");
16872     }
16873
16874     const TargetRegisterClass *AddrRegClass =
16875       getRegClassFor(getPointerTy());
16876     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16877     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16878     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16879                                 DAG.getRegister(Vreg, SPTy));
16880     SDValue Ops1[2] = { Value, Chain };
16881     return DAG.getMergeValues(Ops1, dl);
16882   } else {
16883     SDValue Flag;
16884     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16885
16886     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16887     Flag = Chain.getValue(1);
16888     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16889
16890     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16891
16892     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16893     unsigned SPReg = RegInfo->getStackRegister();
16894     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16895     Chain = SP.getValue(1);
16896
16897     if (Align) {
16898       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16899                        DAG.getConstant(-(uint64_t)Align, VT));
16900       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16901     }
16902
16903     SDValue Ops1[2] = { SP, Chain };
16904     return DAG.getMergeValues(Ops1, dl);
16905   }
16906 }
16907
16908 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16909   MachineFunction &MF = DAG.getMachineFunction();
16910   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16911
16912   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16913   SDLoc DL(Op);
16914
16915   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16916     // vastart just stores the address of the VarArgsFrameIndex slot into the
16917     // memory location argument.
16918     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16919                                    getPointerTy());
16920     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16921                         MachinePointerInfo(SV), false, false, 0);
16922   }
16923
16924   // __va_list_tag:
16925   //   gp_offset         (0 - 6 * 8)
16926   //   fp_offset         (48 - 48 + 8 * 16)
16927   //   overflow_arg_area (point to parameters coming in memory).
16928   //   reg_save_area
16929   SmallVector<SDValue, 8> MemOps;
16930   SDValue FIN = Op.getOperand(1);
16931   // Store gp_offset
16932   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16933                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16934                                                MVT::i32),
16935                                FIN, MachinePointerInfo(SV), false, false, 0);
16936   MemOps.push_back(Store);
16937
16938   // Store fp_offset
16939   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16940                     FIN, DAG.getIntPtrConstant(4));
16941   Store = DAG.getStore(Op.getOperand(0), DL,
16942                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16943                                        MVT::i32),
16944                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16945   MemOps.push_back(Store);
16946
16947   // Store ptr to overflow_arg_area
16948   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16949                     FIN, DAG.getIntPtrConstant(4));
16950   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16951                                     getPointerTy());
16952   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16953                        MachinePointerInfo(SV, 8),
16954                        false, false, 0);
16955   MemOps.push_back(Store);
16956
16957   // Store ptr to reg_save_area.
16958   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16959                     FIN, DAG.getIntPtrConstant(8));
16960   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16961                                     getPointerTy());
16962   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16963                        MachinePointerInfo(SV, 16), false, false, 0);
16964   MemOps.push_back(Store);
16965   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16966 }
16967
16968 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16969   assert(Subtarget->is64Bit() &&
16970          "LowerVAARG only handles 64-bit va_arg!");
16971   assert((Subtarget->isTargetLinux() ||
16972           Subtarget->isTargetDarwin()) &&
16973           "Unhandled target in LowerVAARG");
16974   assert(Op.getNode()->getNumOperands() == 4);
16975   SDValue Chain = Op.getOperand(0);
16976   SDValue SrcPtr = Op.getOperand(1);
16977   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16978   unsigned Align = Op.getConstantOperandVal(3);
16979   SDLoc dl(Op);
16980
16981   EVT ArgVT = Op.getNode()->getValueType(0);
16982   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16983   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
16984   uint8_t ArgMode;
16985
16986   // Decide which area this value should be read from.
16987   // TODO: Implement the AMD64 ABI in its entirety. This simple
16988   // selection mechanism works only for the basic types.
16989   if (ArgVT == MVT::f80) {
16990     llvm_unreachable("va_arg for f80 not yet implemented");
16991   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16992     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16993   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16994     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16995   } else {
16996     llvm_unreachable("Unhandled argument type in LowerVAARG");
16997   }
16998
16999   if (ArgMode == 2) {
17000     // Sanity Check: Make sure using fp_offset makes sense.
17001     assert(!DAG.getTarget().Options.UseSoftFloat &&
17002            !(DAG.getMachineFunction()
17003                 .getFunction()->getAttributes()
17004                 .hasAttribute(AttributeSet::FunctionIndex,
17005                               Attribute::NoImplicitFloat)) &&
17006            Subtarget->hasSSE1());
17007   }
17008
17009   // Insert VAARG_64 node into the DAG
17010   // VAARG_64 returns two values: Variable Argument Address, Chain
17011   SmallVector<SDValue, 11> InstOps;
17012   InstOps.push_back(Chain);
17013   InstOps.push_back(SrcPtr);
17014   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17015   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17016   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17017   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
17018   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17019                                           VTs, InstOps, MVT::i64,
17020                                           MachinePointerInfo(SV),
17021                                           /*Align=*/0,
17022                                           /*Volatile=*/false,
17023                                           /*ReadMem=*/true,
17024                                           /*WriteMem=*/true);
17025   Chain = VAARG.getValue(1);
17026
17027   // Load the next argument and return it
17028   return DAG.getLoad(ArgVT, dl,
17029                      Chain,
17030                      VAARG,
17031                      MachinePointerInfo(),
17032                      false, false, false, 0);
17033 }
17034
17035 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17036                            SelectionDAG &DAG) {
17037   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17038   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17039   SDValue Chain = Op.getOperand(0);
17040   SDValue DstPtr = Op.getOperand(1);
17041   SDValue SrcPtr = Op.getOperand(2);
17042   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17043   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17044   SDLoc DL(Op);
17045
17046   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17047                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17048                        false,
17049                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17050 }
17051
17052 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17053 // amount is a constant. Takes immediate version of shift as input.
17054 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17055                                           SDValue SrcOp, uint64_t ShiftAmt,
17056                                           SelectionDAG &DAG) {
17057   MVT ElementType = VT.getVectorElementType();
17058
17059   // Fold this packed shift into its first operand if ShiftAmt is 0.
17060   if (ShiftAmt == 0)
17061     return SrcOp;
17062
17063   // Check for ShiftAmt >= element width
17064   if (ShiftAmt >= ElementType.getSizeInBits()) {
17065     if (Opc == X86ISD::VSRAI)
17066       ShiftAmt = ElementType.getSizeInBits() - 1;
17067     else
17068       return DAG.getConstant(0, VT);
17069   }
17070
17071   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17072          && "Unknown target vector shift-by-constant node");
17073
17074   // Fold this packed vector shift into a build vector if SrcOp is a
17075   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17076   if (VT == SrcOp.getSimpleValueType() &&
17077       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17078     SmallVector<SDValue, 8> Elts;
17079     unsigned NumElts = SrcOp->getNumOperands();
17080     ConstantSDNode *ND;
17081
17082     switch(Opc) {
17083     default: llvm_unreachable(nullptr);
17084     case X86ISD::VSHLI:
17085       for (unsigned i=0; i!=NumElts; ++i) {
17086         SDValue CurrentOp = SrcOp->getOperand(i);
17087         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17088           Elts.push_back(CurrentOp);
17089           continue;
17090         }
17091         ND = cast<ConstantSDNode>(CurrentOp);
17092         const APInt &C = ND->getAPIntValue();
17093         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17094       }
17095       break;
17096     case X86ISD::VSRLI:
17097       for (unsigned i=0; i!=NumElts; ++i) {
17098         SDValue CurrentOp = SrcOp->getOperand(i);
17099         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17100           Elts.push_back(CurrentOp);
17101           continue;
17102         }
17103         ND = cast<ConstantSDNode>(CurrentOp);
17104         const APInt &C = ND->getAPIntValue();
17105         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17106       }
17107       break;
17108     case X86ISD::VSRAI:
17109       for (unsigned i=0; i!=NumElts; ++i) {
17110         SDValue CurrentOp = SrcOp->getOperand(i);
17111         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17112           Elts.push_back(CurrentOp);
17113           continue;
17114         }
17115         ND = cast<ConstantSDNode>(CurrentOp);
17116         const APInt &C = ND->getAPIntValue();
17117         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17118       }
17119       break;
17120     }
17121
17122     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17123   }
17124
17125   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17126 }
17127
17128 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17129 // may or may not be a constant. Takes immediate version of shift as input.
17130 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17131                                    SDValue SrcOp, SDValue ShAmt,
17132                                    SelectionDAG &DAG) {
17133   MVT SVT = ShAmt.getSimpleValueType();
17134   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17135
17136   // Catch shift-by-constant.
17137   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17138     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17139                                       CShAmt->getZExtValue(), DAG);
17140
17141   // Change opcode to non-immediate version
17142   switch (Opc) {
17143     default: llvm_unreachable("Unknown target vector shift node");
17144     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17145     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17146     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17147   }
17148
17149   const X86Subtarget &Subtarget =
17150       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17151   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17152       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17153     // Let the shuffle legalizer expand this shift amount node.
17154     SDValue Op0 = ShAmt.getOperand(0);
17155     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17156     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17157   } else {
17158     // Need to build a vector containing shift amount.
17159     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
17160     SmallVector<SDValue, 4> ShOps;
17161     ShOps.push_back(ShAmt);
17162     if (SVT == MVT::i32) {
17163       ShOps.push_back(DAG.getConstant(0, SVT));
17164       ShOps.push_back(DAG.getUNDEF(SVT));
17165     }
17166     ShOps.push_back(DAG.getUNDEF(SVT));
17167
17168     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17169     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17170   }
17171
17172   // The return type has to be a 128-bit type with the same element
17173   // type as the input type.
17174   MVT EltVT = VT.getVectorElementType();
17175   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17176
17177   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17178   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17179 }
17180
17181 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17182 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17183 /// necessary casting for \p Mask when lowering masking intrinsics.
17184 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17185                                     SDValue PreservedSrc,
17186                                     const X86Subtarget *Subtarget,
17187                                     SelectionDAG &DAG) {
17188     EVT VT = Op.getValueType();
17189     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17190                                   MVT::i1, VT.getVectorNumElements());
17191     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17192                                      Mask.getValueType().getSizeInBits());
17193     SDLoc dl(Op);
17194
17195     assert(MaskVT.isSimple() && "invalid mask type");
17196
17197     if (isAllOnes(Mask))
17198       return Op;
17199
17200     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17201     // are extracted by EXTRACT_SUBVECTOR.
17202     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17203                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17204                               DAG.getIntPtrConstant(0));
17205
17206     switch (Op.getOpcode()) {
17207       default: break;
17208       case X86ISD::PCMPEQM:
17209       case X86ISD::PCMPGTM:
17210       case X86ISD::CMPM:
17211       case X86ISD::CMPMU:
17212         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17213     }
17214     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17215       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17216     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17217 }
17218
17219 /// \brief Creates an SDNode for a predicated scalar operation.
17220 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17221 /// The mask is comming as MVT::i8 and it should be truncated
17222 /// to MVT::i1 while lowering masking intrinsics.
17223 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17224 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17225 /// a scalar instruction.
17226 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17227                                     SDValue PreservedSrc,
17228                                     const X86Subtarget *Subtarget,
17229                                     SelectionDAG &DAG) {
17230     if (isAllOnes(Mask))
17231       return Op;
17232
17233     EVT VT = Op.getValueType();
17234     SDLoc dl(Op);
17235     // The mask should be of type MVT::i1
17236     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17237
17238     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17239       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17240     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17241 }
17242
17243 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17244                                        SelectionDAG &DAG) {
17245   SDLoc dl(Op);
17246   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17247   EVT VT = Op.getValueType();
17248   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17249   if (IntrData) {
17250     switch(IntrData->Type) {
17251     case INTR_TYPE_1OP:
17252       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17253     case INTR_TYPE_2OP:
17254       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17255         Op.getOperand(2));
17256     case INTR_TYPE_3OP:
17257       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17258         Op.getOperand(2), Op.getOperand(3));
17259     case INTR_TYPE_1OP_MASK_RM: {
17260       SDValue Src = Op.getOperand(1);
17261       SDValue Src0 = Op.getOperand(2);
17262       SDValue Mask = Op.getOperand(3);
17263       SDValue RoundingMode = Op.getOperand(4);
17264       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17265                                               RoundingMode),
17266                                   Mask, Src0, Subtarget, DAG);
17267     }
17268     case INTR_TYPE_SCALAR_MASK_RM: {
17269       SDValue Src1 = Op.getOperand(1);
17270       SDValue Src2 = Op.getOperand(2);
17271       SDValue Src0 = Op.getOperand(3);
17272       SDValue Mask = Op.getOperand(4);
17273       SDValue RoundingMode = Op.getOperand(5);
17274       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17275                                               RoundingMode),
17276                                   Mask, Src0, Subtarget, DAG);
17277     }
17278     case INTR_TYPE_2OP_MASK: {
17279       SDValue Mask = Op.getOperand(4);
17280       SDValue PassThru = Op.getOperand(3);
17281       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17282       if (IntrWithRoundingModeOpcode != 0) {
17283         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17284         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17285           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17286                                       dl, Op.getValueType(),
17287                                       Op.getOperand(1), Op.getOperand(2),
17288                                       Op.getOperand(3), Op.getOperand(5)),
17289                                       Mask, PassThru, Subtarget, DAG);
17290         }
17291       }
17292       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17293                                               Op.getOperand(1),
17294                                               Op.getOperand(2)),
17295                                   Mask, PassThru, Subtarget, DAG);
17296     }
17297     case FMA_OP_MASK: {
17298       SDValue Src1 = Op.getOperand(1);
17299       SDValue Src2 = Op.getOperand(2);
17300       SDValue Src3 = Op.getOperand(3);
17301       SDValue Mask = Op.getOperand(4);
17302       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17303       if (IntrWithRoundingModeOpcode != 0) {
17304         SDValue Rnd = Op.getOperand(5);
17305         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17306             X86::STATIC_ROUNDING::CUR_DIRECTION)
17307           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17308                                                   dl, Op.getValueType(),
17309                                                   Src1, Src2, Src3, Rnd),
17310                                       Mask, Src1, Subtarget, DAG);
17311       }
17312       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17313                                               dl, Op.getValueType(),
17314                                               Src1, Src2, Src3),
17315                                   Mask, Src1, Subtarget, DAG);
17316     }
17317     case CMP_MASK:
17318     case CMP_MASK_CC: {
17319       // Comparison intrinsics with masks.
17320       // Example of transformation:
17321       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17322       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17323       // (i8 (bitcast
17324       //   (v8i1 (insert_subvector undef,
17325       //           (v2i1 (and (PCMPEQM %a, %b),
17326       //                      (extract_subvector
17327       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17328       EVT VT = Op.getOperand(1).getValueType();
17329       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17330                                     VT.getVectorNumElements());
17331       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17332       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17333                                        Mask.getValueType().getSizeInBits());
17334       SDValue Cmp;
17335       if (IntrData->Type == CMP_MASK_CC) {
17336         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17337                     Op.getOperand(2), Op.getOperand(3));
17338       } else {
17339         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17340         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17341                     Op.getOperand(2));
17342       }
17343       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17344                                              DAG.getTargetConstant(0, MaskVT),
17345                                              Subtarget, DAG);
17346       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17347                                 DAG.getUNDEF(BitcastVT), CmpMask,
17348                                 DAG.getIntPtrConstant(0));
17349       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17350     }
17351     case COMI: { // Comparison intrinsics
17352       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17353       SDValue LHS = Op.getOperand(1);
17354       SDValue RHS = Op.getOperand(2);
17355       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17356       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17357       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17358       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17359                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17360       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17361     }
17362     case VSHIFT:
17363       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17364                                  Op.getOperand(1), Op.getOperand(2), DAG);
17365     case VSHIFT_MASK:
17366       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17367                                                       Op.getSimpleValueType(),
17368                                                       Op.getOperand(1),
17369                                                       Op.getOperand(2), DAG),
17370                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17371                                   DAG);
17372     case COMPRESS_EXPAND_IN_REG: {
17373       SDValue Mask = Op.getOperand(3);
17374       SDValue DataToCompress = Op.getOperand(1);
17375       SDValue PassThru = Op.getOperand(2);
17376       if (isAllOnes(Mask)) // return data as is
17377         return Op.getOperand(1);
17378       EVT VT = Op.getValueType();
17379       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17380                                     VT.getVectorNumElements());
17381       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17382                                        Mask.getValueType().getSizeInBits());
17383       SDLoc dl(Op);
17384       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17385                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17386                                   DAG.getIntPtrConstant(0));
17387
17388       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17389                          PassThru);
17390     }
17391     case BLEND: {
17392       SDValue Mask = Op.getOperand(3);
17393       EVT VT = Op.getValueType();
17394       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17395                                     VT.getVectorNumElements());
17396       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17397                                        Mask.getValueType().getSizeInBits());
17398       SDLoc dl(Op);
17399       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17400                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17401                                   DAG.getIntPtrConstant(0));
17402       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17403                          Op.getOperand(2));
17404     }
17405     default:
17406       break;
17407     }
17408   }
17409
17410   switch (IntNo) {
17411   default: return SDValue();    // Don't custom lower most intrinsics.
17412
17413   case Intrinsic::x86_avx512_mask_valign_q_512:
17414   case Intrinsic::x86_avx512_mask_valign_d_512:
17415     // Vector source operands are swapped.
17416     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17417                                             Op.getValueType(), Op.getOperand(2),
17418                                             Op.getOperand(1),
17419                                             Op.getOperand(3)),
17420                                 Op.getOperand(5), Op.getOperand(4),
17421                                 Subtarget, DAG);
17422
17423   // ptest and testp intrinsics. The intrinsic these come from are designed to
17424   // return an integer value, not just an instruction so lower it to the ptest
17425   // or testp pattern and a setcc for the result.
17426   case Intrinsic::x86_sse41_ptestz:
17427   case Intrinsic::x86_sse41_ptestc:
17428   case Intrinsic::x86_sse41_ptestnzc:
17429   case Intrinsic::x86_avx_ptestz_256:
17430   case Intrinsic::x86_avx_ptestc_256:
17431   case Intrinsic::x86_avx_ptestnzc_256:
17432   case Intrinsic::x86_avx_vtestz_ps:
17433   case Intrinsic::x86_avx_vtestc_ps:
17434   case Intrinsic::x86_avx_vtestnzc_ps:
17435   case Intrinsic::x86_avx_vtestz_pd:
17436   case Intrinsic::x86_avx_vtestc_pd:
17437   case Intrinsic::x86_avx_vtestnzc_pd:
17438   case Intrinsic::x86_avx_vtestz_ps_256:
17439   case Intrinsic::x86_avx_vtestc_ps_256:
17440   case Intrinsic::x86_avx_vtestnzc_ps_256:
17441   case Intrinsic::x86_avx_vtestz_pd_256:
17442   case Intrinsic::x86_avx_vtestc_pd_256:
17443   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17444     bool IsTestPacked = false;
17445     unsigned X86CC;
17446     switch (IntNo) {
17447     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17448     case Intrinsic::x86_avx_vtestz_ps:
17449     case Intrinsic::x86_avx_vtestz_pd:
17450     case Intrinsic::x86_avx_vtestz_ps_256:
17451     case Intrinsic::x86_avx_vtestz_pd_256:
17452       IsTestPacked = true; // Fallthrough
17453     case Intrinsic::x86_sse41_ptestz:
17454     case Intrinsic::x86_avx_ptestz_256:
17455       // ZF = 1
17456       X86CC = X86::COND_E;
17457       break;
17458     case Intrinsic::x86_avx_vtestc_ps:
17459     case Intrinsic::x86_avx_vtestc_pd:
17460     case Intrinsic::x86_avx_vtestc_ps_256:
17461     case Intrinsic::x86_avx_vtestc_pd_256:
17462       IsTestPacked = true; // Fallthrough
17463     case Intrinsic::x86_sse41_ptestc:
17464     case Intrinsic::x86_avx_ptestc_256:
17465       // CF = 1
17466       X86CC = X86::COND_B;
17467       break;
17468     case Intrinsic::x86_avx_vtestnzc_ps:
17469     case Intrinsic::x86_avx_vtestnzc_pd:
17470     case Intrinsic::x86_avx_vtestnzc_ps_256:
17471     case Intrinsic::x86_avx_vtestnzc_pd_256:
17472       IsTestPacked = true; // Fallthrough
17473     case Intrinsic::x86_sse41_ptestnzc:
17474     case Intrinsic::x86_avx_ptestnzc_256:
17475       // ZF and CF = 0
17476       X86CC = X86::COND_A;
17477       break;
17478     }
17479
17480     SDValue LHS = Op.getOperand(1);
17481     SDValue RHS = Op.getOperand(2);
17482     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17483     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17484     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17485     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17486     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17487   }
17488   case Intrinsic::x86_avx512_kortestz_w:
17489   case Intrinsic::x86_avx512_kortestc_w: {
17490     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17491     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17492     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17493     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17494     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17495     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17496     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17497   }
17498
17499   case Intrinsic::x86_sse42_pcmpistria128:
17500   case Intrinsic::x86_sse42_pcmpestria128:
17501   case Intrinsic::x86_sse42_pcmpistric128:
17502   case Intrinsic::x86_sse42_pcmpestric128:
17503   case Intrinsic::x86_sse42_pcmpistrio128:
17504   case Intrinsic::x86_sse42_pcmpestrio128:
17505   case Intrinsic::x86_sse42_pcmpistris128:
17506   case Intrinsic::x86_sse42_pcmpestris128:
17507   case Intrinsic::x86_sse42_pcmpistriz128:
17508   case Intrinsic::x86_sse42_pcmpestriz128: {
17509     unsigned Opcode;
17510     unsigned X86CC;
17511     switch (IntNo) {
17512     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17513     case Intrinsic::x86_sse42_pcmpistria128:
17514       Opcode = X86ISD::PCMPISTRI;
17515       X86CC = X86::COND_A;
17516       break;
17517     case Intrinsic::x86_sse42_pcmpestria128:
17518       Opcode = X86ISD::PCMPESTRI;
17519       X86CC = X86::COND_A;
17520       break;
17521     case Intrinsic::x86_sse42_pcmpistric128:
17522       Opcode = X86ISD::PCMPISTRI;
17523       X86CC = X86::COND_B;
17524       break;
17525     case Intrinsic::x86_sse42_pcmpestric128:
17526       Opcode = X86ISD::PCMPESTRI;
17527       X86CC = X86::COND_B;
17528       break;
17529     case Intrinsic::x86_sse42_pcmpistrio128:
17530       Opcode = X86ISD::PCMPISTRI;
17531       X86CC = X86::COND_O;
17532       break;
17533     case Intrinsic::x86_sse42_pcmpestrio128:
17534       Opcode = X86ISD::PCMPESTRI;
17535       X86CC = X86::COND_O;
17536       break;
17537     case Intrinsic::x86_sse42_pcmpistris128:
17538       Opcode = X86ISD::PCMPISTRI;
17539       X86CC = X86::COND_S;
17540       break;
17541     case Intrinsic::x86_sse42_pcmpestris128:
17542       Opcode = X86ISD::PCMPESTRI;
17543       X86CC = X86::COND_S;
17544       break;
17545     case Intrinsic::x86_sse42_pcmpistriz128:
17546       Opcode = X86ISD::PCMPISTRI;
17547       X86CC = X86::COND_E;
17548       break;
17549     case Intrinsic::x86_sse42_pcmpestriz128:
17550       Opcode = X86ISD::PCMPESTRI;
17551       X86CC = X86::COND_E;
17552       break;
17553     }
17554     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17555     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17556     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17557     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17558                                 DAG.getConstant(X86CC, MVT::i8),
17559                                 SDValue(PCMP.getNode(), 1));
17560     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17561   }
17562
17563   case Intrinsic::x86_sse42_pcmpistri128:
17564   case Intrinsic::x86_sse42_pcmpestri128: {
17565     unsigned Opcode;
17566     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17567       Opcode = X86ISD::PCMPISTRI;
17568     else
17569       Opcode = X86ISD::PCMPESTRI;
17570
17571     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17572     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17573     return DAG.getNode(Opcode, dl, VTs, NewOps);
17574   }
17575   }
17576 }
17577
17578 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17579                               SDValue Src, SDValue Mask, SDValue Base,
17580                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17581                               const X86Subtarget * Subtarget) {
17582   SDLoc dl(Op);
17583   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17584   assert(C && "Invalid scale type");
17585   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17586   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17587                              Index.getSimpleValueType().getVectorNumElements());
17588   SDValue MaskInReg;
17589   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17590   if (MaskC)
17591     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17592   else
17593     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17594   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17595   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17596   SDValue Segment = DAG.getRegister(0, MVT::i32);
17597   if (Src.getOpcode() == ISD::UNDEF)
17598     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17599   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17600   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17601   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17602   return DAG.getMergeValues(RetOps, dl);
17603 }
17604
17605 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17606                                SDValue Src, SDValue Mask, SDValue Base,
17607                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17608   SDLoc dl(Op);
17609   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17610   assert(C && "Invalid scale type");
17611   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17612   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17613   SDValue Segment = DAG.getRegister(0, MVT::i32);
17614   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17615                              Index.getSimpleValueType().getVectorNumElements());
17616   SDValue MaskInReg;
17617   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17618   if (MaskC)
17619     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17620   else
17621     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17622   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17623   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17624   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17625   return SDValue(Res, 1);
17626 }
17627
17628 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17629                                SDValue Mask, SDValue Base, SDValue Index,
17630                                SDValue ScaleOp, SDValue Chain) {
17631   SDLoc dl(Op);
17632   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17633   assert(C && "Invalid scale type");
17634   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17635   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17636   SDValue Segment = DAG.getRegister(0, MVT::i32);
17637   EVT MaskVT =
17638     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17639   SDValue MaskInReg;
17640   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17641   if (MaskC)
17642     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17643   else
17644     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17645   //SDVTList VTs = DAG.getVTList(MVT::Other);
17646   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17647   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17648   return SDValue(Res, 0);
17649 }
17650
17651 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17652 // read performance monitor counters (x86_rdpmc).
17653 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17654                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17655                               SmallVectorImpl<SDValue> &Results) {
17656   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17657   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17658   SDValue LO, HI;
17659
17660   // The ECX register is used to select the index of the performance counter
17661   // to read.
17662   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17663                                    N->getOperand(2));
17664   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17665
17666   // Reads the content of a 64-bit performance counter and returns it in the
17667   // registers EDX:EAX.
17668   if (Subtarget->is64Bit()) {
17669     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17670     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17671                             LO.getValue(2));
17672   } else {
17673     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17674     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17675                             LO.getValue(2));
17676   }
17677   Chain = HI.getValue(1);
17678
17679   if (Subtarget->is64Bit()) {
17680     // The EAX register is loaded with the low-order 32 bits. The EDX register
17681     // is loaded with the supported high-order bits of the counter.
17682     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17683                               DAG.getConstant(32, MVT::i8));
17684     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17685     Results.push_back(Chain);
17686     return;
17687   }
17688
17689   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17690   SDValue Ops[] = { LO, HI };
17691   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17692   Results.push_back(Pair);
17693   Results.push_back(Chain);
17694 }
17695
17696 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17697 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17698 // also used to custom lower READCYCLECOUNTER nodes.
17699 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17700                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17701                               SmallVectorImpl<SDValue> &Results) {
17702   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17703   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17704   SDValue LO, HI;
17705
17706   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17707   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17708   // and the EAX register is loaded with the low-order 32 bits.
17709   if (Subtarget->is64Bit()) {
17710     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17711     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17712                             LO.getValue(2));
17713   } else {
17714     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17715     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17716                             LO.getValue(2));
17717   }
17718   SDValue Chain = HI.getValue(1);
17719
17720   if (Opcode == X86ISD::RDTSCP_DAG) {
17721     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17722
17723     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17724     // the ECX register. Add 'ecx' explicitly to the chain.
17725     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17726                                      HI.getValue(2));
17727     // Explicitly store the content of ECX at the location passed in input
17728     // to the 'rdtscp' intrinsic.
17729     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17730                          MachinePointerInfo(), false, false, 0);
17731   }
17732
17733   if (Subtarget->is64Bit()) {
17734     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17735     // the EAX register is loaded with the low-order 32 bits.
17736     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17737                               DAG.getConstant(32, MVT::i8));
17738     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17739     Results.push_back(Chain);
17740     return;
17741   }
17742
17743   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17744   SDValue Ops[] = { LO, HI };
17745   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17746   Results.push_back(Pair);
17747   Results.push_back(Chain);
17748 }
17749
17750 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17751                                      SelectionDAG &DAG) {
17752   SmallVector<SDValue, 2> Results;
17753   SDLoc DL(Op);
17754   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17755                           Results);
17756   return DAG.getMergeValues(Results, DL);
17757 }
17758
17759
17760 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17761                                       SelectionDAG &DAG) {
17762   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17763
17764   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17765   if (!IntrData)
17766     return SDValue();
17767
17768   SDLoc dl(Op);
17769   switch(IntrData->Type) {
17770   default:
17771     llvm_unreachable("Unknown Intrinsic Type");
17772     break;
17773   case RDSEED:
17774   case RDRAND: {
17775     // Emit the node with the right value type.
17776     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17777     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17778
17779     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17780     // Otherwise return the value from Rand, which is always 0, casted to i32.
17781     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17782                       DAG.getConstant(1, Op->getValueType(1)),
17783                       DAG.getConstant(X86::COND_B, MVT::i32),
17784                       SDValue(Result.getNode(), 1) };
17785     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17786                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17787                                   Ops);
17788
17789     // Return { result, isValid, chain }.
17790     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17791                        SDValue(Result.getNode(), 2));
17792   }
17793   case GATHER: {
17794   //gather(v1, mask, index, base, scale);
17795     SDValue Chain = Op.getOperand(0);
17796     SDValue Src   = Op.getOperand(2);
17797     SDValue Base  = Op.getOperand(3);
17798     SDValue Index = Op.getOperand(4);
17799     SDValue Mask  = Op.getOperand(5);
17800     SDValue Scale = Op.getOperand(6);
17801     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17802                           Subtarget);
17803   }
17804   case SCATTER: {
17805   //scatter(base, mask, index, v1, scale);
17806     SDValue Chain = Op.getOperand(0);
17807     SDValue Base  = Op.getOperand(2);
17808     SDValue Mask  = Op.getOperand(3);
17809     SDValue Index = Op.getOperand(4);
17810     SDValue Src   = Op.getOperand(5);
17811     SDValue Scale = Op.getOperand(6);
17812     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17813   }
17814   case PREFETCH: {
17815     SDValue Hint = Op.getOperand(6);
17816     unsigned HintVal;
17817     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17818         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17819       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17820     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17821     SDValue Chain = Op.getOperand(0);
17822     SDValue Mask  = Op.getOperand(2);
17823     SDValue Index = Op.getOperand(3);
17824     SDValue Base  = Op.getOperand(4);
17825     SDValue Scale = Op.getOperand(5);
17826     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17827   }
17828   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17829   case RDTSC: {
17830     SmallVector<SDValue, 2> Results;
17831     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17832     return DAG.getMergeValues(Results, dl);
17833   }
17834   // Read Performance Monitoring Counters.
17835   case RDPMC: {
17836     SmallVector<SDValue, 2> Results;
17837     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17838     return DAG.getMergeValues(Results, dl);
17839   }
17840   // XTEST intrinsics.
17841   case XTEST: {
17842     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17843     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17844     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17845                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17846                                 InTrans);
17847     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17848     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17849                        Ret, SDValue(InTrans.getNode(), 1));
17850   }
17851   // ADC/ADCX/SBB
17852   case ADX: {
17853     SmallVector<SDValue, 2> Results;
17854     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17855     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17856     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17857                                 DAG.getConstant(-1, MVT::i8));
17858     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17859                               Op.getOperand(4), GenCF.getValue(1));
17860     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17861                                  Op.getOperand(5), MachinePointerInfo(),
17862                                  false, false, 0);
17863     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17864                                 DAG.getConstant(X86::COND_B, MVT::i8),
17865                                 Res.getValue(1));
17866     Results.push_back(SetCC);
17867     Results.push_back(Store);
17868     return DAG.getMergeValues(Results, dl);
17869   }
17870   case COMPRESS_TO_MEM: {
17871     SDLoc dl(Op);
17872     SDValue Mask = Op.getOperand(4);
17873     SDValue DataToCompress = Op.getOperand(3);
17874     SDValue Addr = Op.getOperand(2);
17875     SDValue Chain = Op.getOperand(0);
17876
17877     if (isAllOnes(Mask)) // return just a store
17878       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17879                           MachinePointerInfo(), false, false, 0);
17880
17881     EVT VT = DataToCompress.getValueType();
17882     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17883                                   VT.getVectorNumElements());
17884     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17885                                      Mask.getValueType().getSizeInBits());
17886     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17887                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17888                                 DAG.getIntPtrConstant(0));
17889
17890     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17891                                       DataToCompress, DAG.getUNDEF(VT));
17892     return DAG.getStore(Chain, dl, Compressed, Addr,
17893                         MachinePointerInfo(), false, false, 0);
17894   }
17895   case EXPAND_FROM_MEM: {
17896     SDLoc dl(Op);
17897     SDValue Mask = Op.getOperand(4);
17898     SDValue PathThru = Op.getOperand(3);
17899     SDValue Addr = Op.getOperand(2);
17900     SDValue Chain = Op.getOperand(0);
17901     EVT VT = Op.getValueType();
17902
17903     if (isAllOnes(Mask)) // return just a load
17904       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17905                          false, 0);
17906     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17907                                   VT.getVectorNumElements());
17908     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17909                                      Mask.getValueType().getSizeInBits());
17910     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17911                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17912                                 DAG.getIntPtrConstant(0));
17913
17914     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17915                                    false, false, false, 0);
17916
17917     SmallVector<SDValue, 2> Results;
17918     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17919                                   PathThru));
17920     Results.push_back(Chain);
17921     return DAG.getMergeValues(Results, dl);
17922   }
17923   }
17924 }
17925
17926 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17927                                            SelectionDAG &DAG) const {
17928   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17929   MFI->setReturnAddressIsTaken(true);
17930
17931   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17932     return SDValue();
17933
17934   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17935   SDLoc dl(Op);
17936   EVT PtrVT = getPointerTy();
17937
17938   if (Depth > 0) {
17939     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17940     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17941     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17942     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17943                        DAG.getNode(ISD::ADD, dl, PtrVT,
17944                                    FrameAddr, Offset),
17945                        MachinePointerInfo(), false, false, false, 0);
17946   }
17947
17948   // Just load the return address.
17949   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17950   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17951                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17952 }
17953
17954 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17955   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17956   MFI->setFrameAddressIsTaken(true);
17957
17958   EVT VT = Op.getValueType();
17959   SDLoc dl(Op);  // FIXME probably not meaningful
17960   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17961   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17962   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17963       DAG.getMachineFunction());
17964   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17965           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17966          "Invalid Frame Register!");
17967   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17968   while (Depth--)
17969     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17970                             MachinePointerInfo(),
17971                             false, false, false, 0);
17972   return FrameAddr;
17973 }
17974
17975 // FIXME? Maybe this could be a TableGen attribute on some registers and
17976 // this table could be generated automatically from RegInfo.
17977 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17978                                               EVT VT) const {
17979   unsigned Reg = StringSwitch<unsigned>(RegName)
17980                        .Case("esp", X86::ESP)
17981                        .Case("rsp", X86::RSP)
17982                        .Default(0);
17983   if (Reg)
17984     return Reg;
17985   report_fatal_error("Invalid register name global variable");
17986 }
17987
17988 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17989                                                      SelectionDAG &DAG) const {
17990   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17991   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17992 }
17993
17994 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17995   SDValue Chain     = Op.getOperand(0);
17996   SDValue Offset    = Op.getOperand(1);
17997   SDValue Handler   = Op.getOperand(2);
17998   SDLoc dl      (Op);
17999
18000   EVT PtrVT = getPointerTy();
18001   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18002   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18003   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18004           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18005          "Invalid Frame Register!");
18006   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18007   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18008
18009   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18010                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18011   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18012   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18013                        false, false, 0);
18014   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18015
18016   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18017                      DAG.getRegister(StoreAddrReg, PtrVT));
18018 }
18019
18020 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18021                                                SelectionDAG &DAG) const {
18022   SDLoc DL(Op);
18023   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18024                      DAG.getVTList(MVT::i32, MVT::Other),
18025                      Op.getOperand(0), Op.getOperand(1));
18026 }
18027
18028 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18029                                                 SelectionDAG &DAG) const {
18030   SDLoc DL(Op);
18031   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18032                      Op.getOperand(0), Op.getOperand(1));
18033 }
18034
18035 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18036   return Op.getOperand(0);
18037 }
18038
18039 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18040                                                 SelectionDAG &DAG) const {
18041   SDValue Root = Op.getOperand(0);
18042   SDValue Trmp = Op.getOperand(1); // trampoline
18043   SDValue FPtr = Op.getOperand(2); // nested function
18044   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18045   SDLoc dl (Op);
18046
18047   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18048   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18049
18050   if (Subtarget->is64Bit()) {
18051     SDValue OutChains[6];
18052
18053     // Large code-model.
18054     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18055     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18056
18057     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18058     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18059
18060     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18061
18062     // Load the pointer to the nested function into R11.
18063     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18064     SDValue Addr = Trmp;
18065     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18066                                 Addr, MachinePointerInfo(TrmpAddr),
18067                                 false, false, 0);
18068
18069     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18070                        DAG.getConstant(2, MVT::i64));
18071     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18072                                 MachinePointerInfo(TrmpAddr, 2),
18073                                 false, false, 2);
18074
18075     // Load the 'nest' parameter value into R10.
18076     // R10 is specified in X86CallingConv.td
18077     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18078     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18079                        DAG.getConstant(10, MVT::i64));
18080     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18081                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18082                                 false, false, 0);
18083
18084     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18085                        DAG.getConstant(12, MVT::i64));
18086     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18087                                 MachinePointerInfo(TrmpAddr, 12),
18088                                 false, false, 2);
18089
18090     // Jump to the nested function.
18091     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18092     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18093                        DAG.getConstant(20, MVT::i64));
18094     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18095                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18096                                 false, false, 0);
18097
18098     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18099     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18100                        DAG.getConstant(22, MVT::i64));
18101     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18102                                 MachinePointerInfo(TrmpAddr, 22),
18103                                 false, false, 0);
18104
18105     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18106   } else {
18107     const Function *Func =
18108       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18109     CallingConv::ID CC = Func->getCallingConv();
18110     unsigned NestReg;
18111
18112     switch (CC) {
18113     default:
18114       llvm_unreachable("Unsupported calling convention");
18115     case CallingConv::C:
18116     case CallingConv::X86_StdCall: {
18117       // Pass 'nest' parameter in ECX.
18118       // Must be kept in sync with X86CallingConv.td
18119       NestReg = X86::ECX;
18120
18121       // Check that ECX wasn't needed by an 'inreg' parameter.
18122       FunctionType *FTy = Func->getFunctionType();
18123       const AttributeSet &Attrs = Func->getAttributes();
18124
18125       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18126         unsigned InRegCount = 0;
18127         unsigned Idx = 1;
18128
18129         for (FunctionType::param_iterator I = FTy->param_begin(),
18130              E = FTy->param_end(); I != E; ++I, ++Idx)
18131           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18132             // FIXME: should only count parameters that are lowered to integers.
18133             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18134
18135         if (InRegCount > 2) {
18136           report_fatal_error("Nest register in use - reduce number of inreg"
18137                              " parameters!");
18138         }
18139       }
18140       break;
18141     }
18142     case CallingConv::X86_FastCall:
18143     case CallingConv::X86_ThisCall:
18144     case CallingConv::Fast:
18145       // Pass 'nest' parameter in EAX.
18146       // Must be kept in sync with X86CallingConv.td
18147       NestReg = X86::EAX;
18148       break;
18149     }
18150
18151     SDValue OutChains[4];
18152     SDValue Addr, Disp;
18153
18154     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18155                        DAG.getConstant(10, MVT::i32));
18156     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18157
18158     // This is storing the opcode for MOV32ri.
18159     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18160     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18161     OutChains[0] = DAG.getStore(Root, dl,
18162                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18163                                 Trmp, MachinePointerInfo(TrmpAddr),
18164                                 false, false, 0);
18165
18166     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18167                        DAG.getConstant(1, MVT::i32));
18168     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18169                                 MachinePointerInfo(TrmpAddr, 1),
18170                                 false, false, 1);
18171
18172     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18173     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18174                        DAG.getConstant(5, MVT::i32));
18175     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18176                                 MachinePointerInfo(TrmpAddr, 5),
18177                                 false, false, 1);
18178
18179     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18180                        DAG.getConstant(6, MVT::i32));
18181     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18182                                 MachinePointerInfo(TrmpAddr, 6),
18183                                 false, false, 1);
18184
18185     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18186   }
18187 }
18188
18189 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18190                                             SelectionDAG &DAG) const {
18191   /*
18192    The rounding mode is in bits 11:10 of FPSR, and has the following
18193    settings:
18194      00 Round to nearest
18195      01 Round to -inf
18196      10 Round to +inf
18197      11 Round to 0
18198
18199   FLT_ROUNDS, on the other hand, expects the following:
18200     -1 Undefined
18201      0 Round to 0
18202      1 Round to nearest
18203      2 Round to +inf
18204      3 Round to -inf
18205
18206   To perform the conversion, we do:
18207     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18208   */
18209
18210   MachineFunction &MF = DAG.getMachineFunction();
18211   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18212   unsigned StackAlignment = TFI.getStackAlignment();
18213   MVT VT = Op.getSimpleValueType();
18214   SDLoc DL(Op);
18215
18216   // Save FP Control Word to stack slot
18217   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18218   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18219
18220   MachineMemOperand *MMO =
18221    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18222                            MachineMemOperand::MOStore, 2, 2);
18223
18224   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18225   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18226                                           DAG.getVTList(MVT::Other),
18227                                           Ops, MVT::i16, MMO);
18228
18229   // Load FP Control Word from stack slot
18230   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18231                             MachinePointerInfo(), false, false, false, 0);
18232
18233   // Transform as necessary
18234   SDValue CWD1 =
18235     DAG.getNode(ISD::SRL, DL, MVT::i16,
18236                 DAG.getNode(ISD::AND, DL, MVT::i16,
18237                             CWD, DAG.getConstant(0x800, MVT::i16)),
18238                 DAG.getConstant(11, MVT::i8));
18239   SDValue CWD2 =
18240     DAG.getNode(ISD::SRL, DL, MVT::i16,
18241                 DAG.getNode(ISD::AND, DL, MVT::i16,
18242                             CWD, DAG.getConstant(0x400, MVT::i16)),
18243                 DAG.getConstant(9, MVT::i8));
18244
18245   SDValue RetVal =
18246     DAG.getNode(ISD::AND, DL, MVT::i16,
18247                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18248                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18249                             DAG.getConstant(1, MVT::i16)),
18250                 DAG.getConstant(3, MVT::i16));
18251
18252   return DAG.getNode((VT.getSizeInBits() < 16 ?
18253                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18254 }
18255
18256 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18257   MVT VT = Op.getSimpleValueType();
18258   EVT OpVT = VT;
18259   unsigned NumBits = VT.getSizeInBits();
18260   SDLoc dl(Op);
18261
18262   Op = Op.getOperand(0);
18263   if (VT == MVT::i8) {
18264     // Zero extend to i32 since there is not an i8 bsr.
18265     OpVT = MVT::i32;
18266     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18267   }
18268
18269   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18270   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18271   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18272
18273   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18274   SDValue Ops[] = {
18275     Op,
18276     DAG.getConstant(NumBits+NumBits-1, OpVT),
18277     DAG.getConstant(X86::COND_E, MVT::i8),
18278     Op.getValue(1)
18279   };
18280   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18281
18282   // Finally xor with NumBits-1.
18283   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18284
18285   if (VT == MVT::i8)
18286     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18287   return Op;
18288 }
18289
18290 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18291   MVT VT = Op.getSimpleValueType();
18292   EVT OpVT = VT;
18293   unsigned NumBits = VT.getSizeInBits();
18294   SDLoc dl(Op);
18295
18296   Op = Op.getOperand(0);
18297   if (VT == MVT::i8) {
18298     // Zero extend to i32 since there is not an i8 bsr.
18299     OpVT = MVT::i32;
18300     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18301   }
18302
18303   // Issue a bsr (scan bits in reverse).
18304   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18305   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18306
18307   // And xor with NumBits-1.
18308   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18309
18310   if (VT == MVT::i8)
18311     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18312   return Op;
18313 }
18314
18315 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18316   MVT VT = Op.getSimpleValueType();
18317   unsigned NumBits = VT.getSizeInBits();
18318   SDLoc dl(Op);
18319   Op = Op.getOperand(0);
18320
18321   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18322   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18323   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18324
18325   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18326   SDValue Ops[] = {
18327     Op,
18328     DAG.getConstant(NumBits, VT),
18329     DAG.getConstant(X86::COND_E, MVT::i8),
18330     Op.getValue(1)
18331   };
18332   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18333 }
18334
18335 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18336 // ones, and then concatenate the result back.
18337 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18338   MVT VT = Op.getSimpleValueType();
18339
18340   assert(VT.is256BitVector() && VT.isInteger() &&
18341          "Unsupported value type for operation");
18342
18343   unsigned NumElems = VT.getVectorNumElements();
18344   SDLoc dl(Op);
18345
18346   // Extract the LHS vectors
18347   SDValue LHS = Op.getOperand(0);
18348   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18349   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18350
18351   // Extract the RHS vectors
18352   SDValue RHS = Op.getOperand(1);
18353   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18354   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18355
18356   MVT EltVT = VT.getVectorElementType();
18357   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18358
18359   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18360                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18361                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18362 }
18363
18364 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18365   assert(Op.getSimpleValueType().is256BitVector() &&
18366          Op.getSimpleValueType().isInteger() &&
18367          "Only handle AVX 256-bit vector integer operation");
18368   return Lower256IntArith(Op, DAG);
18369 }
18370
18371 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18372   assert(Op.getSimpleValueType().is256BitVector() &&
18373          Op.getSimpleValueType().isInteger() &&
18374          "Only handle AVX 256-bit vector integer operation");
18375   return Lower256IntArith(Op, DAG);
18376 }
18377
18378 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18379                         SelectionDAG &DAG) {
18380   SDLoc dl(Op);
18381   MVT VT = Op.getSimpleValueType();
18382
18383   // Decompose 256-bit ops into smaller 128-bit ops.
18384   if (VT.is256BitVector() && !Subtarget->hasInt256())
18385     return Lower256IntArith(Op, DAG);
18386
18387   SDValue A = Op.getOperand(0);
18388   SDValue B = Op.getOperand(1);
18389
18390   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18391   if (VT == MVT::v4i32) {
18392     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18393            "Should not custom lower when pmuldq is available!");
18394
18395     // Extract the odd parts.
18396     static const int UnpackMask[] = { 1, -1, 3, -1 };
18397     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18398     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18399
18400     // Multiply the even parts.
18401     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18402     // Now multiply odd parts.
18403     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18404
18405     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18406     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18407
18408     // Merge the two vectors back together with a shuffle. This expands into 2
18409     // shuffles.
18410     static const int ShufMask[] = { 0, 4, 2, 6 };
18411     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18412   }
18413
18414   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18415          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18416
18417   //  Ahi = psrlqi(a, 32);
18418   //  Bhi = psrlqi(b, 32);
18419   //
18420   //  AloBlo = pmuludq(a, b);
18421   //  AloBhi = pmuludq(a, Bhi);
18422   //  AhiBlo = pmuludq(Ahi, b);
18423
18424   //  AloBhi = psllqi(AloBhi, 32);
18425   //  AhiBlo = psllqi(AhiBlo, 32);
18426   //  return AloBlo + AloBhi + AhiBlo;
18427
18428   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18429   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18430
18431   // Bit cast to 32-bit vectors for MULUDQ
18432   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18433                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18434   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18435   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18436   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18437   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18438
18439   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18440   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18441   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18442
18443   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18444   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18445
18446   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18447   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18448 }
18449
18450 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18451   assert(Subtarget->isTargetWin64() && "Unexpected target");
18452   EVT VT = Op.getValueType();
18453   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18454          "Unexpected return type for lowering");
18455
18456   RTLIB::Libcall LC;
18457   bool isSigned;
18458   switch (Op->getOpcode()) {
18459   default: llvm_unreachable("Unexpected request for libcall!");
18460   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18461   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18462   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18463   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18464   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18465   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18466   }
18467
18468   SDLoc dl(Op);
18469   SDValue InChain = DAG.getEntryNode();
18470
18471   TargetLowering::ArgListTy Args;
18472   TargetLowering::ArgListEntry Entry;
18473   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18474     EVT ArgVT = Op->getOperand(i).getValueType();
18475     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18476            "Unexpected argument type for lowering");
18477     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18478     Entry.Node = StackPtr;
18479     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18480                            false, false, 16);
18481     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18482     Entry.Ty = PointerType::get(ArgTy,0);
18483     Entry.isSExt = false;
18484     Entry.isZExt = false;
18485     Args.push_back(Entry);
18486   }
18487
18488   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18489                                          getPointerTy());
18490
18491   TargetLowering::CallLoweringInfo CLI(DAG);
18492   CLI.setDebugLoc(dl).setChain(InChain)
18493     .setCallee(getLibcallCallingConv(LC),
18494                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18495                Callee, std::move(Args), 0)
18496     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18497
18498   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18499   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18500 }
18501
18502 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18503                              SelectionDAG &DAG) {
18504   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18505   EVT VT = Op0.getValueType();
18506   SDLoc dl(Op);
18507
18508   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18509          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18510
18511   // PMULxD operations multiply each even value (starting at 0) of LHS with
18512   // the related value of RHS and produce a widen result.
18513   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18514   // => <2 x i64> <ae|cg>
18515   //
18516   // In other word, to have all the results, we need to perform two PMULxD:
18517   // 1. one with the even values.
18518   // 2. one with the odd values.
18519   // To achieve #2, with need to place the odd values at an even position.
18520   //
18521   // Place the odd value at an even position (basically, shift all values 1
18522   // step to the left):
18523   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18524   // <a|b|c|d> => <b|undef|d|undef>
18525   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18526   // <e|f|g|h> => <f|undef|h|undef>
18527   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18528
18529   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18530   // ints.
18531   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18532   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18533   unsigned Opcode =
18534       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18535   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18536   // => <2 x i64> <ae|cg>
18537   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18538                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18539   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18540   // => <2 x i64> <bf|dh>
18541   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18542                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18543
18544   // Shuffle it back into the right order.
18545   SDValue Highs, Lows;
18546   if (VT == MVT::v8i32) {
18547     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18548     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18549     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18550     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18551   } else {
18552     const int HighMask[] = {1, 5, 3, 7};
18553     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18554     const int LowMask[] = {0, 4, 2, 6};
18555     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18556   }
18557
18558   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18559   // unsigned multiply.
18560   if (IsSigned && !Subtarget->hasSSE41()) {
18561     SDValue ShAmt =
18562         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18563     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18564                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18565     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18566                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18567
18568     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18569     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18570   }
18571
18572   // The first result of MUL_LOHI is actually the low value, followed by the
18573   // high value.
18574   SDValue Ops[] = {Lows, Highs};
18575   return DAG.getMergeValues(Ops, dl);
18576 }
18577
18578 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18579                                          const X86Subtarget *Subtarget) {
18580   MVT VT = Op.getSimpleValueType();
18581   SDLoc dl(Op);
18582   SDValue R = Op.getOperand(0);
18583   SDValue Amt = Op.getOperand(1);
18584
18585   // Optimize shl/srl/sra with constant shift amount.
18586   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18587     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18588       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18589
18590       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18591           (Subtarget->hasInt256() &&
18592            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18593           (Subtarget->hasAVX512() &&
18594            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18595         if (Op.getOpcode() == ISD::SHL)
18596           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18597                                             DAG);
18598         if (Op.getOpcode() == ISD::SRL)
18599           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18600                                             DAG);
18601         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18602           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18603                                             DAG);
18604       }
18605
18606       if (VT == MVT::v16i8) {
18607         if (Op.getOpcode() == ISD::SHL) {
18608           // Make a large shift.
18609           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18610                                                    MVT::v8i16, R, ShiftAmt,
18611                                                    DAG);
18612           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18613           // Zero out the rightmost bits.
18614           SmallVector<SDValue, 16> V(16,
18615                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18616                                                      MVT::i8));
18617           return DAG.getNode(ISD::AND, dl, VT, SHL,
18618                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18619         }
18620         if (Op.getOpcode() == ISD::SRL) {
18621           // Make a large shift.
18622           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18623                                                    MVT::v8i16, R, ShiftAmt,
18624                                                    DAG);
18625           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18626           // Zero out the leftmost bits.
18627           SmallVector<SDValue, 16> V(16,
18628                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18629                                                      MVT::i8));
18630           return DAG.getNode(ISD::AND, dl, VT, SRL,
18631                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18632         }
18633         if (Op.getOpcode() == ISD::SRA) {
18634           if (ShiftAmt == 7) {
18635             // R s>> 7  ===  R s< 0
18636             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18637             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18638           }
18639
18640           // R s>> a === ((R u>> a) ^ m) - m
18641           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18642           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18643                                                          MVT::i8));
18644           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18645           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18646           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18647           return Res;
18648         }
18649         llvm_unreachable("Unknown shift opcode.");
18650       }
18651
18652       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18653         if (Op.getOpcode() == ISD::SHL) {
18654           // Make a large shift.
18655           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18656                                                    MVT::v16i16, R, ShiftAmt,
18657                                                    DAG);
18658           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18659           // Zero out the rightmost bits.
18660           SmallVector<SDValue, 32> V(32,
18661                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18662                                                      MVT::i8));
18663           return DAG.getNode(ISD::AND, dl, VT, SHL,
18664                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18665         }
18666         if (Op.getOpcode() == ISD::SRL) {
18667           // Make a large shift.
18668           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18669                                                    MVT::v16i16, R, ShiftAmt,
18670                                                    DAG);
18671           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18672           // Zero out the leftmost bits.
18673           SmallVector<SDValue, 32> V(32,
18674                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18675                                                      MVT::i8));
18676           return DAG.getNode(ISD::AND, dl, VT, SRL,
18677                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18678         }
18679         if (Op.getOpcode() == ISD::SRA) {
18680           if (ShiftAmt == 7) {
18681             // R s>> 7  ===  R s< 0
18682             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18683             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18684           }
18685
18686           // R s>> a === ((R u>> a) ^ m) - m
18687           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18688           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18689                                                          MVT::i8));
18690           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18691           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18692           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18693           return Res;
18694         }
18695         llvm_unreachable("Unknown shift opcode.");
18696       }
18697     }
18698   }
18699
18700   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18701   if (!Subtarget->is64Bit() &&
18702       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18703       Amt.getOpcode() == ISD::BITCAST &&
18704       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18705     Amt = Amt.getOperand(0);
18706     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18707                      VT.getVectorNumElements();
18708     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18709     uint64_t ShiftAmt = 0;
18710     for (unsigned i = 0; i != Ratio; ++i) {
18711       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18712       if (!C)
18713         return SDValue();
18714       // 6 == Log2(64)
18715       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18716     }
18717     // Check remaining shift amounts.
18718     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18719       uint64_t ShAmt = 0;
18720       for (unsigned j = 0; j != Ratio; ++j) {
18721         ConstantSDNode *C =
18722           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18723         if (!C)
18724           return SDValue();
18725         // 6 == Log2(64)
18726         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18727       }
18728       if (ShAmt != ShiftAmt)
18729         return SDValue();
18730     }
18731     switch (Op.getOpcode()) {
18732     default:
18733       llvm_unreachable("Unknown shift opcode!");
18734     case ISD::SHL:
18735       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18736                                         DAG);
18737     case ISD::SRL:
18738       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18739                                         DAG);
18740     case ISD::SRA:
18741       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18742                                         DAG);
18743     }
18744   }
18745
18746   return SDValue();
18747 }
18748
18749 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18750                                         const X86Subtarget* Subtarget) {
18751   MVT VT = Op.getSimpleValueType();
18752   SDLoc dl(Op);
18753   SDValue R = Op.getOperand(0);
18754   SDValue Amt = Op.getOperand(1);
18755
18756   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18757       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18758       (Subtarget->hasInt256() &&
18759        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18760         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18761        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18762     SDValue BaseShAmt;
18763     EVT EltVT = VT.getVectorElementType();
18764
18765     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18766       // Check if this build_vector node is doing a splat.
18767       // If so, then set BaseShAmt equal to the splat value.
18768       BaseShAmt = BV->getSplatValue();
18769       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18770         BaseShAmt = SDValue();
18771     } else {
18772       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18773         Amt = Amt.getOperand(0);
18774
18775       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18776       if (SVN && SVN->isSplat()) {
18777         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18778         SDValue InVec = Amt.getOperand(0);
18779         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18780           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18781                  "Unexpected shuffle index found!");
18782           BaseShAmt = InVec.getOperand(SplatIdx);
18783         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18784            if (ConstantSDNode *C =
18785                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18786              if (C->getZExtValue() == SplatIdx)
18787                BaseShAmt = InVec.getOperand(1);
18788            }
18789         }
18790
18791         if (!BaseShAmt)
18792           // Avoid introducing an extract element from a shuffle.
18793           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18794                                     DAG.getIntPtrConstant(SplatIdx));
18795       }
18796     }
18797
18798     if (BaseShAmt.getNode()) {
18799       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18800       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18801         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18802       else if (EltVT.bitsLT(MVT::i32))
18803         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18804
18805       switch (Op.getOpcode()) {
18806       default:
18807         llvm_unreachable("Unknown shift opcode!");
18808       case ISD::SHL:
18809         switch (VT.SimpleTy) {
18810         default: return SDValue();
18811         case MVT::v2i64:
18812         case MVT::v4i32:
18813         case MVT::v8i16:
18814         case MVT::v4i64:
18815         case MVT::v8i32:
18816         case MVT::v16i16:
18817         case MVT::v16i32:
18818         case MVT::v8i64:
18819           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18820         }
18821       case ISD::SRA:
18822         switch (VT.SimpleTy) {
18823         default: return SDValue();
18824         case MVT::v4i32:
18825         case MVT::v8i16:
18826         case MVT::v8i32:
18827         case MVT::v16i16:
18828         case MVT::v16i32:
18829         case MVT::v8i64:
18830           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18831         }
18832       case ISD::SRL:
18833         switch (VT.SimpleTy) {
18834         default: return SDValue();
18835         case MVT::v2i64:
18836         case MVT::v4i32:
18837         case MVT::v8i16:
18838         case MVT::v4i64:
18839         case MVT::v8i32:
18840         case MVT::v16i16:
18841         case MVT::v16i32:
18842         case MVT::v8i64:
18843           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18844         }
18845       }
18846     }
18847   }
18848
18849   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18850   if (!Subtarget->is64Bit() &&
18851       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18852       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18853       Amt.getOpcode() == ISD::BITCAST &&
18854       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18855     Amt = Amt.getOperand(0);
18856     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18857                      VT.getVectorNumElements();
18858     std::vector<SDValue> Vals(Ratio);
18859     for (unsigned i = 0; i != Ratio; ++i)
18860       Vals[i] = Amt.getOperand(i);
18861     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18862       for (unsigned j = 0; j != Ratio; ++j)
18863         if (Vals[j] != Amt.getOperand(i + j))
18864           return SDValue();
18865     }
18866     switch (Op.getOpcode()) {
18867     default:
18868       llvm_unreachable("Unknown shift opcode!");
18869     case ISD::SHL:
18870       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18871     case ISD::SRL:
18872       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18873     case ISD::SRA:
18874       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18875     }
18876   }
18877
18878   return SDValue();
18879 }
18880
18881 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18882                           SelectionDAG &DAG) {
18883   MVT VT = Op.getSimpleValueType();
18884   SDLoc dl(Op);
18885   SDValue R = Op.getOperand(0);
18886   SDValue Amt = Op.getOperand(1);
18887   SDValue V;
18888
18889   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18890   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18891
18892   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18893   if (V.getNode())
18894     return V;
18895
18896   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18897   if (V.getNode())
18898       return V;
18899
18900   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18901     return Op;
18902   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18903   if (Subtarget->hasInt256()) {
18904     if (Op.getOpcode() == ISD::SRL &&
18905         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18906          VT == MVT::v4i64 || VT == MVT::v8i32))
18907       return Op;
18908     if (Op.getOpcode() == ISD::SHL &&
18909         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18910          VT == MVT::v4i64 || VT == MVT::v8i32))
18911       return Op;
18912     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18913       return Op;
18914   }
18915
18916   // If possible, lower this packed shift into a vector multiply instead of
18917   // expanding it into a sequence of scalar shifts.
18918   // Do this only if the vector shift count is a constant build_vector.
18919   if (Op.getOpcode() == ISD::SHL &&
18920       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18921        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18922       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18923     SmallVector<SDValue, 8> Elts;
18924     EVT SVT = VT.getScalarType();
18925     unsigned SVTBits = SVT.getSizeInBits();
18926     const APInt &One = APInt(SVTBits, 1);
18927     unsigned NumElems = VT.getVectorNumElements();
18928
18929     for (unsigned i=0; i !=NumElems; ++i) {
18930       SDValue Op = Amt->getOperand(i);
18931       if (Op->getOpcode() == ISD::UNDEF) {
18932         Elts.push_back(Op);
18933         continue;
18934       }
18935
18936       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18937       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18938       uint64_t ShAmt = C.getZExtValue();
18939       if (ShAmt >= SVTBits) {
18940         Elts.push_back(DAG.getUNDEF(SVT));
18941         continue;
18942       }
18943       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18944     }
18945     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18946     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18947   }
18948
18949   // Lower SHL with variable shift amount.
18950   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18951     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18952
18953     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18954     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18955     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18956     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18957   }
18958
18959   // If possible, lower this shift as a sequence of two shifts by
18960   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18961   // Example:
18962   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18963   //
18964   // Could be rewritten as:
18965   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18966   //
18967   // The advantage is that the two shifts from the example would be
18968   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18969   // the vector shift into four scalar shifts plus four pairs of vector
18970   // insert/extract.
18971   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18972       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18973     unsigned TargetOpcode = X86ISD::MOVSS;
18974     bool CanBeSimplified;
18975     // The splat value for the first packed shift (the 'X' from the example).
18976     SDValue Amt1 = Amt->getOperand(0);
18977     // The splat value for the second packed shift (the 'Y' from the example).
18978     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18979                                         Amt->getOperand(2);
18980
18981     // See if it is possible to replace this node with a sequence of
18982     // two shifts followed by a MOVSS/MOVSD
18983     if (VT == MVT::v4i32) {
18984       // Check if it is legal to use a MOVSS.
18985       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18986                         Amt2 == Amt->getOperand(3);
18987       if (!CanBeSimplified) {
18988         // Otherwise, check if we can still simplify this node using a MOVSD.
18989         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18990                           Amt->getOperand(2) == Amt->getOperand(3);
18991         TargetOpcode = X86ISD::MOVSD;
18992         Amt2 = Amt->getOperand(2);
18993       }
18994     } else {
18995       // Do similar checks for the case where the machine value type
18996       // is MVT::v8i16.
18997       CanBeSimplified = Amt1 == Amt->getOperand(1);
18998       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18999         CanBeSimplified = Amt2 == Amt->getOperand(i);
19000
19001       if (!CanBeSimplified) {
19002         TargetOpcode = X86ISD::MOVSD;
19003         CanBeSimplified = true;
19004         Amt2 = Amt->getOperand(4);
19005         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19006           CanBeSimplified = Amt1 == Amt->getOperand(i);
19007         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19008           CanBeSimplified = Amt2 == Amt->getOperand(j);
19009       }
19010     }
19011
19012     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19013         isa<ConstantSDNode>(Amt2)) {
19014       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19015       EVT CastVT = MVT::v4i32;
19016       SDValue Splat1 =
19017         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19018       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19019       SDValue Splat2 =
19020         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19021       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19022       if (TargetOpcode == X86ISD::MOVSD)
19023         CastVT = MVT::v2i64;
19024       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19025       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19026       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19027                                             BitCast1, DAG);
19028       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19029     }
19030   }
19031
19032   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19033     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19034
19035     // a = a << 5;
19036     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
19037     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19038
19039     // Turn 'a' into a mask suitable for VSELECT
19040     SDValue VSelM = DAG.getConstant(0x80, VT);
19041     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19042     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19043
19044     SDValue CM1 = DAG.getConstant(0x0f, VT);
19045     SDValue CM2 = DAG.getConstant(0x3f, VT);
19046
19047     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19048     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19049     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19050     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19051     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19052
19053     // a += a
19054     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19055     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19056     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19057
19058     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19059     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19060     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19061     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19062     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19063
19064     // a += a
19065     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19066     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19067     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19068
19069     // return VSELECT(r, r+r, a);
19070     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19071                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19072     return R;
19073   }
19074
19075   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19076   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19077   // solution better.
19078   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
19079     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
19080     unsigned ExtOpc =
19081         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19082     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19083     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19084     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19085                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19086     }
19087
19088   // Decompose 256-bit shifts into smaller 128-bit shifts.
19089   if (VT.is256BitVector()) {
19090     unsigned NumElems = VT.getVectorNumElements();
19091     MVT EltVT = VT.getVectorElementType();
19092     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19093
19094     // Extract the two vectors
19095     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19096     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19097
19098     // Recreate the shift amount vectors
19099     SDValue Amt1, Amt2;
19100     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19101       // Constant shift amount
19102       SmallVector<SDValue, 4> Amt1Csts;
19103       SmallVector<SDValue, 4> Amt2Csts;
19104       for (unsigned i = 0; i != NumElems/2; ++i)
19105         Amt1Csts.push_back(Amt->getOperand(i));
19106       for (unsigned i = NumElems/2; i != NumElems; ++i)
19107         Amt2Csts.push_back(Amt->getOperand(i));
19108
19109       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19110       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19111     } else {
19112       // Variable shift amount
19113       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19114       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19115     }
19116
19117     // Issue new vector shifts for the smaller types
19118     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19119     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19120
19121     // Concatenate the result back
19122     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19123   }
19124
19125   return SDValue();
19126 }
19127
19128 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19129   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19130   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19131   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19132   // has only one use.
19133   SDNode *N = Op.getNode();
19134   SDValue LHS = N->getOperand(0);
19135   SDValue RHS = N->getOperand(1);
19136   unsigned BaseOp = 0;
19137   unsigned Cond = 0;
19138   SDLoc DL(Op);
19139   switch (Op.getOpcode()) {
19140   default: llvm_unreachable("Unknown ovf instruction!");
19141   case ISD::SADDO:
19142     // A subtract of one will be selected as a INC. Note that INC doesn't
19143     // set CF, so we can't do this for UADDO.
19144     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19145       if (C->isOne()) {
19146         BaseOp = X86ISD::INC;
19147         Cond = X86::COND_O;
19148         break;
19149       }
19150     BaseOp = X86ISD::ADD;
19151     Cond = X86::COND_O;
19152     break;
19153   case ISD::UADDO:
19154     BaseOp = X86ISD::ADD;
19155     Cond = X86::COND_B;
19156     break;
19157   case ISD::SSUBO:
19158     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19159     // set CF, so we can't do this for USUBO.
19160     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19161       if (C->isOne()) {
19162         BaseOp = X86ISD::DEC;
19163         Cond = X86::COND_O;
19164         break;
19165       }
19166     BaseOp = X86ISD::SUB;
19167     Cond = X86::COND_O;
19168     break;
19169   case ISD::USUBO:
19170     BaseOp = X86ISD::SUB;
19171     Cond = X86::COND_B;
19172     break;
19173   case ISD::SMULO:
19174     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19175     Cond = X86::COND_O;
19176     break;
19177   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19178     if (N->getValueType(0) == MVT::i8) {
19179       BaseOp = X86ISD::UMUL8;
19180       Cond = X86::COND_O;
19181       break;
19182     }
19183     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19184                                  MVT::i32);
19185     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19186
19187     SDValue SetCC =
19188       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19189                   DAG.getConstant(X86::COND_O, MVT::i32),
19190                   SDValue(Sum.getNode(), 2));
19191
19192     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19193   }
19194   }
19195
19196   // Also sets EFLAGS.
19197   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19198   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19199
19200   SDValue SetCC =
19201     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19202                 DAG.getConstant(Cond, MVT::i32),
19203                 SDValue(Sum.getNode(), 1));
19204
19205   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19206 }
19207
19208 // Sign extension of the low part of vector elements. This may be used either
19209 // when sign extend instructions are not available or if the vector element
19210 // sizes already match the sign-extended size. If the vector elements are in
19211 // their pre-extended size and sign extend instructions are available, that will
19212 // be handled by LowerSIGN_EXTEND.
19213 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19214                                                   SelectionDAG &DAG) const {
19215   SDLoc dl(Op);
19216   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19217   MVT VT = Op.getSimpleValueType();
19218
19219   if (!Subtarget->hasSSE2() || !VT.isVector())
19220     return SDValue();
19221
19222   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19223                       ExtraVT.getScalarType().getSizeInBits();
19224
19225   switch (VT.SimpleTy) {
19226     default: return SDValue();
19227     case MVT::v8i32:
19228     case MVT::v16i16:
19229       if (!Subtarget->hasFp256())
19230         return SDValue();
19231       if (!Subtarget->hasInt256()) {
19232         // needs to be split
19233         unsigned NumElems = VT.getVectorNumElements();
19234
19235         // Extract the LHS vectors
19236         SDValue LHS = Op.getOperand(0);
19237         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19238         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19239
19240         MVT EltVT = VT.getVectorElementType();
19241         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19242
19243         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19244         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19245         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19246                                    ExtraNumElems/2);
19247         SDValue Extra = DAG.getValueType(ExtraVT);
19248
19249         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19250         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19251
19252         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19253       }
19254       // fall through
19255     case MVT::v4i32:
19256     case MVT::v8i16: {
19257       SDValue Op0 = Op.getOperand(0);
19258
19259       // This is a sign extension of some low part of vector elements without
19260       // changing the size of the vector elements themselves:
19261       // Shift-Left + Shift-Right-Algebraic.
19262       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19263                                                BitsDiff, DAG);
19264       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19265                                         DAG);
19266     }
19267   }
19268 }
19269
19270 /// Returns true if the operand type is exactly twice the native width, and
19271 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19272 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19273 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19274 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19275   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19276
19277   if (OpWidth == 64)
19278     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19279   else if (OpWidth == 128)
19280     return Subtarget->hasCmpxchg16b();
19281   else
19282     return false;
19283 }
19284
19285 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19286   return needsCmpXchgNb(SI->getValueOperand()->getType());
19287 }
19288
19289 // Note: this turns large loads into lock cmpxchg8b/16b.
19290 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19291 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19292   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19293   return needsCmpXchgNb(PTy->getElementType());
19294 }
19295
19296 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19297   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19298   const Type *MemType = AI->getType();
19299
19300   // If the operand is too big, we must see if cmpxchg8/16b is available
19301   // and default to library calls otherwise.
19302   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19303     return needsCmpXchgNb(MemType);
19304
19305   AtomicRMWInst::BinOp Op = AI->getOperation();
19306   switch (Op) {
19307   default:
19308     llvm_unreachable("Unknown atomic operation");
19309   case AtomicRMWInst::Xchg:
19310   case AtomicRMWInst::Add:
19311   case AtomicRMWInst::Sub:
19312     // It's better to use xadd, xsub or xchg for these in all cases.
19313     return false;
19314   case AtomicRMWInst::Or:
19315   case AtomicRMWInst::And:
19316   case AtomicRMWInst::Xor:
19317     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19318     // prefix to a normal instruction for these operations.
19319     return !AI->use_empty();
19320   case AtomicRMWInst::Nand:
19321   case AtomicRMWInst::Max:
19322   case AtomicRMWInst::Min:
19323   case AtomicRMWInst::UMax:
19324   case AtomicRMWInst::UMin:
19325     // These always require a non-trivial set of data operations on x86. We must
19326     // use a cmpxchg loop.
19327     return true;
19328   }
19329 }
19330
19331 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19332   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19333   // no-sse2). There isn't any reason to disable it if the target processor
19334   // supports it.
19335   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19336 }
19337
19338 LoadInst *
19339 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19340   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19341   const Type *MemType = AI->getType();
19342   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19343   // there is no benefit in turning such RMWs into loads, and it is actually
19344   // harmful as it introduces a mfence.
19345   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19346     return nullptr;
19347
19348   auto Builder = IRBuilder<>(AI);
19349   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19350   auto SynchScope = AI->getSynchScope();
19351   // We must restrict the ordering to avoid generating loads with Release or
19352   // ReleaseAcquire orderings.
19353   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19354   auto Ptr = AI->getPointerOperand();
19355
19356   // Before the load we need a fence. Here is an example lifted from
19357   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19358   // is required:
19359   // Thread 0:
19360   //   x.store(1, relaxed);
19361   //   r1 = y.fetch_add(0, release);
19362   // Thread 1:
19363   //   y.fetch_add(42, acquire);
19364   //   r2 = x.load(relaxed);
19365   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19366   // lowered to just a load without a fence. A mfence flushes the store buffer,
19367   // making the optimization clearly correct.
19368   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19369   // otherwise, we might be able to be more agressive on relaxed idempotent
19370   // rmw. In practice, they do not look useful, so we don't try to be
19371   // especially clever.
19372   if (SynchScope == SingleThread) {
19373     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19374     // the IR level, so we must wrap it in an intrinsic.
19375     return nullptr;
19376   } else if (hasMFENCE(*Subtarget)) {
19377     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19378             Intrinsic::x86_sse2_mfence);
19379     Builder.CreateCall(MFence);
19380   } else {
19381     // FIXME: it might make sense to use a locked operation here but on a
19382     // different cache-line to prevent cache-line bouncing. In practice it
19383     // is probably a small win, and x86 processors without mfence are rare
19384     // enough that we do not bother.
19385     return nullptr;
19386   }
19387
19388   // Finally we can emit the atomic load.
19389   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19390           AI->getType()->getPrimitiveSizeInBits());
19391   Loaded->setAtomic(Order, SynchScope);
19392   AI->replaceAllUsesWith(Loaded);
19393   AI->eraseFromParent();
19394   return Loaded;
19395 }
19396
19397 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19398                                  SelectionDAG &DAG) {
19399   SDLoc dl(Op);
19400   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19401     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19402   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19403     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19404
19405   // The only fence that needs an instruction is a sequentially-consistent
19406   // cross-thread fence.
19407   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19408     if (hasMFENCE(*Subtarget))
19409       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19410
19411     SDValue Chain = Op.getOperand(0);
19412     SDValue Zero = DAG.getConstant(0, MVT::i32);
19413     SDValue Ops[] = {
19414       DAG.getRegister(X86::ESP, MVT::i32), // Base
19415       DAG.getTargetConstant(1, MVT::i8),   // Scale
19416       DAG.getRegister(0, MVT::i32),        // Index
19417       DAG.getTargetConstant(0, MVT::i32),  // Disp
19418       DAG.getRegister(0, MVT::i32),        // Segment.
19419       Zero,
19420       Chain
19421     };
19422     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19423     return SDValue(Res, 0);
19424   }
19425
19426   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19427   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19428 }
19429
19430 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19431                              SelectionDAG &DAG) {
19432   MVT T = Op.getSimpleValueType();
19433   SDLoc DL(Op);
19434   unsigned Reg = 0;
19435   unsigned size = 0;
19436   switch(T.SimpleTy) {
19437   default: llvm_unreachable("Invalid value type!");
19438   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19439   case MVT::i16: Reg = X86::AX;  size = 2; break;
19440   case MVT::i32: Reg = X86::EAX; size = 4; break;
19441   case MVT::i64:
19442     assert(Subtarget->is64Bit() && "Node not type legal!");
19443     Reg = X86::RAX; size = 8;
19444     break;
19445   }
19446   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19447                                   Op.getOperand(2), SDValue());
19448   SDValue Ops[] = { cpIn.getValue(0),
19449                     Op.getOperand(1),
19450                     Op.getOperand(3),
19451                     DAG.getTargetConstant(size, MVT::i8),
19452                     cpIn.getValue(1) };
19453   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19454   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19455   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19456                                            Ops, T, MMO);
19457
19458   SDValue cpOut =
19459     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19460   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19461                                       MVT::i32, cpOut.getValue(2));
19462   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19463                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19464
19465   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19466   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19467   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19468   return SDValue();
19469 }
19470
19471 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19472                             SelectionDAG &DAG) {
19473   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19474   MVT DstVT = Op.getSimpleValueType();
19475
19476   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19477     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19478     if (DstVT != MVT::f64)
19479       // This conversion needs to be expanded.
19480       return SDValue();
19481
19482     SDValue InVec = Op->getOperand(0);
19483     SDLoc dl(Op);
19484     unsigned NumElts = SrcVT.getVectorNumElements();
19485     EVT SVT = SrcVT.getVectorElementType();
19486
19487     // Widen the vector in input in the case of MVT::v2i32.
19488     // Example: from MVT::v2i32 to MVT::v4i32.
19489     SmallVector<SDValue, 16> Elts;
19490     for (unsigned i = 0, e = NumElts; i != e; ++i)
19491       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19492                                  DAG.getIntPtrConstant(i)));
19493
19494     // Explicitly mark the extra elements as Undef.
19495     SDValue Undef = DAG.getUNDEF(SVT);
19496     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19497       Elts.push_back(Undef);
19498
19499     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19500     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19501     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19502     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19503                        DAG.getIntPtrConstant(0));
19504   }
19505
19506   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19507          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19508   assert((DstVT == MVT::i64 ||
19509           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19510          "Unexpected custom BITCAST");
19511   // i64 <=> MMX conversions are Legal.
19512   if (SrcVT==MVT::i64 && DstVT.isVector())
19513     return Op;
19514   if (DstVT==MVT::i64 && SrcVT.isVector())
19515     return Op;
19516   // MMX <=> MMX conversions are Legal.
19517   if (SrcVT.isVector() && DstVT.isVector())
19518     return Op;
19519   // All other conversions need to be expanded.
19520   return SDValue();
19521 }
19522
19523 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19524                           SelectionDAG &DAG) {
19525   SDNode *Node = Op.getNode();
19526   SDLoc dl(Node);
19527
19528   Op = Op.getOperand(0);
19529   EVT VT = Op.getValueType();
19530   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19531          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19532
19533   unsigned NumElts = VT.getVectorNumElements();
19534   EVT EltVT = VT.getVectorElementType();
19535   unsigned Len = EltVT.getSizeInBits();
19536
19537   // This is the vectorized version of the "best" algorithm from
19538   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19539   // with a minor tweak to use a series of adds + shifts instead of vector
19540   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19541   //
19542   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19543   //  v8i32 => Always profitable
19544   //
19545   // FIXME: There a couple of possible improvements:
19546   //
19547   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19548   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19549   //
19550   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19551          "CTPOP not implemented for this vector element type.");
19552
19553   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19554   // extra legalization.
19555   bool NeedsBitcast = EltVT == MVT::i32;
19556   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19557
19558   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19559   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19560   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19561
19562   // v = v - ((v >> 1) & 0x55555555...)
19563   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19564   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19565   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19566   if (NeedsBitcast)
19567     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19568
19569   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19570   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19571   if (NeedsBitcast)
19572     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19573
19574   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19575   if (VT != And.getValueType())
19576     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19577   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19578
19579   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19580   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19581   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19582   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19583   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19584
19585   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19586   if (NeedsBitcast) {
19587     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19588     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19589     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19590   }
19591
19592   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19593   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19594   if (VT != AndRHS.getValueType()) {
19595     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19596     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19597   }
19598   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19599
19600   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19601   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19602   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19603   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19604   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19605
19606   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19607   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19608   if (NeedsBitcast) {
19609     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19610     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19611   }
19612   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19613   if (VT != And.getValueType())
19614     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19615
19616   // The algorithm mentioned above uses:
19617   //    v = (v * 0x01010101...) >> (Len - 8)
19618   //
19619   // Change it to use vector adds + vector shifts which yield faster results on
19620   // Haswell than using vector integer multiplication.
19621   //
19622   // For i32 elements:
19623   //    v = v + (v >> 8)
19624   //    v = v + (v >> 16)
19625   //
19626   // For i64 elements:
19627   //    v = v + (v >> 8)
19628   //    v = v + (v >> 16)
19629   //    v = v + (v >> 32)
19630   //
19631   Add = And;
19632   SmallVector<SDValue, 8> Csts;
19633   for (unsigned i = 8; i <= Len/2; i *= 2) {
19634     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19635     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19636     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19637     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19638     Csts.clear();
19639   }
19640
19641   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19642   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19643   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19644   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19645   if (NeedsBitcast) {
19646     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19647     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19648   }
19649   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19650   if (VT != And.getValueType())
19651     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19652
19653   return And;
19654 }
19655
19656 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19657   SDNode *Node = Op.getNode();
19658   SDLoc dl(Node);
19659   EVT T = Node->getValueType(0);
19660   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19661                               DAG.getConstant(0, T), Node->getOperand(2));
19662   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19663                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19664                        Node->getOperand(0),
19665                        Node->getOperand(1), negOp,
19666                        cast<AtomicSDNode>(Node)->getMemOperand(),
19667                        cast<AtomicSDNode>(Node)->getOrdering(),
19668                        cast<AtomicSDNode>(Node)->getSynchScope());
19669 }
19670
19671 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19672   SDNode *Node = Op.getNode();
19673   SDLoc dl(Node);
19674   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19675
19676   // Convert seq_cst store -> xchg
19677   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19678   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19679   //        (The only way to get a 16-byte store is cmpxchg16b)
19680   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19681   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19682       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19683     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19684                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19685                                  Node->getOperand(0),
19686                                  Node->getOperand(1), Node->getOperand(2),
19687                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19688                                  cast<AtomicSDNode>(Node)->getOrdering(),
19689                                  cast<AtomicSDNode>(Node)->getSynchScope());
19690     return Swap.getValue(1);
19691   }
19692   // Other atomic stores have a simple pattern.
19693   return Op;
19694 }
19695
19696 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19697   EVT VT = Op.getNode()->getSimpleValueType(0);
19698
19699   // Let legalize expand this if it isn't a legal type yet.
19700   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19701     return SDValue();
19702
19703   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19704
19705   unsigned Opc;
19706   bool ExtraOp = false;
19707   switch (Op.getOpcode()) {
19708   default: llvm_unreachable("Invalid code");
19709   case ISD::ADDC: Opc = X86ISD::ADD; break;
19710   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19711   case ISD::SUBC: Opc = X86ISD::SUB; break;
19712   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19713   }
19714
19715   if (!ExtraOp)
19716     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19717                        Op.getOperand(1));
19718   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19719                      Op.getOperand(1), Op.getOperand(2));
19720 }
19721
19722 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19723                             SelectionDAG &DAG) {
19724   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19725
19726   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19727   // which returns the values as { float, float } (in XMM0) or
19728   // { double, double } (which is returned in XMM0, XMM1).
19729   SDLoc dl(Op);
19730   SDValue Arg = Op.getOperand(0);
19731   EVT ArgVT = Arg.getValueType();
19732   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19733
19734   TargetLowering::ArgListTy Args;
19735   TargetLowering::ArgListEntry Entry;
19736
19737   Entry.Node = Arg;
19738   Entry.Ty = ArgTy;
19739   Entry.isSExt = false;
19740   Entry.isZExt = false;
19741   Args.push_back(Entry);
19742
19743   bool isF64 = ArgVT == MVT::f64;
19744   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19745   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19746   // the results are returned via SRet in memory.
19747   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19748   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19749   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19750
19751   Type *RetTy = isF64
19752     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19753     : (Type*)VectorType::get(ArgTy, 4);
19754
19755   TargetLowering::CallLoweringInfo CLI(DAG);
19756   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19757     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19758
19759   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19760
19761   if (isF64)
19762     // Returned in xmm0 and xmm1.
19763     return CallResult.first;
19764
19765   // Returned in bits 0:31 and 32:64 xmm0.
19766   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19767                                CallResult.first, DAG.getIntPtrConstant(0));
19768   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19769                                CallResult.first, DAG.getIntPtrConstant(1));
19770   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19771   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19772 }
19773
19774 /// LowerOperation - Provide custom lowering hooks for some operations.
19775 ///
19776 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19777   switch (Op.getOpcode()) {
19778   default: llvm_unreachable("Should not custom lower this!");
19779   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19780   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19781   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19782     return LowerCMP_SWAP(Op, Subtarget, DAG);
19783   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19784   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19785   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19786   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19787   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19788   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19789   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19790   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19791   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19792   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19793   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19794   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19795   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19796   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19797   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19798   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19799   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19800   case ISD::SHL_PARTS:
19801   case ISD::SRA_PARTS:
19802   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19803   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19804   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19805   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19806   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19807   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19808   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19809   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19810   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19811   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19812   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19813   case ISD::FABS:
19814   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19815   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19816   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19817   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19818   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19819   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19820   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19821   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19822   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19823   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19824   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19825   case ISD::INTRINSIC_VOID:
19826   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19827   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19828   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19829   case ISD::FRAME_TO_ARGS_OFFSET:
19830                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19831   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19832   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19833   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19834   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19835   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19836   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19837   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19838   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19839   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19840   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19841   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19842   case ISD::UMUL_LOHI:
19843   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19844   case ISD::SRA:
19845   case ISD::SRL:
19846   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19847   case ISD::SADDO:
19848   case ISD::UADDO:
19849   case ISD::SSUBO:
19850   case ISD::USUBO:
19851   case ISD::SMULO:
19852   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19853   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19854   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19855   case ISD::ADDC:
19856   case ISD::ADDE:
19857   case ISD::SUBC:
19858   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19859   case ISD::ADD:                return LowerADD(Op, DAG);
19860   case ISD::SUB:                return LowerSUB(Op, DAG);
19861   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19862   }
19863 }
19864
19865 /// ReplaceNodeResults - Replace a node with an illegal result type
19866 /// with a new node built out of custom code.
19867 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19868                                            SmallVectorImpl<SDValue>&Results,
19869                                            SelectionDAG &DAG) const {
19870   SDLoc dl(N);
19871   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19872   switch (N->getOpcode()) {
19873   default:
19874     llvm_unreachable("Do not know how to custom type legalize this operation!");
19875   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19876   case X86ISD::FMINC:
19877   case X86ISD::FMIN:
19878   case X86ISD::FMAXC:
19879   case X86ISD::FMAX: {
19880     EVT VT = N->getValueType(0);
19881     if (VT != MVT::v2f32)
19882       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19883     SDValue UNDEF = DAG.getUNDEF(VT);
19884     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19885                               N->getOperand(0), UNDEF);
19886     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19887                               N->getOperand(1), UNDEF);
19888     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19889     return;
19890   }
19891   case ISD::SIGN_EXTEND_INREG:
19892   case ISD::ADDC:
19893   case ISD::ADDE:
19894   case ISD::SUBC:
19895   case ISD::SUBE:
19896     // We don't want to expand or promote these.
19897     return;
19898   case ISD::SDIV:
19899   case ISD::UDIV:
19900   case ISD::SREM:
19901   case ISD::UREM:
19902   case ISD::SDIVREM:
19903   case ISD::UDIVREM: {
19904     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19905     Results.push_back(V);
19906     return;
19907   }
19908   case ISD::FP_TO_SINT:
19909   case ISD::FP_TO_UINT: {
19910     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19911
19912     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19913       return;
19914
19915     std::pair<SDValue,SDValue> Vals =
19916         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19917     SDValue FIST = Vals.first, StackSlot = Vals.second;
19918     if (FIST.getNode()) {
19919       EVT VT = N->getValueType(0);
19920       // Return a load from the stack slot.
19921       if (StackSlot.getNode())
19922         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19923                                       MachinePointerInfo(),
19924                                       false, false, false, 0));
19925       else
19926         Results.push_back(FIST);
19927     }
19928     return;
19929   }
19930   case ISD::UINT_TO_FP: {
19931     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19932     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19933         N->getValueType(0) != MVT::v2f32)
19934       return;
19935     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19936                                  N->getOperand(0));
19937     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19938                                      MVT::f64);
19939     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19940     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19941                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19942     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19943     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19944     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19945     return;
19946   }
19947   case ISD::FP_ROUND: {
19948     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19949         return;
19950     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19951     Results.push_back(V);
19952     return;
19953   }
19954   case ISD::INTRINSIC_W_CHAIN: {
19955     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19956     switch (IntNo) {
19957     default : llvm_unreachable("Do not know how to custom type "
19958                                "legalize this intrinsic operation!");
19959     case Intrinsic::x86_rdtsc:
19960       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19961                                      Results);
19962     case Intrinsic::x86_rdtscp:
19963       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19964                                      Results);
19965     case Intrinsic::x86_rdpmc:
19966       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19967     }
19968   }
19969   case ISD::READCYCLECOUNTER: {
19970     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19971                                    Results);
19972   }
19973   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19974     EVT T = N->getValueType(0);
19975     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19976     bool Regs64bit = T == MVT::i128;
19977     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19978     SDValue cpInL, cpInH;
19979     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19980                         DAG.getConstant(0, HalfT));
19981     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19982                         DAG.getConstant(1, HalfT));
19983     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19984                              Regs64bit ? X86::RAX : X86::EAX,
19985                              cpInL, SDValue());
19986     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19987                              Regs64bit ? X86::RDX : X86::EDX,
19988                              cpInH, cpInL.getValue(1));
19989     SDValue swapInL, swapInH;
19990     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19991                           DAG.getConstant(0, HalfT));
19992     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19993                           DAG.getConstant(1, HalfT));
19994     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19995                                Regs64bit ? X86::RBX : X86::EBX,
19996                                swapInL, cpInH.getValue(1));
19997     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19998                                Regs64bit ? X86::RCX : X86::ECX,
19999                                swapInH, swapInL.getValue(1));
20000     SDValue Ops[] = { swapInH.getValue(0),
20001                       N->getOperand(1),
20002                       swapInH.getValue(1) };
20003     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20004     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20005     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20006                                   X86ISD::LCMPXCHG8_DAG;
20007     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
20008     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20009                                         Regs64bit ? X86::RAX : X86::EAX,
20010                                         HalfT, Result.getValue(1));
20011     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20012                                         Regs64bit ? X86::RDX : X86::EDX,
20013                                         HalfT, cpOutL.getValue(2));
20014     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20015
20016     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20017                                         MVT::i32, cpOutH.getValue(2));
20018     SDValue Success =
20019         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20020                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20021     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20022
20023     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20024     Results.push_back(Success);
20025     Results.push_back(EFLAGS.getValue(1));
20026     return;
20027   }
20028   case ISD::ATOMIC_SWAP:
20029   case ISD::ATOMIC_LOAD_ADD:
20030   case ISD::ATOMIC_LOAD_SUB:
20031   case ISD::ATOMIC_LOAD_AND:
20032   case ISD::ATOMIC_LOAD_OR:
20033   case ISD::ATOMIC_LOAD_XOR:
20034   case ISD::ATOMIC_LOAD_NAND:
20035   case ISD::ATOMIC_LOAD_MIN:
20036   case ISD::ATOMIC_LOAD_MAX:
20037   case ISD::ATOMIC_LOAD_UMIN:
20038   case ISD::ATOMIC_LOAD_UMAX:
20039   case ISD::ATOMIC_LOAD: {
20040     // Delegate to generic TypeLegalization. Situations we can really handle
20041     // should have already been dealt with by AtomicExpandPass.cpp.
20042     break;
20043   }
20044   case ISD::BITCAST: {
20045     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20046     EVT DstVT = N->getValueType(0);
20047     EVT SrcVT = N->getOperand(0)->getValueType(0);
20048
20049     if (SrcVT != MVT::f64 ||
20050         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20051       return;
20052
20053     unsigned NumElts = DstVT.getVectorNumElements();
20054     EVT SVT = DstVT.getVectorElementType();
20055     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20056     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20057                                    MVT::v2f64, N->getOperand(0));
20058     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20059
20060     if (ExperimentalVectorWideningLegalization) {
20061       // If we are legalizing vectors by widening, we already have the desired
20062       // legal vector type, just return it.
20063       Results.push_back(ToVecInt);
20064       return;
20065     }
20066
20067     SmallVector<SDValue, 8> Elts;
20068     for (unsigned i = 0, e = NumElts; i != e; ++i)
20069       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20070                                    ToVecInt, DAG.getIntPtrConstant(i)));
20071
20072     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20073   }
20074   }
20075 }
20076
20077 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20078   switch (Opcode) {
20079   default: return nullptr;
20080   case X86ISD::BSF:                return "X86ISD::BSF";
20081   case X86ISD::BSR:                return "X86ISD::BSR";
20082   case X86ISD::SHLD:               return "X86ISD::SHLD";
20083   case X86ISD::SHRD:               return "X86ISD::SHRD";
20084   case X86ISD::FAND:               return "X86ISD::FAND";
20085   case X86ISD::FANDN:              return "X86ISD::FANDN";
20086   case X86ISD::FOR:                return "X86ISD::FOR";
20087   case X86ISD::FXOR:               return "X86ISD::FXOR";
20088   case X86ISD::FSRL:               return "X86ISD::FSRL";
20089   case X86ISD::FILD:               return "X86ISD::FILD";
20090   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20091   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20092   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20093   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20094   case X86ISD::FLD:                return "X86ISD::FLD";
20095   case X86ISD::FST:                return "X86ISD::FST";
20096   case X86ISD::CALL:               return "X86ISD::CALL";
20097   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20098   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20099   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20100   case X86ISD::BT:                 return "X86ISD::BT";
20101   case X86ISD::CMP:                return "X86ISD::CMP";
20102   case X86ISD::COMI:               return "X86ISD::COMI";
20103   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20104   case X86ISD::CMPM:               return "X86ISD::CMPM";
20105   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20106   case X86ISD::SETCC:              return "X86ISD::SETCC";
20107   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20108   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20109   case X86ISD::CMOV:               return "X86ISD::CMOV";
20110   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20111   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20112   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20113   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20114   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20115   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20116   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20117   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20118   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20119   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20120   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20121   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20122   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20123   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20124   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20125   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20126   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20127   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20128   case X86ISD::HADD:               return "X86ISD::HADD";
20129   case X86ISD::HSUB:               return "X86ISD::HSUB";
20130   case X86ISD::FHADD:              return "X86ISD::FHADD";
20131   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20132   case X86ISD::UMAX:               return "X86ISD::UMAX";
20133   case X86ISD::UMIN:               return "X86ISD::UMIN";
20134   case X86ISD::SMAX:               return "X86ISD::SMAX";
20135   case X86ISD::SMIN:               return "X86ISD::SMIN";
20136   case X86ISD::FMAX:               return "X86ISD::FMAX";
20137   case X86ISD::FMIN:               return "X86ISD::FMIN";
20138   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20139   case X86ISD::FMINC:              return "X86ISD::FMINC";
20140   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20141   case X86ISD::FRCP:               return "X86ISD::FRCP";
20142   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20143   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20144   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20145   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20146   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20147   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20148   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20149   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20150   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20151   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20152   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20153   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20154   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20155   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20156   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20157   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20158   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20159   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20160   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20161   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20162   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20163   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20164   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20165   case X86ISD::VSHL:               return "X86ISD::VSHL";
20166   case X86ISD::VSRL:               return "X86ISD::VSRL";
20167   case X86ISD::VSRA:               return "X86ISD::VSRA";
20168   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20169   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20170   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20171   case X86ISD::CMPP:               return "X86ISD::CMPP";
20172   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20173   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20174   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20175   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20176   case X86ISD::ADD:                return "X86ISD::ADD";
20177   case X86ISD::SUB:                return "X86ISD::SUB";
20178   case X86ISD::ADC:                return "X86ISD::ADC";
20179   case X86ISD::SBB:                return "X86ISD::SBB";
20180   case X86ISD::SMUL:               return "X86ISD::SMUL";
20181   case X86ISD::UMUL:               return "X86ISD::UMUL";
20182   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20183   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20184   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20185   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20186   case X86ISD::INC:                return "X86ISD::INC";
20187   case X86ISD::DEC:                return "X86ISD::DEC";
20188   case X86ISD::OR:                 return "X86ISD::OR";
20189   case X86ISD::XOR:                return "X86ISD::XOR";
20190   case X86ISD::AND:                return "X86ISD::AND";
20191   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20192   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20193   case X86ISD::PTEST:              return "X86ISD::PTEST";
20194   case X86ISD::TESTP:              return "X86ISD::TESTP";
20195   case X86ISD::TESTM:              return "X86ISD::TESTM";
20196   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20197   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20198   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20199   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20200   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20201   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20202   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20203   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20204   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20205   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20206   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20207   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20208   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20209   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20210   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20211   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20212   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20213   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20214   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20215   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20216   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20217   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20218   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20219   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20220   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20221   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20222   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20223   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20224   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20225   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20226   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20227   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20228   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20229   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20230   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20231   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20232   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20233   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20234   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20235   case X86ISD::SAHF:               return "X86ISD::SAHF";
20236   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20237   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20238   case X86ISD::FMADD:              return "X86ISD::FMADD";
20239   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20240   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20241   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20242   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20243   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20244   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20245   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20246   case X86ISD::XTEST:              return "X86ISD::XTEST";
20247   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20248   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20249   case X86ISD::SELECT:             return "X86ISD::SELECT";
20250   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20251   case X86ISD::RCP28:              return "X86ISD::RCP28";
20252   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20253   }
20254 }
20255
20256 // isLegalAddressingMode - Return true if the addressing mode represented
20257 // by AM is legal for this target, for a load/store of the specified type.
20258 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20259                                               Type *Ty) const {
20260   // X86 supports extremely general addressing modes.
20261   CodeModel::Model M = getTargetMachine().getCodeModel();
20262   Reloc::Model R = getTargetMachine().getRelocationModel();
20263
20264   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20265   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20266     return false;
20267
20268   if (AM.BaseGV) {
20269     unsigned GVFlags =
20270       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20271
20272     // If a reference to this global requires an extra load, we can't fold it.
20273     if (isGlobalStubReference(GVFlags))
20274       return false;
20275
20276     // If BaseGV requires a register for the PIC base, we cannot also have a
20277     // BaseReg specified.
20278     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20279       return false;
20280
20281     // If lower 4G is not available, then we must use rip-relative addressing.
20282     if ((M != CodeModel::Small || R != Reloc::Static) &&
20283         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20284       return false;
20285   }
20286
20287   switch (AM.Scale) {
20288   case 0:
20289   case 1:
20290   case 2:
20291   case 4:
20292   case 8:
20293     // These scales always work.
20294     break;
20295   case 3:
20296   case 5:
20297   case 9:
20298     // These scales are formed with basereg+scalereg.  Only accept if there is
20299     // no basereg yet.
20300     if (AM.HasBaseReg)
20301       return false;
20302     break;
20303   default:  // Other stuff never works.
20304     return false;
20305   }
20306
20307   return true;
20308 }
20309
20310 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20311   unsigned Bits = Ty->getScalarSizeInBits();
20312
20313   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20314   // particularly cheaper than those without.
20315   if (Bits == 8)
20316     return false;
20317
20318   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20319   // variable shifts just as cheap as scalar ones.
20320   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20321     return false;
20322
20323   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20324   // fully general vector.
20325   return true;
20326 }
20327
20328 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20329   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20330     return false;
20331   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20332   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20333   return NumBits1 > NumBits2;
20334 }
20335
20336 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20337   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20338     return false;
20339
20340   if (!isTypeLegal(EVT::getEVT(Ty1)))
20341     return false;
20342
20343   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20344
20345   // Assuming the caller doesn't have a zeroext or signext return parameter,
20346   // truncation all the way down to i1 is valid.
20347   return true;
20348 }
20349
20350 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20351   return isInt<32>(Imm);
20352 }
20353
20354 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20355   // Can also use sub to handle negated immediates.
20356   return isInt<32>(Imm);
20357 }
20358
20359 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20360   if (!VT1.isInteger() || !VT2.isInteger())
20361     return false;
20362   unsigned NumBits1 = VT1.getSizeInBits();
20363   unsigned NumBits2 = VT2.getSizeInBits();
20364   return NumBits1 > NumBits2;
20365 }
20366
20367 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20368   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20369   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20370 }
20371
20372 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20373   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20374   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20375 }
20376
20377 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20378   EVT VT1 = Val.getValueType();
20379   if (isZExtFree(VT1, VT2))
20380     return true;
20381
20382   if (Val.getOpcode() != ISD::LOAD)
20383     return false;
20384
20385   if (!VT1.isSimple() || !VT1.isInteger() ||
20386       !VT2.isSimple() || !VT2.isInteger())
20387     return false;
20388
20389   switch (VT1.getSimpleVT().SimpleTy) {
20390   default: break;
20391   case MVT::i8:
20392   case MVT::i16:
20393   case MVT::i32:
20394     // X86 has 8, 16, and 32-bit zero-extending loads.
20395     return true;
20396   }
20397
20398   return false;
20399 }
20400
20401 bool
20402 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20403   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20404     return false;
20405
20406   VT = VT.getScalarType();
20407
20408   if (!VT.isSimple())
20409     return false;
20410
20411   switch (VT.getSimpleVT().SimpleTy) {
20412   case MVT::f32:
20413   case MVT::f64:
20414     return true;
20415   default:
20416     break;
20417   }
20418
20419   return false;
20420 }
20421
20422 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20423   // i16 instructions are longer (0x66 prefix) and potentially slower.
20424   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20425 }
20426
20427 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20428 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20429 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20430 /// are assumed to be legal.
20431 bool
20432 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20433                                       EVT VT) const {
20434   if (!VT.isSimple())
20435     return false;
20436
20437   MVT SVT = VT.getSimpleVT();
20438
20439   // Very little shuffling can be done for 64-bit vectors right now.
20440   if (VT.getSizeInBits() == 64)
20441     return false;
20442
20443   // This is an experimental legality test that is tailored to match the
20444   // legality test of the experimental lowering more closely. They are gated
20445   // separately to ease testing of performance differences.
20446   if (ExperimentalVectorShuffleLegality)
20447     // We only care that the types being shuffled are legal. The lowering can
20448     // handle any possible shuffle mask that results.
20449     return isTypeLegal(SVT);
20450
20451   // If this is a single-input shuffle with no 128 bit lane crossings we can
20452   // lower it into pshufb.
20453   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20454       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20455     bool isLegal = true;
20456     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20457       if (M[I] >= (int)SVT.getVectorNumElements() ||
20458           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20459         isLegal = false;
20460         break;
20461       }
20462     }
20463     if (isLegal)
20464       return true;
20465   }
20466
20467   // FIXME: blends, shifts.
20468   return (SVT.getVectorNumElements() == 2 ||
20469           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20470           isMOVLMask(M, SVT) ||
20471           isCommutedMOVLMask(M, SVT) ||
20472           isMOVHLPSMask(M, SVT) ||
20473           isSHUFPMask(M, SVT) ||
20474           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20475           isPSHUFDMask(M, SVT) ||
20476           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20477           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20478           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20479           isPALIGNRMask(M, SVT, Subtarget) ||
20480           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20481           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20482           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20483           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20484           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20485           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20486 }
20487
20488 bool
20489 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20490                                           EVT VT) const {
20491   if (!VT.isSimple())
20492     return false;
20493
20494   MVT SVT = VT.getSimpleVT();
20495
20496   // This is an experimental legality test that is tailored to match the
20497   // legality test of the experimental lowering more closely. They are gated
20498   // separately to ease testing of performance differences.
20499   if (ExperimentalVectorShuffleLegality)
20500     // The new vector shuffle lowering is very good at managing zero-inputs.
20501     return isShuffleMaskLegal(Mask, VT);
20502
20503   unsigned NumElts = SVT.getVectorNumElements();
20504   // FIXME: This collection of masks seems suspect.
20505   if (NumElts == 2)
20506     return true;
20507   if (NumElts == 4 && SVT.is128BitVector()) {
20508     return (isMOVLMask(Mask, SVT)  ||
20509             isCommutedMOVLMask(Mask, SVT, true) ||
20510             isSHUFPMask(Mask, SVT) ||
20511             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20512             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20513                         Subtarget->hasInt256()));
20514   }
20515   return false;
20516 }
20517
20518 //===----------------------------------------------------------------------===//
20519 //                           X86 Scheduler Hooks
20520 //===----------------------------------------------------------------------===//
20521
20522 /// Utility function to emit xbegin specifying the start of an RTM region.
20523 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20524                                      const TargetInstrInfo *TII) {
20525   DebugLoc DL = MI->getDebugLoc();
20526
20527   const BasicBlock *BB = MBB->getBasicBlock();
20528   MachineFunction::iterator I = MBB;
20529   ++I;
20530
20531   // For the v = xbegin(), we generate
20532   //
20533   // thisMBB:
20534   //  xbegin sinkMBB
20535   //
20536   // mainMBB:
20537   //  eax = -1
20538   //
20539   // sinkMBB:
20540   //  v = eax
20541
20542   MachineBasicBlock *thisMBB = MBB;
20543   MachineFunction *MF = MBB->getParent();
20544   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20545   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20546   MF->insert(I, mainMBB);
20547   MF->insert(I, sinkMBB);
20548
20549   // Transfer the remainder of BB and its successor edges to sinkMBB.
20550   sinkMBB->splice(sinkMBB->begin(), MBB,
20551                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20552   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20553
20554   // thisMBB:
20555   //  xbegin sinkMBB
20556   //  # fallthrough to mainMBB
20557   //  # abortion to sinkMBB
20558   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20559   thisMBB->addSuccessor(mainMBB);
20560   thisMBB->addSuccessor(sinkMBB);
20561
20562   // mainMBB:
20563   //  EAX = -1
20564   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20565   mainMBB->addSuccessor(sinkMBB);
20566
20567   // sinkMBB:
20568   // EAX is live into the sinkMBB
20569   sinkMBB->addLiveIn(X86::EAX);
20570   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20571           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20572     .addReg(X86::EAX);
20573
20574   MI->eraseFromParent();
20575   return sinkMBB;
20576 }
20577
20578 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20579 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20580 // in the .td file.
20581 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20582                                        const TargetInstrInfo *TII) {
20583   unsigned Opc;
20584   switch (MI->getOpcode()) {
20585   default: llvm_unreachable("illegal opcode!");
20586   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20587   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20588   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20589   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20590   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20591   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20592   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20593   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20594   }
20595
20596   DebugLoc dl = MI->getDebugLoc();
20597   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20598
20599   unsigned NumArgs = MI->getNumOperands();
20600   for (unsigned i = 1; i < NumArgs; ++i) {
20601     MachineOperand &Op = MI->getOperand(i);
20602     if (!(Op.isReg() && Op.isImplicit()))
20603       MIB.addOperand(Op);
20604   }
20605   if (MI->hasOneMemOperand())
20606     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20607
20608   BuildMI(*BB, MI, dl,
20609     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20610     .addReg(X86::XMM0);
20611
20612   MI->eraseFromParent();
20613   return BB;
20614 }
20615
20616 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20617 // defs in an instruction pattern
20618 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20619                                        const TargetInstrInfo *TII) {
20620   unsigned Opc;
20621   switch (MI->getOpcode()) {
20622   default: llvm_unreachable("illegal opcode!");
20623   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20624   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20625   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20626   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20627   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20628   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20629   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20630   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20631   }
20632
20633   DebugLoc dl = MI->getDebugLoc();
20634   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20635
20636   unsigned NumArgs = MI->getNumOperands(); // remove the results
20637   for (unsigned i = 1; i < NumArgs; ++i) {
20638     MachineOperand &Op = MI->getOperand(i);
20639     if (!(Op.isReg() && Op.isImplicit()))
20640       MIB.addOperand(Op);
20641   }
20642   if (MI->hasOneMemOperand())
20643     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20644
20645   BuildMI(*BB, MI, dl,
20646     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20647     .addReg(X86::ECX);
20648
20649   MI->eraseFromParent();
20650   return BB;
20651 }
20652
20653 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20654                                       const X86Subtarget *Subtarget) {
20655   DebugLoc dl = MI->getDebugLoc();
20656   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20657   // Address into RAX/EAX, other two args into ECX, EDX.
20658   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20659   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20660   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20661   for (int i = 0; i < X86::AddrNumOperands; ++i)
20662     MIB.addOperand(MI->getOperand(i));
20663
20664   unsigned ValOps = X86::AddrNumOperands;
20665   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20666     .addReg(MI->getOperand(ValOps).getReg());
20667   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20668     .addReg(MI->getOperand(ValOps+1).getReg());
20669
20670   // The instruction doesn't actually take any operands though.
20671   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20672
20673   MI->eraseFromParent(); // The pseudo is gone now.
20674   return BB;
20675 }
20676
20677 MachineBasicBlock *
20678 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20679                                                  MachineBasicBlock *MBB) const {
20680   // Emit va_arg instruction on X86-64.
20681
20682   // Operands to this pseudo-instruction:
20683   // 0  ) Output        : destination address (reg)
20684   // 1-5) Input         : va_list address (addr, i64mem)
20685   // 6  ) ArgSize       : Size (in bytes) of vararg type
20686   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20687   // 8  ) Align         : Alignment of type
20688   // 9  ) EFLAGS (implicit-def)
20689
20690   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20691   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20692
20693   unsigned DestReg = MI->getOperand(0).getReg();
20694   MachineOperand &Base = MI->getOperand(1);
20695   MachineOperand &Scale = MI->getOperand(2);
20696   MachineOperand &Index = MI->getOperand(3);
20697   MachineOperand &Disp = MI->getOperand(4);
20698   MachineOperand &Segment = MI->getOperand(5);
20699   unsigned ArgSize = MI->getOperand(6).getImm();
20700   unsigned ArgMode = MI->getOperand(7).getImm();
20701   unsigned Align = MI->getOperand(8).getImm();
20702
20703   // Memory Reference
20704   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20705   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20706   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20707
20708   // Machine Information
20709   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20710   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20711   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20712   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20713   DebugLoc DL = MI->getDebugLoc();
20714
20715   // struct va_list {
20716   //   i32   gp_offset
20717   //   i32   fp_offset
20718   //   i64   overflow_area (address)
20719   //   i64   reg_save_area (address)
20720   // }
20721   // sizeof(va_list) = 24
20722   // alignment(va_list) = 8
20723
20724   unsigned TotalNumIntRegs = 6;
20725   unsigned TotalNumXMMRegs = 8;
20726   bool UseGPOffset = (ArgMode == 1);
20727   bool UseFPOffset = (ArgMode == 2);
20728   unsigned MaxOffset = TotalNumIntRegs * 8 +
20729                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20730
20731   /* Align ArgSize to a multiple of 8 */
20732   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20733   bool NeedsAlign = (Align > 8);
20734
20735   MachineBasicBlock *thisMBB = MBB;
20736   MachineBasicBlock *overflowMBB;
20737   MachineBasicBlock *offsetMBB;
20738   MachineBasicBlock *endMBB;
20739
20740   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20741   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20742   unsigned OffsetReg = 0;
20743
20744   if (!UseGPOffset && !UseFPOffset) {
20745     // If we only pull from the overflow region, we don't create a branch.
20746     // We don't need to alter control flow.
20747     OffsetDestReg = 0; // unused
20748     OverflowDestReg = DestReg;
20749
20750     offsetMBB = nullptr;
20751     overflowMBB = thisMBB;
20752     endMBB = thisMBB;
20753   } else {
20754     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20755     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20756     // If not, pull from overflow_area. (branch to overflowMBB)
20757     //
20758     //       thisMBB
20759     //         |     .
20760     //         |        .
20761     //     offsetMBB   overflowMBB
20762     //         |        .
20763     //         |     .
20764     //        endMBB
20765
20766     // Registers for the PHI in endMBB
20767     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20768     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20769
20770     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20771     MachineFunction *MF = MBB->getParent();
20772     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20773     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20774     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20775
20776     MachineFunction::iterator MBBIter = MBB;
20777     ++MBBIter;
20778
20779     // Insert the new basic blocks
20780     MF->insert(MBBIter, offsetMBB);
20781     MF->insert(MBBIter, overflowMBB);
20782     MF->insert(MBBIter, endMBB);
20783
20784     // Transfer the remainder of MBB and its successor edges to endMBB.
20785     endMBB->splice(endMBB->begin(), thisMBB,
20786                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20787     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20788
20789     // Make offsetMBB and overflowMBB successors of thisMBB
20790     thisMBB->addSuccessor(offsetMBB);
20791     thisMBB->addSuccessor(overflowMBB);
20792
20793     // endMBB is a successor of both offsetMBB and overflowMBB
20794     offsetMBB->addSuccessor(endMBB);
20795     overflowMBB->addSuccessor(endMBB);
20796
20797     // Load the offset value into a register
20798     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20799     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20800       .addOperand(Base)
20801       .addOperand(Scale)
20802       .addOperand(Index)
20803       .addDisp(Disp, UseFPOffset ? 4 : 0)
20804       .addOperand(Segment)
20805       .setMemRefs(MMOBegin, MMOEnd);
20806
20807     // Check if there is enough room left to pull this argument.
20808     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20809       .addReg(OffsetReg)
20810       .addImm(MaxOffset + 8 - ArgSizeA8);
20811
20812     // Branch to "overflowMBB" if offset >= max
20813     // Fall through to "offsetMBB" otherwise
20814     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20815       .addMBB(overflowMBB);
20816   }
20817
20818   // In offsetMBB, emit code to use the reg_save_area.
20819   if (offsetMBB) {
20820     assert(OffsetReg != 0);
20821
20822     // Read the reg_save_area address.
20823     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20824     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20825       .addOperand(Base)
20826       .addOperand(Scale)
20827       .addOperand(Index)
20828       .addDisp(Disp, 16)
20829       .addOperand(Segment)
20830       .setMemRefs(MMOBegin, MMOEnd);
20831
20832     // Zero-extend the offset
20833     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20834       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20835         .addImm(0)
20836         .addReg(OffsetReg)
20837         .addImm(X86::sub_32bit);
20838
20839     // Add the offset to the reg_save_area to get the final address.
20840     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20841       .addReg(OffsetReg64)
20842       .addReg(RegSaveReg);
20843
20844     // Compute the offset for the next argument
20845     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20846     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20847       .addReg(OffsetReg)
20848       .addImm(UseFPOffset ? 16 : 8);
20849
20850     // Store it back into the va_list.
20851     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20852       .addOperand(Base)
20853       .addOperand(Scale)
20854       .addOperand(Index)
20855       .addDisp(Disp, UseFPOffset ? 4 : 0)
20856       .addOperand(Segment)
20857       .addReg(NextOffsetReg)
20858       .setMemRefs(MMOBegin, MMOEnd);
20859
20860     // Jump to endMBB
20861     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20862       .addMBB(endMBB);
20863   }
20864
20865   //
20866   // Emit code to use overflow area
20867   //
20868
20869   // Load the overflow_area address into a register.
20870   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20871   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20872     .addOperand(Base)
20873     .addOperand(Scale)
20874     .addOperand(Index)
20875     .addDisp(Disp, 8)
20876     .addOperand(Segment)
20877     .setMemRefs(MMOBegin, MMOEnd);
20878
20879   // If we need to align it, do so. Otherwise, just copy the address
20880   // to OverflowDestReg.
20881   if (NeedsAlign) {
20882     // Align the overflow address
20883     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20884     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20885
20886     // aligned_addr = (addr + (align-1)) & ~(align-1)
20887     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20888       .addReg(OverflowAddrReg)
20889       .addImm(Align-1);
20890
20891     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20892       .addReg(TmpReg)
20893       .addImm(~(uint64_t)(Align-1));
20894   } else {
20895     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20896       .addReg(OverflowAddrReg);
20897   }
20898
20899   // Compute the next overflow address after this argument.
20900   // (the overflow address should be kept 8-byte aligned)
20901   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20902   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20903     .addReg(OverflowDestReg)
20904     .addImm(ArgSizeA8);
20905
20906   // Store the new overflow address.
20907   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20908     .addOperand(Base)
20909     .addOperand(Scale)
20910     .addOperand(Index)
20911     .addDisp(Disp, 8)
20912     .addOperand(Segment)
20913     .addReg(NextAddrReg)
20914     .setMemRefs(MMOBegin, MMOEnd);
20915
20916   // If we branched, emit the PHI to the front of endMBB.
20917   if (offsetMBB) {
20918     BuildMI(*endMBB, endMBB->begin(), DL,
20919             TII->get(X86::PHI), DestReg)
20920       .addReg(OffsetDestReg).addMBB(offsetMBB)
20921       .addReg(OverflowDestReg).addMBB(overflowMBB);
20922   }
20923
20924   // Erase the pseudo instruction
20925   MI->eraseFromParent();
20926
20927   return endMBB;
20928 }
20929
20930 MachineBasicBlock *
20931 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20932                                                  MachineInstr *MI,
20933                                                  MachineBasicBlock *MBB) const {
20934   // Emit code to save XMM registers to the stack. The ABI says that the
20935   // number of registers to save is given in %al, so it's theoretically
20936   // possible to do an indirect jump trick to avoid saving all of them,
20937   // however this code takes a simpler approach and just executes all
20938   // of the stores if %al is non-zero. It's less code, and it's probably
20939   // easier on the hardware branch predictor, and stores aren't all that
20940   // expensive anyway.
20941
20942   // Create the new basic blocks. One block contains all the XMM stores,
20943   // and one block is the final destination regardless of whether any
20944   // stores were performed.
20945   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20946   MachineFunction *F = MBB->getParent();
20947   MachineFunction::iterator MBBIter = MBB;
20948   ++MBBIter;
20949   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20950   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20951   F->insert(MBBIter, XMMSaveMBB);
20952   F->insert(MBBIter, EndMBB);
20953
20954   // Transfer the remainder of MBB and its successor edges to EndMBB.
20955   EndMBB->splice(EndMBB->begin(), MBB,
20956                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20957   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20958
20959   // The original block will now fall through to the XMM save block.
20960   MBB->addSuccessor(XMMSaveMBB);
20961   // The XMMSaveMBB will fall through to the end block.
20962   XMMSaveMBB->addSuccessor(EndMBB);
20963
20964   // Now add the instructions.
20965   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20966   DebugLoc DL = MI->getDebugLoc();
20967
20968   unsigned CountReg = MI->getOperand(0).getReg();
20969   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20970   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20971
20972   if (!Subtarget->isTargetWin64()) {
20973     // If %al is 0, branch around the XMM save block.
20974     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20975     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20976     MBB->addSuccessor(EndMBB);
20977   }
20978
20979   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20980   // that was just emitted, but clearly shouldn't be "saved".
20981   assert((MI->getNumOperands() <= 3 ||
20982           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20983           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20984          && "Expected last argument to be EFLAGS");
20985   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20986   // In the XMM save block, save all the XMM argument registers.
20987   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20988     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20989     MachineMemOperand *MMO =
20990       F->getMachineMemOperand(
20991           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20992         MachineMemOperand::MOStore,
20993         /*Size=*/16, /*Align=*/16);
20994     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20995       .addFrameIndex(RegSaveFrameIndex)
20996       .addImm(/*Scale=*/1)
20997       .addReg(/*IndexReg=*/0)
20998       .addImm(/*Disp=*/Offset)
20999       .addReg(/*Segment=*/0)
21000       .addReg(MI->getOperand(i).getReg())
21001       .addMemOperand(MMO);
21002   }
21003
21004   MI->eraseFromParent();   // The pseudo instruction is gone now.
21005
21006   return EndMBB;
21007 }
21008
21009 // The EFLAGS operand of SelectItr might be missing a kill marker
21010 // because there were multiple uses of EFLAGS, and ISel didn't know
21011 // which to mark. Figure out whether SelectItr should have had a
21012 // kill marker, and set it if it should. Returns the correct kill
21013 // marker value.
21014 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21015                                      MachineBasicBlock* BB,
21016                                      const TargetRegisterInfo* TRI) {
21017   // Scan forward through BB for a use/def of EFLAGS.
21018   MachineBasicBlock::iterator miI(std::next(SelectItr));
21019   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21020     const MachineInstr& mi = *miI;
21021     if (mi.readsRegister(X86::EFLAGS))
21022       return false;
21023     if (mi.definesRegister(X86::EFLAGS))
21024       break; // Should have kill-flag - update below.
21025   }
21026
21027   // If we hit the end of the block, check whether EFLAGS is live into a
21028   // successor.
21029   if (miI == BB->end()) {
21030     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21031                                           sEnd = BB->succ_end();
21032          sItr != sEnd; ++sItr) {
21033       MachineBasicBlock* succ = *sItr;
21034       if (succ->isLiveIn(X86::EFLAGS))
21035         return false;
21036     }
21037   }
21038
21039   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21040   // out. SelectMI should have a kill flag on EFLAGS.
21041   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21042   return true;
21043 }
21044
21045 MachineBasicBlock *
21046 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21047                                      MachineBasicBlock *BB) const {
21048   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21049   DebugLoc DL = MI->getDebugLoc();
21050
21051   // To "insert" a SELECT_CC instruction, we actually have to insert the
21052   // diamond control-flow pattern.  The incoming instruction knows the
21053   // destination vreg to set, the condition code register to branch on, the
21054   // true/false values to select between, and a branch opcode to use.
21055   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21056   MachineFunction::iterator It = BB;
21057   ++It;
21058
21059   //  thisMBB:
21060   //  ...
21061   //   TrueVal = ...
21062   //   cmpTY ccX, r1, r2
21063   //   bCC copy1MBB
21064   //   fallthrough --> copy0MBB
21065   MachineBasicBlock *thisMBB = BB;
21066   MachineFunction *F = BB->getParent();
21067   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21068   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21069   F->insert(It, copy0MBB);
21070   F->insert(It, sinkMBB);
21071
21072   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21073   // live into the sink and copy blocks.
21074   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21075   if (!MI->killsRegister(X86::EFLAGS) &&
21076       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21077     copy0MBB->addLiveIn(X86::EFLAGS);
21078     sinkMBB->addLiveIn(X86::EFLAGS);
21079   }
21080
21081   // Transfer the remainder of BB and its successor edges to sinkMBB.
21082   sinkMBB->splice(sinkMBB->begin(), BB,
21083                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21084   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21085
21086   // Add the true and fallthrough blocks as its successors.
21087   BB->addSuccessor(copy0MBB);
21088   BB->addSuccessor(sinkMBB);
21089
21090   // Create the conditional branch instruction.
21091   unsigned Opc =
21092     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21093   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21094
21095   //  copy0MBB:
21096   //   %FalseValue = ...
21097   //   # fallthrough to sinkMBB
21098   copy0MBB->addSuccessor(sinkMBB);
21099
21100   //  sinkMBB:
21101   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21102   //  ...
21103   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21104           TII->get(X86::PHI), MI->getOperand(0).getReg())
21105     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21106     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21107
21108   MI->eraseFromParent();   // The pseudo instruction is gone now.
21109   return sinkMBB;
21110 }
21111
21112 MachineBasicBlock *
21113 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21114                                         MachineBasicBlock *BB) const {
21115   MachineFunction *MF = BB->getParent();
21116   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21117   DebugLoc DL = MI->getDebugLoc();
21118   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21119
21120   assert(MF->shouldSplitStack());
21121
21122   const bool Is64Bit = Subtarget->is64Bit();
21123   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21124
21125   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21126   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21127
21128   // BB:
21129   //  ... [Till the alloca]
21130   // If stacklet is not large enough, jump to mallocMBB
21131   //
21132   // bumpMBB:
21133   //  Allocate by subtracting from RSP
21134   //  Jump to continueMBB
21135   //
21136   // mallocMBB:
21137   //  Allocate by call to runtime
21138   //
21139   // continueMBB:
21140   //  ...
21141   //  [rest of original BB]
21142   //
21143
21144   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21145   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21146   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21147
21148   MachineRegisterInfo &MRI = MF->getRegInfo();
21149   const TargetRegisterClass *AddrRegClass =
21150     getRegClassFor(getPointerTy());
21151
21152   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21153     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21154     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21155     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21156     sizeVReg = MI->getOperand(1).getReg(),
21157     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21158
21159   MachineFunction::iterator MBBIter = BB;
21160   ++MBBIter;
21161
21162   MF->insert(MBBIter, bumpMBB);
21163   MF->insert(MBBIter, mallocMBB);
21164   MF->insert(MBBIter, continueMBB);
21165
21166   continueMBB->splice(continueMBB->begin(), BB,
21167                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21168   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21169
21170   // Add code to the main basic block to check if the stack limit has been hit,
21171   // and if so, jump to mallocMBB otherwise to bumpMBB.
21172   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21173   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21174     .addReg(tmpSPVReg).addReg(sizeVReg);
21175   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21176     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21177     .addReg(SPLimitVReg);
21178   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21179
21180   // bumpMBB simply decreases the stack pointer, since we know the current
21181   // stacklet has enough space.
21182   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21183     .addReg(SPLimitVReg);
21184   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21185     .addReg(SPLimitVReg);
21186   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21187
21188   // Calls into a routine in libgcc to allocate more space from the heap.
21189   const uint32_t *RegMask =
21190       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21191   if (IsLP64) {
21192     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21193       .addReg(sizeVReg);
21194     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21195       .addExternalSymbol("__morestack_allocate_stack_space")
21196       .addRegMask(RegMask)
21197       .addReg(X86::RDI, RegState::Implicit)
21198       .addReg(X86::RAX, RegState::ImplicitDefine);
21199   } else if (Is64Bit) {
21200     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21201       .addReg(sizeVReg);
21202     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21203       .addExternalSymbol("__morestack_allocate_stack_space")
21204       .addRegMask(RegMask)
21205       .addReg(X86::EDI, RegState::Implicit)
21206       .addReg(X86::EAX, RegState::ImplicitDefine);
21207   } else {
21208     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21209       .addImm(12);
21210     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21211     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21212       .addExternalSymbol("__morestack_allocate_stack_space")
21213       .addRegMask(RegMask)
21214       .addReg(X86::EAX, RegState::ImplicitDefine);
21215   }
21216
21217   if (!Is64Bit)
21218     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21219       .addImm(16);
21220
21221   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21222     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21223   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21224
21225   // Set up the CFG correctly.
21226   BB->addSuccessor(bumpMBB);
21227   BB->addSuccessor(mallocMBB);
21228   mallocMBB->addSuccessor(continueMBB);
21229   bumpMBB->addSuccessor(continueMBB);
21230
21231   // Take care of the PHI nodes.
21232   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21233           MI->getOperand(0).getReg())
21234     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21235     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21236
21237   // Delete the original pseudo instruction.
21238   MI->eraseFromParent();
21239
21240   // And we're done.
21241   return continueMBB;
21242 }
21243
21244 MachineBasicBlock *
21245 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21246                                         MachineBasicBlock *BB) const {
21247   DebugLoc DL = MI->getDebugLoc();
21248
21249   assert(!Subtarget->isTargetMachO());
21250
21251   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21252
21253   MI->eraseFromParent();   // The pseudo instruction is gone now.
21254   return BB;
21255 }
21256
21257 MachineBasicBlock *
21258 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21259                                       MachineBasicBlock *BB) const {
21260   // This is pretty easy.  We're taking the value that we received from
21261   // our load from the relocation, sticking it in either RDI (x86-64)
21262   // or EAX and doing an indirect call.  The return value will then
21263   // be in the normal return register.
21264   MachineFunction *F = BB->getParent();
21265   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21266   DebugLoc DL = MI->getDebugLoc();
21267
21268   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21269   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21270
21271   // Get a register mask for the lowered call.
21272   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21273   // proper register mask.
21274   const uint32_t *RegMask =
21275       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21276   if (Subtarget->is64Bit()) {
21277     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21278                                       TII->get(X86::MOV64rm), X86::RDI)
21279     .addReg(X86::RIP)
21280     .addImm(0).addReg(0)
21281     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21282                       MI->getOperand(3).getTargetFlags())
21283     .addReg(0);
21284     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21285     addDirectMem(MIB, X86::RDI);
21286     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21287   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21288     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21289                                       TII->get(X86::MOV32rm), X86::EAX)
21290     .addReg(0)
21291     .addImm(0).addReg(0)
21292     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21293                       MI->getOperand(3).getTargetFlags())
21294     .addReg(0);
21295     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21296     addDirectMem(MIB, X86::EAX);
21297     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21298   } else {
21299     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21300                                       TII->get(X86::MOV32rm), X86::EAX)
21301     .addReg(TII->getGlobalBaseReg(F))
21302     .addImm(0).addReg(0)
21303     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21304                       MI->getOperand(3).getTargetFlags())
21305     .addReg(0);
21306     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21307     addDirectMem(MIB, X86::EAX);
21308     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21309   }
21310
21311   MI->eraseFromParent(); // The pseudo instruction is gone now.
21312   return BB;
21313 }
21314
21315 MachineBasicBlock *
21316 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21317                                     MachineBasicBlock *MBB) const {
21318   DebugLoc DL = MI->getDebugLoc();
21319   MachineFunction *MF = MBB->getParent();
21320   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21321   MachineRegisterInfo &MRI = MF->getRegInfo();
21322
21323   const BasicBlock *BB = MBB->getBasicBlock();
21324   MachineFunction::iterator I = MBB;
21325   ++I;
21326
21327   // Memory Reference
21328   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21329   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21330
21331   unsigned DstReg;
21332   unsigned MemOpndSlot = 0;
21333
21334   unsigned CurOp = 0;
21335
21336   DstReg = MI->getOperand(CurOp++).getReg();
21337   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21338   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21339   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21340   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21341
21342   MemOpndSlot = CurOp;
21343
21344   MVT PVT = getPointerTy();
21345   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21346          "Invalid Pointer Size!");
21347
21348   // For v = setjmp(buf), we generate
21349   //
21350   // thisMBB:
21351   //  buf[LabelOffset] = restoreMBB
21352   //  SjLjSetup restoreMBB
21353   //
21354   // mainMBB:
21355   //  v_main = 0
21356   //
21357   // sinkMBB:
21358   //  v = phi(main, restore)
21359   //
21360   // restoreMBB:
21361   //  if base pointer being used, load it from frame
21362   //  v_restore = 1
21363
21364   MachineBasicBlock *thisMBB = MBB;
21365   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21366   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21367   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21368   MF->insert(I, mainMBB);
21369   MF->insert(I, sinkMBB);
21370   MF->push_back(restoreMBB);
21371
21372   MachineInstrBuilder MIB;
21373
21374   // Transfer the remainder of BB and its successor edges to sinkMBB.
21375   sinkMBB->splice(sinkMBB->begin(), MBB,
21376                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21377   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21378
21379   // thisMBB:
21380   unsigned PtrStoreOpc = 0;
21381   unsigned LabelReg = 0;
21382   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21383   Reloc::Model RM = MF->getTarget().getRelocationModel();
21384   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21385                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21386
21387   // Prepare IP either in reg or imm.
21388   if (!UseImmLabel) {
21389     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21390     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21391     LabelReg = MRI.createVirtualRegister(PtrRC);
21392     if (Subtarget->is64Bit()) {
21393       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21394               .addReg(X86::RIP)
21395               .addImm(0)
21396               .addReg(0)
21397               .addMBB(restoreMBB)
21398               .addReg(0);
21399     } else {
21400       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21401       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21402               .addReg(XII->getGlobalBaseReg(MF))
21403               .addImm(0)
21404               .addReg(0)
21405               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21406               .addReg(0);
21407     }
21408   } else
21409     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21410   // Store IP
21411   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21412   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21413     if (i == X86::AddrDisp)
21414       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21415     else
21416       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21417   }
21418   if (!UseImmLabel)
21419     MIB.addReg(LabelReg);
21420   else
21421     MIB.addMBB(restoreMBB);
21422   MIB.setMemRefs(MMOBegin, MMOEnd);
21423   // Setup
21424   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21425           .addMBB(restoreMBB);
21426
21427   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21428   MIB.addRegMask(RegInfo->getNoPreservedMask());
21429   thisMBB->addSuccessor(mainMBB);
21430   thisMBB->addSuccessor(restoreMBB);
21431
21432   // mainMBB:
21433   //  EAX = 0
21434   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21435   mainMBB->addSuccessor(sinkMBB);
21436
21437   // sinkMBB:
21438   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21439           TII->get(X86::PHI), DstReg)
21440     .addReg(mainDstReg).addMBB(mainMBB)
21441     .addReg(restoreDstReg).addMBB(restoreMBB);
21442
21443   // restoreMBB:
21444   if (RegInfo->hasBasePointer(*MF)) {
21445     const bool Uses64BitFramePtr =
21446         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21447     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21448     X86FI->setRestoreBasePointer(MF);
21449     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21450     unsigned BasePtr = RegInfo->getBaseRegister();
21451     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21452     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21453                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21454       .setMIFlag(MachineInstr::FrameSetup);
21455   }
21456   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21457   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21458   restoreMBB->addSuccessor(sinkMBB);
21459
21460   MI->eraseFromParent();
21461   return sinkMBB;
21462 }
21463
21464 MachineBasicBlock *
21465 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21466                                      MachineBasicBlock *MBB) const {
21467   DebugLoc DL = MI->getDebugLoc();
21468   MachineFunction *MF = MBB->getParent();
21469   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21470   MachineRegisterInfo &MRI = MF->getRegInfo();
21471
21472   // Memory Reference
21473   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21474   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21475
21476   MVT PVT = getPointerTy();
21477   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21478          "Invalid Pointer Size!");
21479
21480   const TargetRegisterClass *RC =
21481     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21482   unsigned Tmp = MRI.createVirtualRegister(RC);
21483   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21484   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21485   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21486   unsigned SP = RegInfo->getStackRegister();
21487
21488   MachineInstrBuilder MIB;
21489
21490   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21491   const int64_t SPOffset = 2 * PVT.getStoreSize();
21492
21493   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21494   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21495
21496   // Reload FP
21497   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21498   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21499     MIB.addOperand(MI->getOperand(i));
21500   MIB.setMemRefs(MMOBegin, MMOEnd);
21501   // Reload IP
21502   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21503   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21504     if (i == X86::AddrDisp)
21505       MIB.addDisp(MI->getOperand(i), LabelOffset);
21506     else
21507       MIB.addOperand(MI->getOperand(i));
21508   }
21509   MIB.setMemRefs(MMOBegin, MMOEnd);
21510   // Reload SP
21511   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21512   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21513     if (i == X86::AddrDisp)
21514       MIB.addDisp(MI->getOperand(i), SPOffset);
21515     else
21516       MIB.addOperand(MI->getOperand(i));
21517   }
21518   MIB.setMemRefs(MMOBegin, MMOEnd);
21519   // Jump
21520   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21521
21522   MI->eraseFromParent();
21523   return MBB;
21524 }
21525
21526 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21527 // accumulator loops. Writing back to the accumulator allows the coalescer
21528 // to remove extra copies in the loop.
21529 MachineBasicBlock *
21530 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21531                                  MachineBasicBlock *MBB) const {
21532   MachineOperand &AddendOp = MI->getOperand(3);
21533
21534   // Bail out early if the addend isn't a register - we can't switch these.
21535   if (!AddendOp.isReg())
21536     return MBB;
21537
21538   MachineFunction &MF = *MBB->getParent();
21539   MachineRegisterInfo &MRI = MF.getRegInfo();
21540
21541   // Check whether the addend is defined by a PHI:
21542   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21543   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21544   if (!AddendDef.isPHI())
21545     return MBB;
21546
21547   // Look for the following pattern:
21548   // loop:
21549   //   %addend = phi [%entry, 0], [%loop, %result]
21550   //   ...
21551   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21552
21553   // Replace with:
21554   //   loop:
21555   //   %addend = phi [%entry, 0], [%loop, %result]
21556   //   ...
21557   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21558
21559   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21560     assert(AddendDef.getOperand(i).isReg());
21561     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21562     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21563     if (&PHISrcInst == MI) {
21564       // Found a matching instruction.
21565       unsigned NewFMAOpc = 0;
21566       switch (MI->getOpcode()) {
21567         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21568         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21569         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21570         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21571         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21572         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21573         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21574         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21575         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21576         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21577         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21578         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21579         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21580         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21581         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21582         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21583         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21584         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21585         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21586         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21587
21588         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21589         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21590         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21591         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21592         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21593         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21594         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21595         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21596         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21597         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21598         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21599         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21600         default: llvm_unreachable("Unrecognized FMA variant.");
21601       }
21602
21603       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21604       MachineInstrBuilder MIB =
21605         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21606         .addOperand(MI->getOperand(0))
21607         .addOperand(MI->getOperand(3))
21608         .addOperand(MI->getOperand(2))
21609         .addOperand(MI->getOperand(1));
21610       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21611       MI->eraseFromParent();
21612     }
21613   }
21614
21615   return MBB;
21616 }
21617
21618 MachineBasicBlock *
21619 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21620                                                MachineBasicBlock *BB) const {
21621   switch (MI->getOpcode()) {
21622   default: llvm_unreachable("Unexpected instr type to insert");
21623   case X86::TAILJMPd64:
21624   case X86::TAILJMPr64:
21625   case X86::TAILJMPm64:
21626   case X86::TAILJMPd64_REX:
21627   case X86::TAILJMPr64_REX:
21628   case X86::TAILJMPm64_REX:
21629     llvm_unreachable("TAILJMP64 would not be touched here.");
21630   case X86::TCRETURNdi64:
21631   case X86::TCRETURNri64:
21632   case X86::TCRETURNmi64:
21633     return BB;
21634   case X86::WIN_ALLOCA:
21635     return EmitLoweredWinAlloca(MI, BB);
21636   case X86::SEG_ALLOCA_32:
21637   case X86::SEG_ALLOCA_64:
21638     return EmitLoweredSegAlloca(MI, BB);
21639   case X86::TLSCall_32:
21640   case X86::TLSCall_64:
21641     return EmitLoweredTLSCall(MI, BB);
21642   case X86::CMOV_GR8:
21643   case X86::CMOV_FR32:
21644   case X86::CMOV_FR64:
21645   case X86::CMOV_V4F32:
21646   case X86::CMOV_V2F64:
21647   case X86::CMOV_V2I64:
21648   case X86::CMOV_V8F32:
21649   case X86::CMOV_V4F64:
21650   case X86::CMOV_V4I64:
21651   case X86::CMOV_V16F32:
21652   case X86::CMOV_V8F64:
21653   case X86::CMOV_V8I64:
21654   case X86::CMOV_GR16:
21655   case X86::CMOV_GR32:
21656   case X86::CMOV_RFP32:
21657   case X86::CMOV_RFP64:
21658   case X86::CMOV_RFP80:
21659     return EmitLoweredSelect(MI, BB);
21660
21661   case X86::FP32_TO_INT16_IN_MEM:
21662   case X86::FP32_TO_INT32_IN_MEM:
21663   case X86::FP32_TO_INT64_IN_MEM:
21664   case X86::FP64_TO_INT16_IN_MEM:
21665   case X86::FP64_TO_INT32_IN_MEM:
21666   case X86::FP64_TO_INT64_IN_MEM:
21667   case X86::FP80_TO_INT16_IN_MEM:
21668   case X86::FP80_TO_INT32_IN_MEM:
21669   case X86::FP80_TO_INT64_IN_MEM: {
21670     MachineFunction *F = BB->getParent();
21671     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21672     DebugLoc DL = MI->getDebugLoc();
21673
21674     // Change the floating point control register to use "round towards zero"
21675     // mode when truncating to an integer value.
21676     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21677     addFrameReference(BuildMI(*BB, MI, DL,
21678                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21679
21680     // Load the old value of the high byte of the control word...
21681     unsigned OldCW =
21682       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21683     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21684                       CWFrameIdx);
21685
21686     // Set the high part to be round to zero...
21687     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21688       .addImm(0xC7F);
21689
21690     // Reload the modified control word now...
21691     addFrameReference(BuildMI(*BB, MI, DL,
21692                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21693
21694     // Restore the memory image of control word to original value
21695     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21696       .addReg(OldCW);
21697
21698     // Get the X86 opcode to use.
21699     unsigned Opc;
21700     switch (MI->getOpcode()) {
21701     default: llvm_unreachable("illegal opcode!");
21702     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21703     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21704     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21705     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21706     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21707     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21708     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21709     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21710     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21711     }
21712
21713     X86AddressMode AM;
21714     MachineOperand &Op = MI->getOperand(0);
21715     if (Op.isReg()) {
21716       AM.BaseType = X86AddressMode::RegBase;
21717       AM.Base.Reg = Op.getReg();
21718     } else {
21719       AM.BaseType = X86AddressMode::FrameIndexBase;
21720       AM.Base.FrameIndex = Op.getIndex();
21721     }
21722     Op = MI->getOperand(1);
21723     if (Op.isImm())
21724       AM.Scale = Op.getImm();
21725     Op = MI->getOperand(2);
21726     if (Op.isImm())
21727       AM.IndexReg = Op.getImm();
21728     Op = MI->getOperand(3);
21729     if (Op.isGlobal()) {
21730       AM.GV = Op.getGlobal();
21731     } else {
21732       AM.Disp = Op.getImm();
21733     }
21734     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21735                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21736
21737     // Reload the original control word now.
21738     addFrameReference(BuildMI(*BB, MI, DL,
21739                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21740
21741     MI->eraseFromParent();   // The pseudo instruction is gone now.
21742     return BB;
21743   }
21744     // String/text processing lowering.
21745   case X86::PCMPISTRM128REG:
21746   case X86::VPCMPISTRM128REG:
21747   case X86::PCMPISTRM128MEM:
21748   case X86::VPCMPISTRM128MEM:
21749   case X86::PCMPESTRM128REG:
21750   case X86::VPCMPESTRM128REG:
21751   case X86::PCMPESTRM128MEM:
21752   case X86::VPCMPESTRM128MEM:
21753     assert(Subtarget->hasSSE42() &&
21754            "Target must have SSE4.2 or AVX features enabled");
21755     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21756
21757   // String/text processing lowering.
21758   case X86::PCMPISTRIREG:
21759   case X86::VPCMPISTRIREG:
21760   case X86::PCMPISTRIMEM:
21761   case X86::VPCMPISTRIMEM:
21762   case X86::PCMPESTRIREG:
21763   case X86::VPCMPESTRIREG:
21764   case X86::PCMPESTRIMEM:
21765   case X86::VPCMPESTRIMEM:
21766     assert(Subtarget->hasSSE42() &&
21767            "Target must have SSE4.2 or AVX features enabled");
21768     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21769
21770   // Thread synchronization.
21771   case X86::MONITOR:
21772     return EmitMonitor(MI, BB, Subtarget);
21773
21774   // xbegin
21775   case X86::XBEGIN:
21776     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21777
21778   case X86::VASTART_SAVE_XMM_REGS:
21779     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21780
21781   case X86::VAARG_64:
21782     return EmitVAARG64WithCustomInserter(MI, BB);
21783
21784   case X86::EH_SjLj_SetJmp32:
21785   case X86::EH_SjLj_SetJmp64:
21786     return emitEHSjLjSetJmp(MI, BB);
21787
21788   case X86::EH_SjLj_LongJmp32:
21789   case X86::EH_SjLj_LongJmp64:
21790     return emitEHSjLjLongJmp(MI, BB);
21791
21792   case TargetOpcode::STATEPOINT:
21793     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21794     // this point in the process.  We diverge later.
21795     return emitPatchPoint(MI, BB);
21796
21797   case TargetOpcode::STACKMAP:
21798   case TargetOpcode::PATCHPOINT:
21799     return emitPatchPoint(MI, BB);
21800
21801   case X86::VFMADDPDr213r:
21802   case X86::VFMADDPSr213r:
21803   case X86::VFMADDSDr213r:
21804   case X86::VFMADDSSr213r:
21805   case X86::VFMSUBPDr213r:
21806   case X86::VFMSUBPSr213r:
21807   case X86::VFMSUBSDr213r:
21808   case X86::VFMSUBSSr213r:
21809   case X86::VFNMADDPDr213r:
21810   case X86::VFNMADDPSr213r:
21811   case X86::VFNMADDSDr213r:
21812   case X86::VFNMADDSSr213r:
21813   case X86::VFNMSUBPDr213r:
21814   case X86::VFNMSUBPSr213r:
21815   case X86::VFNMSUBSDr213r:
21816   case X86::VFNMSUBSSr213r:
21817   case X86::VFMADDSUBPDr213r:
21818   case X86::VFMADDSUBPSr213r:
21819   case X86::VFMSUBADDPDr213r:
21820   case X86::VFMSUBADDPSr213r:
21821   case X86::VFMADDPDr213rY:
21822   case X86::VFMADDPSr213rY:
21823   case X86::VFMSUBPDr213rY:
21824   case X86::VFMSUBPSr213rY:
21825   case X86::VFNMADDPDr213rY:
21826   case X86::VFNMADDPSr213rY:
21827   case X86::VFNMSUBPDr213rY:
21828   case X86::VFNMSUBPSr213rY:
21829   case X86::VFMADDSUBPDr213rY:
21830   case X86::VFMADDSUBPSr213rY:
21831   case X86::VFMSUBADDPDr213rY:
21832   case X86::VFMSUBADDPSr213rY:
21833     return emitFMA3Instr(MI, BB);
21834   }
21835 }
21836
21837 //===----------------------------------------------------------------------===//
21838 //                           X86 Optimization Hooks
21839 //===----------------------------------------------------------------------===//
21840
21841 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21842                                                       APInt &KnownZero,
21843                                                       APInt &KnownOne,
21844                                                       const SelectionDAG &DAG,
21845                                                       unsigned Depth) const {
21846   unsigned BitWidth = KnownZero.getBitWidth();
21847   unsigned Opc = Op.getOpcode();
21848   assert((Opc >= ISD::BUILTIN_OP_END ||
21849           Opc == ISD::INTRINSIC_WO_CHAIN ||
21850           Opc == ISD::INTRINSIC_W_CHAIN ||
21851           Opc == ISD::INTRINSIC_VOID) &&
21852          "Should use MaskedValueIsZero if you don't know whether Op"
21853          " is a target node!");
21854
21855   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21856   switch (Opc) {
21857   default: break;
21858   case X86ISD::ADD:
21859   case X86ISD::SUB:
21860   case X86ISD::ADC:
21861   case X86ISD::SBB:
21862   case X86ISD::SMUL:
21863   case X86ISD::UMUL:
21864   case X86ISD::INC:
21865   case X86ISD::DEC:
21866   case X86ISD::OR:
21867   case X86ISD::XOR:
21868   case X86ISD::AND:
21869     // These nodes' second result is a boolean.
21870     if (Op.getResNo() == 0)
21871       break;
21872     // Fallthrough
21873   case X86ISD::SETCC:
21874     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21875     break;
21876   case ISD::INTRINSIC_WO_CHAIN: {
21877     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21878     unsigned NumLoBits = 0;
21879     switch (IntId) {
21880     default: break;
21881     case Intrinsic::x86_sse_movmsk_ps:
21882     case Intrinsic::x86_avx_movmsk_ps_256:
21883     case Intrinsic::x86_sse2_movmsk_pd:
21884     case Intrinsic::x86_avx_movmsk_pd_256:
21885     case Intrinsic::x86_mmx_pmovmskb:
21886     case Intrinsic::x86_sse2_pmovmskb_128:
21887     case Intrinsic::x86_avx2_pmovmskb: {
21888       // High bits of movmskp{s|d}, pmovmskb are known zero.
21889       switch (IntId) {
21890         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21891         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21892         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21893         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21894         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21895         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21896         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21897         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21898       }
21899       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21900       break;
21901     }
21902     }
21903     break;
21904   }
21905   }
21906 }
21907
21908 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21909   SDValue Op,
21910   const SelectionDAG &,
21911   unsigned Depth) const {
21912   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21913   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21914     return Op.getValueType().getScalarType().getSizeInBits();
21915
21916   // Fallback case.
21917   return 1;
21918 }
21919
21920 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21921 /// node is a GlobalAddress + offset.
21922 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21923                                        const GlobalValue* &GA,
21924                                        int64_t &Offset) const {
21925   if (N->getOpcode() == X86ISD::Wrapper) {
21926     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21927       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21928       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21929       return true;
21930     }
21931   }
21932   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21933 }
21934
21935 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21936 /// same as extracting the high 128-bit part of 256-bit vector and then
21937 /// inserting the result into the low part of a new 256-bit vector
21938 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21939   EVT VT = SVOp->getValueType(0);
21940   unsigned NumElems = VT.getVectorNumElements();
21941
21942   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21943   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21944     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21945         SVOp->getMaskElt(j) >= 0)
21946       return false;
21947
21948   return true;
21949 }
21950
21951 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21952 /// same as extracting the low 128-bit part of 256-bit vector and then
21953 /// inserting the result into the high part of a new 256-bit vector
21954 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21955   EVT VT = SVOp->getValueType(0);
21956   unsigned NumElems = VT.getVectorNumElements();
21957
21958   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21959   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21960     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21961         SVOp->getMaskElt(j) >= 0)
21962       return false;
21963
21964   return true;
21965 }
21966
21967 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21968 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21969                                         TargetLowering::DAGCombinerInfo &DCI,
21970                                         const X86Subtarget* Subtarget) {
21971   SDLoc dl(N);
21972   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21973   SDValue V1 = SVOp->getOperand(0);
21974   SDValue V2 = SVOp->getOperand(1);
21975   EVT VT = SVOp->getValueType(0);
21976   unsigned NumElems = VT.getVectorNumElements();
21977
21978   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21979       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21980     //
21981     //                   0,0,0,...
21982     //                      |
21983     //    V      UNDEF    BUILD_VECTOR    UNDEF
21984     //     \      /           \           /
21985     //  CONCAT_VECTOR         CONCAT_VECTOR
21986     //         \                  /
21987     //          \                /
21988     //          RESULT: V + zero extended
21989     //
21990     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21991         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21992         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21993       return SDValue();
21994
21995     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21996       return SDValue();
21997
21998     // To match the shuffle mask, the first half of the mask should
21999     // be exactly the first vector, and all the rest a splat with the
22000     // first element of the second one.
22001     for (unsigned i = 0; i != NumElems/2; ++i)
22002       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22003           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22004         return SDValue();
22005
22006     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22007     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22008       if (Ld->hasNUsesOfValue(1, 0)) {
22009         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22010         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22011         SDValue ResNode =
22012           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22013                                   Ld->getMemoryVT(),
22014                                   Ld->getPointerInfo(),
22015                                   Ld->getAlignment(),
22016                                   false/*isVolatile*/, true/*ReadMem*/,
22017                                   false/*WriteMem*/);
22018
22019         // Make sure the newly-created LOAD is in the same position as Ld in
22020         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22021         // and update uses of Ld's output chain to use the TokenFactor.
22022         if (Ld->hasAnyUseOfValue(1)) {
22023           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22024                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22025           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22026           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22027                                  SDValue(ResNode.getNode(), 1));
22028         }
22029
22030         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22031       }
22032     }
22033
22034     // Emit a zeroed vector and insert the desired subvector on its
22035     // first half.
22036     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22037     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22038     return DCI.CombineTo(N, InsV);
22039   }
22040
22041   //===--------------------------------------------------------------------===//
22042   // Combine some shuffles into subvector extracts and inserts:
22043   //
22044
22045   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22046   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22047     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22048     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22049     return DCI.CombineTo(N, InsV);
22050   }
22051
22052   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22053   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22054     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22055     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22056     return DCI.CombineTo(N, InsV);
22057   }
22058
22059   return SDValue();
22060 }
22061
22062 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22063 /// possible.
22064 ///
22065 /// This is the leaf of the recursive combinine below. When we have found some
22066 /// chain of single-use x86 shuffle instructions and accumulated the combined
22067 /// shuffle mask represented by them, this will try to pattern match that mask
22068 /// into either a single instruction if there is a special purpose instruction
22069 /// for this operation, or into a PSHUFB instruction which is a fully general
22070 /// instruction but should only be used to replace chains over a certain depth.
22071 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22072                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22073                                    TargetLowering::DAGCombinerInfo &DCI,
22074                                    const X86Subtarget *Subtarget) {
22075   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22076
22077   // Find the operand that enters the chain. Note that multiple uses are OK
22078   // here, we're not going to remove the operand we find.
22079   SDValue Input = Op.getOperand(0);
22080   while (Input.getOpcode() == ISD::BITCAST)
22081     Input = Input.getOperand(0);
22082
22083   MVT VT = Input.getSimpleValueType();
22084   MVT RootVT = Root.getSimpleValueType();
22085   SDLoc DL(Root);
22086
22087   // Just remove no-op shuffle masks.
22088   if (Mask.size() == 1) {
22089     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22090                   /*AddTo*/ true);
22091     return true;
22092   }
22093
22094   // Use the float domain if the operand type is a floating point type.
22095   bool FloatDomain = VT.isFloatingPoint();
22096
22097   // For floating point shuffles, we don't have free copies in the shuffle
22098   // instructions or the ability to load as part of the instruction, so
22099   // canonicalize their shuffles to UNPCK or MOV variants.
22100   //
22101   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22102   // vectors because it can have a load folded into it that UNPCK cannot. This
22103   // doesn't preclude something switching to the shorter encoding post-RA.
22104   if (FloatDomain) {
22105     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22106       bool Lo = Mask.equals(0, 0);
22107       unsigned Shuffle;
22108       MVT ShuffleVT;
22109       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22110       // is no slower than UNPCKLPD but has the option to fold the input operand
22111       // into even an unaligned memory load.
22112       if (Lo && Subtarget->hasSSE3()) {
22113         Shuffle = X86ISD::MOVDDUP;
22114         ShuffleVT = MVT::v2f64;
22115       } else {
22116         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22117         // than the UNPCK variants.
22118         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22119         ShuffleVT = MVT::v4f32;
22120       }
22121       if (Depth == 1 && Root->getOpcode() == Shuffle)
22122         return false; // Nothing to do!
22123       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22124       DCI.AddToWorklist(Op.getNode());
22125       if (Shuffle == X86ISD::MOVDDUP)
22126         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22127       else
22128         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22129       DCI.AddToWorklist(Op.getNode());
22130       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22131                     /*AddTo*/ true);
22132       return true;
22133     }
22134     if (Subtarget->hasSSE3() &&
22135         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22136       bool Lo = Mask.equals(0, 0, 2, 2);
22137       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22138       MVT ShuffleVT = MVT::v4f32;
22139       if (Depth == 1 && Root->getOpcode() == Shuffle)
22140         return false; // Nothing to do!
22141       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22142       DCI.AddToWorklist(Op.getNode());
22143       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22144       DCI.AddToWorklist(Op.getNode());
22145       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22146                     /*AddTo*/ true);
22147       return true;
22148     }
22149     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22150       bool Lo = Mask.equals(0, 0, 1, 1);
22151       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22152       MVT ShuffleVT = MVT::v4f32;
22153       if (Depth == 1 && Root->getOpcode() == Shuffle)
22154         return false; // Nothing to do!
22155       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22156       DCI.AddToWorklist(Op.getNode());
22157       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22158       DCI.AddToWorklist(Op.getNode());
22159       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22160                     /*AddTo*/ true);
22161       return true;
22162     }
22163   }
22164
22165   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22166   // variants as none of these have single-instruction variants that are
22167   // superior to the UNPCK formulation.
22168   if (!FloatDomain &&
22169       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22170        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22171        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22172        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22173                    15))) {
22174     bool Lo = Mask[0] == 0;
22175     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22176     if (Depth == 1 && Root->getOpcode() == Shuffle)
22177       return false; // Nothing to do!
22178     MVT ShuffleVT;
22179     switch (Mask.size()) {
22180     case 8:
22181       ShuffleVT = MVT::v8i16;
22182       break;
22183     case 16:
22184       ShuffleVT = MVT::v16i8;
22185       break;
22186     default:
22187       llvm_unreachable("Impossible mask size!");
22188     };
22189     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22190     DCI.AddToWorklist(Op.getNode());
22191     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22192     DCI.AddToWorklist(Op.getNode());
22193     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22194                   /*AddTo*/ true);
22195     return true;
22196   }
22197
22198   // Don't try to re-form single instruction chains under any circumstances now
22199   // that we've done encoding canonicalization for them.
22200   if (Depth < 2)
22201     return false;
22202
22203   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22204   // can replace them with a single PSHUFB instruction profitably. Intel's
22205   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22206   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22207   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22208     SmallVector<SDValue, 16> PSHUFBMask;
22209     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22210     int Ratio = 16 / Mask.size();
22211     for (unsigned i = 0; i < 16; ++i) {
22212       if (Mask[i / Ratio] == SM_SentinelUndef) {
22213         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22214         continue;
22215       }
22216       int M = Mask[i / Ratio] != SM_SentinelZero
22217                   ? Ratio * Mask[i / Ratio] + i % Ratio
22218                   : 255;
22219       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22220     }
22221     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22222     DCI.AddToWorklist(Op.getNode());
22223     SDValue PSHUFBMaskOp =
22224         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22225     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22226     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22227     DCI.AddToWorklist(Op.getNode());
22228     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22229                   /*AddTo*/ true);
22230     return true;
22231   }
22232
22233   // Failed to find any combines.
22234   return false;
22235 }
22236
22237 /// \brief Fully generic combining of x86 shuffle instructions.
22238 ///
22239 /// This should be the last combine run over the x86 shuffle instructions. Once
22240 /// they have been fully optimized, this will recursively consider all chains
22241 /// of single-use shuffle instructions, build a generic model of the cumulative
22242 /// shuffle operation, and check for simpler instructions which implement this
22243 /// operation. We use this primarily for two purposes:
22244 ///
22245 /// 1) Collapse generic shuffles to specialized single instructions when
22246 ///    equivalent. In most cases, this is just an encoding size win, but
22247 ///    sometimes we will collapse multiple generic shuffles into a single
22248 ///    special-purpose shuffle.
22249 /// 2) Look for sequences of shuffle instructions with 3 or more total
22250 ///    instructions, and replace them with the slightly more expensive SSSE3
22251 ///    PSHUFB instruction if available. We do this as the last combining step
22252 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22253 ///    a suitable short sequence of other instructions. The PSHUFB will either
22254 ///    use a register or have to read from memory and so is slightly (but only
22255 ///    slightly) more expensive than the other shuffle instructions.
22256 ///
22257 /// Because this is inherently a quadratic operation (for each shuffle in
22258 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22259 /// This should never be an issue in practice as the shuffle lowering doesn't
22260 /// produce sequences of more than 8 instructions.
22261 ///
22262 /// FIXME: We will currently miss some cases where the redundant shuffling
22263 /// would simplify under the threshold for PSHUFB formation because of
22264 /// combine-ordering. To fix this, we should do the redundant instruction
22265 /// combining in this recursive walk.
22266 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22267                                           ArrayRef<int> RootMask,
22268                                           int Depth, bool HasPSHUFB,
22269                                           SelectionDAG &DAG,
22270                                           TargetLowering::DAGCombinerInfo &DCI,
22271                                           const X86Subtarget *Subtarget) {
22272   // Bound the depth of our recursive combine because this is ultimately
22273   // quadratic in nature.
22274   if (Depth > 8)
22275     return false;
22276
22277   // Directly rip through bitcasts to find the underlying operand.
22278   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22279     Op = Op.getOperand(0);
22280
22281   MVT VT = Op.getSimpleValueType();
22282   if (!VT.isVector())
22283     return false; // Bail if we hit a non-vector.
22284   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22285   // version should be added.
22286   if (VT.getSizeInBits() != 128)
22287     return false;
22288
22289   assert(Root.getSimpleValueType().isVector() &&
22290          "Shuffles operate on vector types!");
22291   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22292          "Can only combine shuffles of the same vector register size.");
22293
22294   if (!isTargetShuffle(Op.getOpcode()))
22295     return false;
22296   SmallVector<int, 16> OpMask;
22297   bool IsUnary;
22298   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22299   // We only can combine unary shuffles which we can decode the mask for.
22300   if (!HaveMask || !IsUnary)
22301     return false;
22302
22303   assert(VT.getVectorNumElements() == OpMask.size() &&
22304          "Different mask size from vector size!");
22305   assert(((RootMask.size() > OpMask.size() &&
22306            RootMask.size() % OpMask.size() == 0) ||
22307           (OpMask.size() > RootMask.size() &&
22308            OpMask.size() % RootMask.size() == 0) ||
22309           OpMask.size() == RootMask.size()) &&
22310          "The smaller number of elements must divide the larger.");
22311   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22312   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22313   assert(((RootRatio == 1 && OpRatio == 1) ||
22314           (RootRatio == 1) != (OpRatio == 1)) &&
22315          "Must not have a ratio for both incoming and op masks!");
22316
22317   SmallVector<int, 16> Mask;
22318   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22319
22320   // Merge this shuffle operation's mask into our accumulated mask. Note that
22321   // this shuffle's mask will be the first applied to the input, followed by the
22322   // root mask to get us all the way to the root value arrangement. The reason
22323   // for this order is that we are recursing up the operation chain.
22324   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22325     int RootIdx = i / RootRatio;
22326     if (RootMask[RootIdx] < 0) {
22327       // This is a zero or undef lane, we're done.
22328       Mask.push_back(RootMask[RootIdx]);
22329       continue;
22330     }
22331
22332     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22333     int OpIdx = RootMaskedIdx / OpRatio;
22334     if (OpMask[OpIdx] < 0) {
22335       // The incoming lanes are zero or undef, it doesn't matter which ones we
22336       // are using.
22337       Mask.push_back(OpMask[OpIdx]);
22338       continue;
22339     }
22340
22341     // Ok, we have non-zero lanes, map them through.
22342     Mask.push_back(OpMask[OpIdx] * OpRatio +
22343                    RootMaskedIdx % OpRatio);
22344   }
22345
22346   // See if we can recurse into the operand to combine more things.
22347   switch (Op.getOpcode()) {
22348     case X86ISD::PSHUFB:
22349       HasPSHUFB = true;
22350     case X86ISD::PSHUFD:
22351     case X86ISD::PSHUFHW:
22352     case X86ISD::PSHUFLW:
22353       if (Op.getOperand(0).hasOneUse() &&
22354           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22355                                         HasPSHUFB, DAG, DCI, Subtarget))
22356         return true;
22357       break;
22358
22359     case X86ISD::UNPCKL:
22360     case X86ISD::UNPCKH:
22361       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22362       // We can't check for single use, we have to check that this shuffle is the only user.
22363       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22364           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22365                                         HasPSHUFB, DAG, DCI, Subtarget))
22366           return true;
22367       break;
22368   }
22369
22370   // Minor canonicalization of the accumulated shuffle mask to make it easier
22371   // to match below. All this does is detect masks with squential pairs of
22372   // elements, and shrink them to the half-width mask. It does this in a loop
22373   // so it will reduce the size of the mask to the minimal width mask which
22374   // performs an equivalent shuffle.
22375   SmallVector<int, 16> WidenedMask;
22376   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22377     Mask = std::move(WidenedMask);
22378     WidenedMask.clear();
22379   }
22380
22381   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22382                                 Subtarget);
22383 }
22384
22385 /// \brief Get the PSHUF-style mask from PSHUF node.
22386 ///
22387 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22388 /// PSHUF-style masks that can be reused with such instructions.
22389 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22390   SmallVector<int, 4> Mask;
22391   bool IsUnary;
22392   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22393   (void)HaveMask;
22394   assert(HaveMask);
22395
22396   switch (N.getOpcode()) {
22397   case X86ISD::PSHUFD:
22398     return Mask;
22399   case X86ISD::PSHUFLW:
22400     Mask.resize(4);
22401     return Mask;
22402   case X86ISD::PSHUFHW:
22403     Mask.erase(Mask.begin(), Mask.begin() + 4);
22404     for (int &M : Mask)
22405       M -= 4;
22406     return Mask;
22407   default:
22408     llvm_unreachable("No valid shuffle instruction found!");
22409   }
22410 }
22411
22412 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22413 ///
22414 /// We walk up the chain and look for a combinable shuffle, skipping over
22415 /// shuffles that we could hoist this shuffle's transformation past without
22416 /// altering anything.
22417 static SDValue
22418 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22419                              SelectionDAG &DAG,
22420                              TargetLowering::DAGCombinerInfo &DCI) {
22421   assert(N.getOpcode() == X86ISD::PSHUFD &&
22422          "Called with something other than an x86 128-bit half shuffle!");
22423   SDLoc DL(N);
22424
22425   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22426   // of the shuffles in the chain so that we can form a fresh chain to replace
22427   // this one.
22428   SmallVector<SDValue, 8> Chain;
22429   SDValue V = N.getOperand(0);
22430   for (; V.hasOneUse(); V = V.getOperand(0)) {
22431     switch (V.getOpcode()) {
22432     default:
22433       return SDValue(); // Nothing combined!
22434
22435     case ISD::BITCAST:
22436       // Skip bitcasts as we always know the type for the target specific
22437       // instructions.
22438       continue;
22439
22440     case X86ISD::PSHUFD:
22441       // Found another dword shuffle.
22442       break;
22443
22444     case X86ISD::PSHUFLW:
22445       // Check that the low words (being shuffled) are the identity in the
22446       // dword shuffle, and the high words are self-contained.
22447       if (Mask[0] != 0 || Mask[1] != 1 ||
22448           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22449         return SDValue();
22450
22451       Chain.push_back(V);
22452       continue;
22453
22454     case X86ISD::PSHUFHW:
22455       // Check that the high words (being shuffled) are the identity in the
22456       // dword shuffle, and the low words are self-contained.
22457       if (Mask[2] != 2 || Mask[3] != 3 ||
22458           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22459         return SDValue();
22460
22461       Chain.push_back(V);
22462       continue;
22463
22464     case X86ISD::UNPCKL:
22465     case X86ISD::UNPCKH:
22466       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22467       // shuffle into a preceding word shuffle.
22468       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22469         return SDValue();
22470
22471       // Search for a half-shuffle which we can combine with.
22472       unsigned CombineOp =
22473           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22474       if (V.getOperand(0) != V.getOperand(1) ||
22475           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22476         return SDValue();
22477       Chain.push_back(V);
22478       V = V.getOperand(0);
22479       do {
22480         switch (V.getOpcode()) {
22481         default:
22482           return SDValue(); // Nothing to combine.
22483
22484         case X86ISD::PSHUFLW:
22485         case X86ISD::PSHUFHW:
22486           if (V.getOpcode() == CombineOp)
22487             break;
22488
22489           Chain.push_back(V);
22490
22491           // Fallthrough!
22492         case ISD::BITCAST:
22493           V = V.getOperand(0);
22494           continue;
22495         }
22496         break;
22497       } while (V.hasOneUse());
22498       break;
22499     }
22500     // Break out of the loop if we break out of the switch.
22501     break;
22502   }
22503
22504   if (!V.hasOneUse())
22505     // We fell out of the loop without finding a viable combining instruction.
22506     return SDValue();
22507
22508   // Merge this node's mask and our incoming mask.
22509   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22510   for (int &M : Mask)
22511     M = VMask[M];
22512   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22513                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22514
22515   // Rebuild the chain around this new shuffle.
22516   while (!Chain.empty()) {
22517     SDValue W = Chain.pop_back_val();
22518
22519     if (V.getValueType() != W.getOperand(0).getValueType())
22520       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22521
22522     switch (W.getOpcode()) {
22523     default:
22524       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22525
22526     case X86ISD::UNPCKL:
22527     case X86ISD::UNPCKH:
22528       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22529       break;
22530
22531     case X86ISD::PSHUFD:
22532     case X86ISD::PSHUFLW:
22533     case X86ISD::PSHUFHW:
22534       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22535       break;
22536     }
22537   }
22538   if (V.getValueType() != N.getValueType())
22539     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22540
22541   // Return the new chain to replace N.
22542   return V;
22543 }
22544
22545 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22546 ///
22547 /// We walk up the chain, skipping shuffles of the other half and looking
22548 /// through shuffles which switch halves trying to find a shuffle of the same
22549 /// pair of dwords.
22550 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22551                                         SelectionDAG &DAG,
22552                                         TargetLowering::DAGCombinerInfo &DCI) {
22553   assert(
22554       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22555       "Called with something other than an x86 128-bit half shuffle!");
22556   SDLoc DL(N);
22557   unsigned CombineOpcode = N.getOpcode();
22558
22559   // Walk up a single-use chain looking for a combinable shuffle.
22560   SDValue V = N.getOperand(0);
22561   for (; V.hasOneUse(); V = V.getOperand(0)) {
22562     switch (V.getOpcode()) {
22563     default:
22564       return false; // Nothing combined!
22565
22566     case ISD::BITCAST:
22567       // Skip bitcasts as we always know the type for the target specific
22568       // instructions.
22569       continue;
22570
22571     case X86ISD::PSHUFLW:
22572     case X86ISD::PSHUFHW:
22573       if (V.getOpcode() == CombineOpcode)
22574         break;
22575
22576       // Other-half shuffles are no-ops.
22577       continue;
22578     }
22579     // Break out of the loop if we break out of the switch.
22580     break;
22581   }
22582
22583   if (!V.hasOneUse())
22584     // We fell out of the loop without finding a viable combining instruction.
22585     return false;
22586
22587   // Combine away the bottom node as its shuffle will be accumulated into
22588   // a preceding shuffle.
22589   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22590
22591   // Record the old value.
22592   SDValue Old = V;
22593
22594   // Merge this node's mask and our incoming mask (adjusted to account for all
22595   // the pshufd instructions encountered).
22596   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22597   for (int &M : Mask)
22598     M = VMask[M];
22599   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22600                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22601
22602   // Check that the shuffles didn't cancel each other out. If not, we need to
22603   // combine to the new one.
22604   if (Old != V)
22605     // Replace the combinable shuffle with the combined one, updating all users
22606     // so that we re-evaluate the chain here.
22607     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22608
22609   return true;
22610 }
22611
22612 /// \brief Try to combine x86 target specific shuffles.
22613 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22614                                            TargetLowering::DAGCombinerInfo &DCI,
22615                                            const X86Subtarget *Subtarget) {
22616   SDLoc DL(N);
22617   MVT VT = N.getSimpleValueType();
22618   SmallVector<int, 4> Mask;
22619
22620   switch (N.getOpcode()) {
22621   case X86ISD::PSHUFD:
22622   case X86ISD::PSHUFLW:
22623   case X86ISD::PSHUFHW:
22624     Mask = getPSHUFShuffleMask(N);
22625     assert(Mask.size() == 4);
22626     break;
22627   default:
22628     return SDValue();
22629   }
22630
22631   // Nuke no-op shuffles that show up after combining.
22632   if (isNoopShuffleMask(Mask))
22633     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22634
22635   // Look for simplifications involving one or two shuffle instructions.
22636   SDValue V = N.getOperand(0);
22637   switch (N.getOpcode()) {
22638   default:
22639     break;
22640   case X86ISD::PSHUFLW:
22641   case X86ISD::PSHUFHW:
22642     assert(VT == MVT::v8i16);
22643     (void)VT;
22644
22645     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22646       return SDValue(); // We combined away this shuffle, so we're done.
22647
22648     // See if this reduces to a PSHUFD which is no more expensive and can
22649     // combine with more operations. Note that it has to at least flip the
22650     // dwords as otherwise it would have been removed as a no-op.
22651     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22652       int DMask[] = {0, 1, 2, 3};
22653       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22654       DMask[DOffset + 0] = DOffset + 1;
22655       DMask[DOffset + 1] = DOffset + 0;
22656       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22657       DCI.AddToWorklist(V.getNode());
22658       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22659                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22660       DCI.AddToWorklist(V.getNode());
22661       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22662     }
22663
22664     // Look for shuffle patterns which can be implemented as a single unpack.
22665     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22666     // only works when we have a PSHUFD followed by two half-shuffles.
22667     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22668         (V.getOpcode() == X86ISD::PSHUFLW ||
22669          V.getOpcode() == X86ISD::PSHUFHW) &&
22670         V.getOpcode() != N.getOpcode() &&
22671         V.hasOneUse()) {
22672       SDValue D = V.getOperand(0);
22673       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22674         D = D.getOperand(0);
22675       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22676         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22677         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22678         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22679         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22680         int WordMask[8];
22681         for (int i = 0; i < 4; ++i) {
22682           WordMask[i + NOffset] = Mask[i] + NOffset;
22683           WordMask[i + VOffset] = VMask[i] + VOffset;
22684         }
22685         // Map the word mask through the DWord mask.
22686         int MappedMask[8];
22687         for (int i = 0; i < 8; ++i)
22688           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22689         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22690         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22691         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22692                        std::begin(UnpackLoMask)) ||
22693             std::equal(std::begin(MappedMask), std::end(MappedMask),
22694                        std::begin(UnpackHiMask))) {
22695           // We can replace all three shuffles with an unpack.
22696           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22697           DCI.AddToWorklist(V.getNode());
22698           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22699                                                 : X86ISD::UNPCKH,
22700                              DL, MVT::v8i16, V, V);
22701         }
22702       }
22703     }
22704
22705     break;
22706
22707   case X86ISD::PSHUFD:
22708     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22709       return NewN;
22710
22711     break;
22712   }
22713
22714   return SDValue();
22715 }
22716
22717 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22718 ///
22719 /// We combine this directly on the abstract vector shuffle nodes so it is
22720 /// easier to generically match. We also insert dummy vector shuffle nodes for
22721 /// the operands which explicitly discard the lanes which are unused by this
22722 /// operation to try to flow through the rest of the combiner the fact that
22723 /// they're unused.
22724 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22725   SDLoc DL(N);
22726   EVT VT = N->getValueType(0);
22727
22728   // We only handle target-independent shuffles.
22729   // FIXME: It would be easy and harmless to use the target shuffle mask
22730   // extraction tool to support more.
22731   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22732     return SDValue();
22733
22734   auto *SVN = cast<ShuffleVectorSDNode>(N);
22735   ArrayRef<int> Mask = SVN->getMask();
22736   SDValue V1 = N->getOperand(0);
22737   SDValue V2 = N->getOperand(1);
22738
22739   // We require the first shuffle operand to be the SUB node, and the second to
22740   // be the ADD node.
22741   // FIXME: We should support the commuted patterns.
22742   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22743     return SDValue();
22744
22745   // If there are other uses of these operations we can't fold them.
22746   if (!V1->hasOneUse() || !V2->hasOneUse())
22747     return SDValue();
22748
22749   // Ensure that both operations have the same operands. Note that we can
22750   // commute the FADD operands.
22751   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22752   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22753       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22754     return SDValue();
22755
22756   // We're looking for blends between FADD and FSUB nodes. We insist on these
22757   // nodes being lined up in a specific expected pattern.
22758   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22759         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22760         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22761     return SDValue();
22762
22763   // Only specific types are legal at this point, assert so we notice if and
22764   // when these change.
22765   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22766           VT == MVT::v4f64) &&
22767          "Unknown vector type encountered!");
22768
22769   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22770 }
22771
22772 /// PerformShuffleCombine - Performs several different shuffle combines.
22773 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22774                                      TargetLowering::DAGCombinerInfo &DCI,
22775                                      const X86Subtarget *Subtarget) {
22776   SDLoc dl(N);
22777   SDValue N0 = N->getOperand(0);
22778   SDValue N1 = N->getOperand(1);
22779   EVT VT = N->getValueType(0);
22780
22781   // Don't create instructions with illegal types after legalize types has run.
22782   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22783   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22784     return SDValue();
22785
22786   // If we have legalized the vector types, look for blends of FADD and FSUB
22787   // nodes that we can fuse into an ADDSUB node.
22788   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22789     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22790       return AddSub;
22791
22792   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22793   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22794       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22795     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22796
22797   // During Type Legalization, when promoting illegal vector types,
22798   // the backend might introduce new shuffle dag nodes and bitcasts.
22799   //
22800   // This code performs the following transformation:
22801   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22802   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22803   //
22804   // We do this only if both the bitcast and the BINOP dag nodes have
22805   // one use. Also, perform this transformation only if the new binary
22806   // operation is legal. This is to avoid introducing dag nodes that
22807   // potentially need to be further expanded (or custom lowered) into a
22808   // less optimal sequence of dag nodes.
22809   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22810       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22811       N0.getOpcode() == ISD::BITCAST) {
22812     SDValue BC0 = N0.getOperand(0);
22813     EVT SVT = BC0.getValueType();
22814     unsigned Opcode = BC0.getOpcode();
22815     unsigned NumElts = VT.getVectorNumElements();
22816
22817     if (BC0.hasOneUse() && SVT.isVector() &&
22818         SVT.getVectorNumElements() * 2 == NumElts &&
22819         TLI.isOperationLegal(Opcode, VT)) {
22820       bool CanFold = false;
22821       switch (Opcode) {
22822       default : break;
22823       case ISD::ADD :
22824       case ISD::FADD :
22825       case ISD::SUB :
22826       case ISD::FSUB :
22827       case ISD::MUL :
22828       case ISD::FMUL :
22829         CanFold = true;
22830       }
22831
22832       unsigned SVTNumElts = SVT.getVectorNumElements();
22833       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22834       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22835         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22836       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22837         CanFold = SVOp->getMaskElt(i) < 0;
22838
22839       if (CanFold) {
22840         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22841         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22842         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22843         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22844       }
22845     }
22846   }
22847
22848   // Only handle 128 wide vector from here on.
22849   if (!VT.is128BitVector())
22850     return SDValue();
22851
22852   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22853   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22854   // consecutive, non-overlapping, and in the right order.
22855   SmallVector<SDValue, 16> Elts;
22856   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22857     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22858
22859   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22860   if (LD.getNode())
22861     return LD;
22862
22863   if (isTargetShuffle(N->getOpcode())) {
22864     SDValue Shuffle =
22865         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22866     if (Shuffle.getNode())
22867       return Shuffle;
22868
22869     // Try recursively combining arbitrary sequences of x86 shuffle
22870     // instructions into higher-order shuffles. We do this after combining
22871     // specific PSHUF instruction sequences into their minimal form so that we
22872     // can evaluate how many specialized shuffle instructions are involved in
22873     // a particular chain.
22874     SmallVector<int, 1> NonceMask; // Just a placeholder.
22875     NonceMask.push_back(0);
22876     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22877                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22878                                       DCI, Subtarget))
22879       return SDValue(); // This routine will use CombineTo to replace N.
22880   }
22881
22882   return SDValue();
22883 }
22884
22885 /// PerformTruncateCombine - Converts truncate operation to
22886 /// a sequence of vector shuffle operations.
22887 /// It is possible when we truncate 256-bit vector to 128-bit vector
22888 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22889                                       TargetLowering::DAGCombinerInfo &DCI,
22890                                       const X86Subtarget *Subtarget)  {
22891   return SDValue();
22892 }
22893
22894 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22895 /// specific shuffle of a load can be folded into a single element load.
22896 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22897 /// shuffles have been custom lowered so we need to handle those here.
22898 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22899                                          TargetLowering::DAGCombinerInfo &DCI) {
22900   if (DCI.isBeforeLegalizeOps())
22901     return SDValue();
22902
22903   SDValue InVec = N->getOperand(0);
22904   SDValue EltNo = N->getOperand(1);
22905
22906   if (!isa<ConstantSDNode>(EltNo))
22907     return SDValue();
22908
22909   EVT OriginalVT = InVec.getValueType();
22910
22911   if (InVec.getOpcode() == ISD::BITCAST) {
22912     // Don't duplicate a load with other uses.
22913     if (!InVec.hasOneUse())
22914       return SDValue();
22915     EVT BCVT = InVec.getOperand(0).getValueType();
22916     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22917       return SDValue();
22918     InVec = InVec.getOperand(0);
22919   }
22920
22921   EVT CurrentVT = InVec.getValueType();
22922
22923   if (!isTargetShuffle(InVec.getOpcode()))
22924     return SDValue();
22925
22926   // Don't duplicate a load with other uses.
22927   if (!InVec.hasOneUse())
22928     return SDValue();
22929
22930   SmallVector<int, 16> ShuffleMask;
22931   bool UnaryShuffle;
22932   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22933                             ShuffleMask, UnaryShuffle))
22934     return SDValue();
22935
22936   // Select the input vector, guarding against out of range extract vector.
22937   unsigned NumElems = CurrentVT.getVectorNumElements();
22938   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22939   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22940   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22941                                          : InVec.getOperand(1);
22942
22943   // If inputs to shuffle are the same for both ops, then allow 2 uses
22944   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22945                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22946
22947   if (LdNode.getOpcode() == ISD::BITCAST) {
22948     // Don't duplicate a load with other uses.
22949     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22950       return SDValue();
22951
22952     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22953     LdNode = LdNode.getOperand(0);
22954   }
22955
22956   if (!ISD::isNormalLoad(LdNode.getNode()))
22957     return SDValue();
22958
22959   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22960
22961   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22962     return SDValue();
22963
22964   EVT EltVT = N->getValueType(0);
22965   // If there's a bitcast before the shuffle, check if the load type and
22966   // alignment is valid.
22967   unsigned Align = LN0->getAlignment();
22968   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22969   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22970       EltVT.getTypeForEVT(*DAG.getContext()));
22971
22972   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22973     return SDValue();
22974
22975   // All checks match so transform back to vector_shuffle so that DAG combiner
22976   // can finish the job
22977   SDLoc dl(N);
22978
22979   // Create shuffle node taking into account the case that its a unary shuffle
22980   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22981                                    : InVec.getOperand(1);
22982   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22983                                  InVec.getOperand(0), Shuffle,
22984                                  &ShuffleMask[0]);
22985   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22986   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22987                      EltNo);
22988 }
22989
22990 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
22991 /// special and don't usually play with other vector types, it's better to
22992 /// handle them early to be sure we emit efficient code by avoiding
22993 /// store-load conversions.
22994 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
22995   if (N->getValueType(0) != MVT::x86mmx ||
22996       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
22997       N->getOperand(0)->getValueType(0) != MVT::v2i32)
22998     return SDValue();
22999
23000   SDValue V = N->getOperand(0);
23001   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23002   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23003     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23004                        N->getValueType(0), V.getOperand(0));
23005
23006   return SDValue();
23007 }
23008
23009 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23010 /// generation and convert it from being a bunch of shuffles and extracts
23011 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23012 /// storing the value and loading scalars back, while for x64 we should
23013 /// use 64-bit extracts and shifts.
23014 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23015                                          TargetLowering::DAGCombinerInfo &DCI) {
23016   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23017   if (NewOp.getNode())
23018     return NewOp;
23019
23020   SDValue InputVector = N->getOperand(0);
23021
23022   // Detect mmx to i32 conversion through a v2i32 elt extract.
23023   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23024       N->getValueType(0) == MVT::i32 &&
23025       InputVector.getValueType() == MVT::v2i32) {
23026
23027     // The bitcast source is a direct mmx result.
23028     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23029     if (MMXSrc.getValueType() == MVT::x86mmx)
23030       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23031                          N->getValueType(0),
23032                          InputVector.getNode()->getOperand(0));
23033
23034     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23035     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23036     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23037         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23038         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23039         MMXSrcOp.getValueType() == MVT::v1i64 &&
23040         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23041       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23042                          N->getValueType(0),
23043                          MMXSrcOp.getOperand(0));
23044   }
23045
23046   // Only operate on vectors of 4 elements, where the alternative shuffling
23047   // gets to be more expensive.
23048   if (InputVector.getValueType() != MVT::v4i32)
23049     return SDValue();
23050
23051   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23052   // single use which is a sign-extend or zero-extend, and all elements are
23053   // used.
23054   SmallVector<SDNode *, 4> Uses;
23055   unsigned ExtractedElements = 0;
23056   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23057        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23058     if (UI.getUse().getResNo() != InputVector.getResNo())
23059       return SDValue();
23060
23061     SDNode *Extract = *UI;
23062     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23063       return SDValue();
23064
23065     if (Extract->getValueType(0) != MVT::i32)
23066       return SDValue();
23067     if (!Extract->hasOneUse())
23068       return SDValue();
23069     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23070         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23071       return SDValue();
23072     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23073       return SDValue();
23074
23075     // Record which element was extracted.
23076     ExtractedElements |=
23077       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23078
23079     Uses.push_back(Extract);
23080   }
23081
23082   // If not all the elements were used, this may not be worthwhile.
23083   if (ExtractedElements != 15)
23084     return SDValue();
23085
23086   // Ok, we've now decided to do the transformation.
23087   // If 64-bit shifts are legal, use the extract-shift sequence,
23088   // otherwise bounce the vector off the cache.
23089   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23090   SDValue Vals[4];
23091   SDLoc dl(InputVector);
23092
23093   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23094     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23095     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23096     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23097       DAG.getConstant(0, VecIdxTy));
23098     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23099       DAG.getConstant(1, VecIdxTy));
23100
23101     SDValue ShAmt = DAG.getConstant(32,
23102       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23103     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23104     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23105       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23106     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23107     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23108       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23109   } else {
23110     // Store the value to a temporary stack slot.
23111     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23112     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23113       MachinePointerInfo(), false, false, 0);
23114
23115     EVT ElementType = InputVector.getValueType().getVectorElementType();
23116     unsigned EltSize = ElementType.getSizeInBits() / 8;
23117
23118     // Replace each use (extract) with a load of the appropriate element.
23119     for (unsigned i = 0; i < 4; ++i) {
23120       uint64_t Offset = EltSize * i;
23121       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23122
23123       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23124                                        StackPtr, OffsetVal);
23125
23126       // Load the scalar.
23127       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23128                             ScalarAddr, MachinePointerInfo(),
23129                             false, false, false, 0);
23130
23131     }
23132   }
23133
23134   // Replace the extracts
23135   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23136     UE = Uses.end(); UI != UE; ++UI) {
23137     SDNode *Extract = *UI;
23138
23139     SDValue Idx = Extract->getOperand(1);
23140     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23141     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23142   }
23143
23144   // The replacement was made in place; don't return anything.
23145   return SDValue();
23146 }
23147
23148 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23149 static std::pair<unsigned, bool>
23150 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23151                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23152   if (!VT.isVector())
23153     return std::make_pair(0, false);
23154
23155   bool NeedSplit = false;
23156   switch (VT.getSimpleVT().SimpleTy) {
23157   default: return std::make_pair(0, false);
23158   case MVT::v4i64:
23159   case MVT::v2i64:
23160     if (!Subtarget->hasVLX())
23161       return std::make_pair(0, false);
23162     break;
23163   case MVT::v64i8:
23164   case MVT::v32i16:
23165     if (!Subtarget->hasBWI())
23166       return std::make_pair(0, false);
23167     break;
23168   case MVT::v16i32:
23169   case MVT::v8i64:
23170     if (!Subtarget->hasAVX512())
23171       return std::make_pair(0, false);
23172     break;
23173   case MVT::v32i8:
23174   case MVT::v16i16:
23175   case MVT::v8i32:
23176     if (!Subtarget->hasAVX2())
23177       NeedSplit = true;
23178     if (!Subtarget->hasAVX())
23179       return std::make_pair(0, false);
23180     break;
23181   case MVT::v16i8:
23182   case MVT::v8i16:
23183   case MVT::v4i32:
23184     if (!Subtarget->hasSSE2())
23185       return std::make_pair(0, false);
23186   }
23187
23188   // SSE2 has only a small subset of the operations.
23189   bool hasUnsigned = Subtarget->hasSSE41() ||
23190                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23191   bool hasSigned = Subtarget->hasSSE41() ||
23192                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23193
23194   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23195
23196   unsigned Opc = 0;
23197   // Check for x CC y ? x : y.
23198   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23199       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23200     switch (CC) {
23201     default: break;
23202     case ISD::SETULT:
23203     case ISD::SETULE:
23204       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23205     case ISD::SETUGT:
23206     case ISD::SETUGE:
23207       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23208     case ISD::SETLT:
23209     case ISD::SETLE:
23210       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23211     case ISD::SETGT:
23212     case ISD::SETGE:
23213       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23214     }
23215   // Check for x CC y ? y : x -- a min/max with reversed arms.
23216   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23217              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23218     switch (CC) {
23219     default: break;
23220     case ISD::SETULT:
23221     case ISD::SETULE:
23222       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23223     case ISD::SETUGT:
23224     case ISD::SETUGE:
23225       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23226     case ISD::SETLT:
23227     case ISD::SETLE:
23228       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23229     case ISD::SETGT:
23230     case ISD::SETGE:
23231       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23232     }
23233   }
23234
23235   return std::make_pair(Opc, NeedSplit);
23236 }
23237
23238 static SDValue
23239 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23240                                       const X86Subtarget *Subtarget) {
23241   SDLoc dl(N);
23242   SDValue Cond = N->getOperand(0);
23243   SDValue LHS = N->getOperand(1);
23244   SDValue RHS = N->getOperand(2);
23245
23246   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23247     SDValue CondSrc = Cond->getOperand(0);
23248     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23249       Cond = CondSrc->getOperand(0);
23250   }
23251
23252   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23253     return SDValue();
23254
23255   // A vselect where all conditions and data are constants can be optimized into
23256   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23257   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23258       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23259     return SDValue();
23260
23261   unsigned MaskValue = 0;
23262   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23263     return SDValue();
23264
23265   MVT VT = N->getSimpleValueType(0);
23266   unsigned NumElems = VT.getVectorNumElements();
23267   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23268   for (unsigned i = 0; i < NumElems; ++i) {
23269     // Be sure we emit undef where we can.
23270     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23271       ShuffleMask[i] = -1;
23272     else
23273       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23274   }
23275
23276   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23277   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23278     return SDValue();
23279   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23280 }
23281
23282 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23283 /// nodes.
23284 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23285                                     TargetLowering::DAGCombinerInfo &DCI,
23286                                     const X86Subtarget *Subtarget) {
23287   SDLoc DL(N);
23288   SDValue Cond = N->getOperand(0);
23289   // Get the LHS/RHS of the select.
23290   SDValue LHS = N->getOperand(1);
23291   SDValue RHS = N->getOperand(2);
23292   EVT VT = LHS.getValueType();
23293   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23294
23295   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23296   // instructions match the semantics of the common C idiom x<y?x:y but not
23297   // x<=y?x:y, because of how they handle negative zero (which can be
23298   // ignored in unsafe-math mode).
23299   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23300   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23301       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23302       (Subtarget->hasSSE2() ||
23303        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23304     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23305
23306     unsigned Opcode = 0;
23307     // Check for x CC y ? x : y.
23308     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23309         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23310       switch (CC) {
23311       default: break;
23312       case ISD::SETULT:
23313         // Converting this to a min would handle NaNs incorrectly, and swapping
23314         // the operands would cause it to handle comparisons between positive
23315         // and negative zero incorrectly.
23316         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23317           if (!DAG.getTarget().Options.UnsafeFPMath &&
23318               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23319             break;
23320           std::swap(LHS, RHS);
23321         }
23322         Opcode = X86ISD::FMIN;
23323         break;
23324       case ISD::SETOLE:
23325         // Converting this to a min would handle comparisons between positive
23326         // and negative zero incorrectly.
23327         if (!DAG.getTarget().Options.UnsafeFPMath &&
23328             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23329           break;
23330         Opcode = X86ISD::FMIN;
23331         break;
23332       case ISD::SETULE:
23333         // Converting this to a min would handle both negative zeros and NaNs
23334         // incorrectly, but we can swap the operands to fix both.
23335         std::swap(LHS, RHS);
23336       case ISD::SETOLT:
23337       case ISD::SETLT:
23338       case ISD::SETLE:
23339         Opcode = X86ISD::FMIN;
23340         break;
23341
23342       case ISD::SETOGE:
23343         // Converting this to a max would handle comparisons between positive
23344         // and negative zero incorrectly.
23345         if (!DAG.getTarget().Options.UnsafeFPMath &&
23346             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23347           break;
23348         Opcode = X86ISD::FMAX;
23349         break;
23350       case ISD::SETUGT:
23351         // Converting this to a max would handle NaNs incorrectly, and swapping
23352         // the operands would cause it to handle comparisons between positive
23353         // and negative zero incorrectly.
23354         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23355           if (!DAG.getTarget().Options.UnsafeFPMath &&
23356               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23357             break;
23358           std::swap(LHS, RHS);
23359         }
23360         Opcode = X86ISD::FMAX;
23361         break;
23362       case ISD::SETUGE:
23363         // Converting this to a max would handle both negative zeros and NaNs
23364         // incorrectly, but we can swap the operands to fix both.
23365         std::swap(LHS, RHS);
23366       case ISD::SETOGT:
23367       case ISD::SETGT:
23368       case ISD::SETGE:
23369         Opcode = X86ISD::FMAX;
23370         break;
23371       }
23372     // Check for x CC y ? y : x -- a min/max with reversed arms.
23373     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23374                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23375       switch (CC) {
23376       default: break;
23377       case ISD::SETOGE:
23378         // Converting this to a min would handle comparisons between positive
23379         // and negative zero incorrectly, and swapping the operands would
23380         // cause it to handle NaNs incorrectly.
23381         if (!DAG.getTarget().Options.UnsafeFPMath &&
23382             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23383           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23384             break;
23385           std::swap(LHS, RHS);
23386         }
23387         Opcode = X86ISD::FMIN;
23388         break;
23389       case ISD::SETUGT:
23390         // Converting this to a min would handle NaNs incorrectly.
23391         if (!DAG.getTarget().Options.UnsafeFPMath &&
23392             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23393           break;
23394         Opcode = X86ISD::FMIN;
23395         break;
23396       case ISD::SETUGE:
23397         // Converting this to a min would handle both negative zeros and NaNs
23398         // incorrectly, but we can swap the operands to fix both.
23399         std::swap(LHS, RHS);
23400       case ISD::SETOGT:
23401       case ISD::SETGT:
23402       case ISD::SETGE:
23403         Opcode = X86ISD::FMIN;
23404         break;
23405
23406       case ISD::SETULT:
23407         // Converting this to a max would handle NaNs incorrectly.
23408         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23409           break;
23410         Opcode = X86ISD::FMAX;
23411         break;
23412       case ISD::SETOLE:
23413         // Converting this to a max would handle comparisons between positive
23414         // and negative zero incorrectly, and swapping the operands would
23415         // cause it to handle NaNs incorrectly.
23416         if (!DAG.getTarget().Options.UnsafeFPMath &&
23417             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23418           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23419             break;
23420           std::swap(LHS, RHS);
23421         }
23422         Opcode = X86ISD::FMAX;
23423         break;
23424       case ISD::SETULE:
23425         // Converting this to a max would handle both negative zeros and NaNs
23426         // incorrectly, but we can swap the operands to fix both.
23427         std::swap(LHS, RHS);
23428       case ISD::SETOLT:
23429       case ISD::SETLT:
23430       case ISD::SETLE:
23431         Opcode = X86ISD::FMAX;
23432         break;
23433       }
23434     }
23435
23436     if (Opcode)
23437       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23438   }
23439
23440   EVT CondVT = Cond.getValueType();
23441   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23442       CondVT.getVectorElementType() == MVT::i1) {
23443     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23444     // lowering on KNL. In this case we convert it to
23445     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23446     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23447     // Since SKX these selects have a proper lowering.
23448     EVT OpVT = LHS.getValueType();
23449     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23450         (OpVT.getVectorElementType() == MVT::i8 ||
23451          OpVT.getVectorElementType() == MVT::i16) &&
23452         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23453       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23454       DCI.AddToWorklist(Cond.getNode());
23455       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23456     }
23457   }
23458   // If this is a select between two integer constants, try to do some
23459   // optimizations.
23460   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23461     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23462       // Don't do this for crazy integer types.
23463       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23464         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23465         // so that TrueC (the true value) is larger than FalseC.
23466         bool NeedsCondInvert = false;
23467
23468         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23469             // Efficiently invertible.
23470             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23471              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23472               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23473           NeedsCondInvert = true;
23474           std::swap(TrueC, FalseC);
23475         }
23476
23477         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23478         if (FalseC->getAPIntValue() == 0 &&
23479             TrueC->getAPIntValue().isPowerOf2()) {
23480           if (NeedsCondInvert) // Invert the condition if needed.
23481             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23482                                DAG.getConstant(1, Cond.getValueType()));
23483
23484           // Zero extend the condition if needed.
23485           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23486
23487           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23488           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23489                              DAG.getConstant(ShAmt, MVT::i8));
23490         }
23491
23492         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23493         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23494           if (NeedsCondInvert) // Invert the condition if needed.
23495             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23496                                DAG.getConstant(1, Cond.getValueType()));
23497
23498           // Zero extend the condition if needed.
23499           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23500                              FalseC->getValueType(0), Cond);
23501           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23502                              SDValue(FalseC, 0));
23503         }
23504
23505         // Optimize cases that will turn into an LEA instruction.  This requires
23506         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23507         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23508           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23509           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23510
23511           bool isFastMultiplier = false;
23512           if (Diff < 10) {
23513             switch ((unsigned char)Diff) {
23514               default: break;
23515               case 1:  // result = add base, cond
23516               case 2:  // result = lea base(    , cond*2)
23517               case 3:  // result = lea base(cond, cond*2)
23518               case 4:  // result = lea base(    , cond*4)
23519               case 5:  // result = lea base(cond, cond*4)
23520               case 8:  // result = lea base(    , cond*8)
23521               case 9:  // result = lea base(cond, cond*8)
23522                 isFastMultiplier = true;
23523                 break;
23524             }
23525           }
23526
23527           if (isFastMultiplier) {
23528             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23529             if (NeedsCondInvert) // Invert the condition if needed.
23530               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23531                                  DAG.getConstant(1, Cond.getValueType()));
23532
23533             // Zero extend the condition if needed.
23534             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23535                                Cond);
23536             // Scale the condition by the difference.
23537             if (Diff != 1)
23538               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23539                                  DAG.getConstant(Diff, Cond.getValueType()));
23540
23541             // Add the base if non-zero.
23542             if (FalseC->getAPIntValue() != 0)
23543               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23544                                  SDValue(FalseC, 0));
23545             return Cond;
23546           }
23547         }
23548       }
23549   }
23550
23551   // Canonicalize max and min:
23552   // (x > y) ? x : y -> (x >= y) ? x : y
23553   // (x < y) ? x : y -> (x <= y) ? x : y
23554   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23555   // the need for an extra compare
23556   // against zero. e.g.
23557   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23558   // subl   %esi, %edi
23559   // testl  %edi, %edi
23560   // movl   $0, %eax
23561   // cmovgl %edi, %eax
23562   // =>
23563   // xorl   %eax, %eax
23564   // subl   %esi, $edi
23565   // cmovsl %eax, %edi
23566   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23567       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23568       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23569     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23570     switch (CC) {
23571     default: break;
23572     case ISD::SETLT:
23573     case ISD::SETGT: {
23574       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23575       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23576                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23577       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23578     }
23579     }
23580   }
23581
23582   // Early exit check
23583   if (!TLI.isTypeLegal(VT))
23584     return SDValue();
23585
23586   // Match VSELECTs into subs with unsigned saturation.
23587   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23588       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23589       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23590        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23591     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23592
23593     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23594     // left side invert the predicate to simplify logic below.
23595     SDValue Other;
23596     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23597       Other = RHS;
23598       CC = ISD::getSetCCInverse(CC, true);
23599     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23600       Other = LHS;
23601     }
23602
23603     if (Other.getNode() && Other->getNumOperands() == 2 &&
23604         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23605       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23606       SDValue CondRHS = Cond->getOperand(1);
23607
23608       // Look for a general sub with unsigned saturation first.
23609       // x >= y ? x-y : 0 --> subus x, y
23610       // x >  y ? x-y : 0 --> subus x, y
23611       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23612           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23613         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23614
23615       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23616         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23617           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23618             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23619               // If the RHS is a constant we have to reverse the const
23620               // canonicalization.
23621               // x > C-1 ? x+-C : 0 --> subus x, C
23622               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23623                   CondRHSConst->getAPIntValue() ==
23624                       (-OpRHSConst->getAPIntValue() - 1))
23625                 return DAG.getNode(
23626                     X86ISD::SUBUS, DL, VT, OpLHS,
23627                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23628
23629           // Another special case: If C was a sign bit, the sub has been
23630           // canonicalized into a xor.
23631           // FIXME: Would it be better to use computeKnownBits to determine
23632           //        whether it's safe to decanonicalize the xor?
23633           // x s< 0 ? x^C : 0 --> subus x, C
23634           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23635               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23636               OpRHSConst->getAPIntValue().isSignBit())
23637             // Note that we have to rebuild the RHS constant here to ensure we
23638             // don't rely on particular values of undef lanes.
23639             return DAG.getNode(
23640                 X86ISD::SUBUS, DL, VT, OpLHS,
23641                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23642         }
23643     }
23644   }
23645
23646   // Try to match a min/max vector operation.
23647   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23648     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23649     unsigned Opc = ret.first;
23650     bool NeedSplit = ret.second;
23651
23652     if (Opc && NeedSplit) {
23653       unsigned NumElems = VT.getVectorNumElements();
23654       // Extract the LHS vectors
23655       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23656       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23657
23658       // Extract the RHS vectors
23659       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23660       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23661
23662       // Create min/max for each subvector
23663       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23664       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23665
23666       // Merge the result
23667       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23668     } else if (Opc)
23669       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23670   }
23671
23672   // Simplify vector selection if condition value type matches vselect
23673   // operand type
23674   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23675     assert(Cond.getValueType().isVector() &&
23676            "vector select expects a vector selector!");
23677
23678     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23679     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23680
23681     // Try invert the condition if true value is not all 1s and false value
23682     // is not all 0s.
23683     if (!TValIsAllOnes && !FValIsAllZeros &&
23684         // Check if the selector will be produced by CMPP*/PCMP*
23685         Cond.getOpcode() == ISD::SETCC &&
23686         // Check if SETCC has already been promoted
23687         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23688       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23689       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23690
23691       if (TValIsAllZeros || FValIsAllOnes) {
23692         SDValue CC = Cond.getOperand(2);
23693         ISD::CondCode NewCC =
23694           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23695                                Cond.getOperand(0).getValueType().isInteger());
23696         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23697         std::swap(LHS, RHS);
23698         TValIsAllOnes = FValIsAllOnes;
23699         FValIsAllZeros = TValIsAllZeros;
23700       }
23701     }
23702
23703     if (TValIsAllOnes || FValIsAllZeros) {
23704       SDValue Ret;
23705
23706       if (TValIsAllOnes && FValIsAllZeros)
23707         Ret = Cond;
23708       else if (TValIsAllOnes)
23709         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23710                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23711       else if (FValIsAllZeros)
23712         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23713                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23714
23715       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23716     }
23717   }
23718
23719   // If we know that this node is legal then we know that it is going to be
23720   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23721   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23722   // to simplify previous instructions.
23723   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23724       !DCI.isBeforeLegalize() &&
23725       // We explicitly check against v8i16 and v16i16 because, although
23726       // they're marked as Custom, they might only be legal when Cond is a
23727       // build_vector of constants. This will be taken care in a later
23728       // condition.
23729       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23730        VT != MVT::v8i16) &&
23731       // Don't optimize vector of constants. Those are handled by
23732       // the generic code and all the bits must be properly set for
23733       // the generic optimizer.
23734       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23735     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23736
23737     // Don't optimize vector selects that map to mask-registers.
23738     if (BitWidth == 1)
23739       return SDValue();
23740
23741     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23742     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23743
23744     APInt KnownZero, KnownOne;
23745     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23746                                           DCI.isBeforeLegalizeOps());
23747     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23748         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23749                                  TLO)) {
23750       // If we changed the computation somewhere in the DAG, this change
23751       // will affect all users of Cond.
23752       // Make sure it is fine and update all the nodes so that we do not
23753       // use the generic VSELECT anymore. Otherwise, we may perform
23754       // wrong optimizations as we messed up with the actual expectation
23755       // for the vector boolean values.
23756       if (Cond != TLO.Old) {
23757         // Check all uses of that condition operand to check whether it will be
23758         // consumed by non-BLEND instructions, which may depend on all bits are
23759         // set properly.
23760         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23761              I != E; ++I)
23762           if (I->getOpcode() != ISD::VSELECT)
23763             // TODO: Add other opcodes eventually lowered into BLEND.
23764             return SDValue();
23765
23766         // Update all the users of the condition, before committing the change,
23767         // so that the VSELECT optimizations that expect the correct vector
23768         // boolean value will not be triggered.
23769         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23770              I != E; ++I)
23771           DAG.ReplaceAllUsesOfValueWith(
23772               SDValue(*I, 0),
23773               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23774                           Cond, I->getOperand(1), I->getOperand(2)));
23775         DCI.CommitTargetLoweringOpt(TLO);
23776         return SDValue();
23777       }
23778       // At this point, only Cond is changed. Change the condition
23779       // just for N to keep the opportunity to optimize all other
23780       // users their own way.
23781       DAG.ReplaceAllUsesOfValueWith(
23782           SDValue(N, 0),
23783           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23784                       TLO.New, N->getOperand(1), N->getOperand(2)));
23785       return SDValue();
23786     }
23787   }
23788
23789   // We should generate an X86ISD::BLENDI from a vselect if its argument
23790   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23791   // constants. This specific pattern gets generated when we split a
23792   // selector for a 512 bit vector in a machine without AVX512 (but with
23793   // 256-bit vectors), during legalization:
23794   //
23795   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23796   //
23797   // Iff we find this pattern and the build_vectors are built from
23798   // constants, we translate the vselect into a shuffle_vector that we
23799   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23800   if ((N->getOpcode() == ISD::VSELECT ||
23801        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23802       !DCI.isBeforeLegalize()) {
23803     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23804     if (Shuffle.getNode())
23805       return Shuffle;
23806   }
23807
23808   return SDValue();
23809 }
23810
23811 // Check whether a boolean test is testing a boolean value generated by
23812 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23813 // code.
23814 //
23815 // Simplify the following patterns:
23816 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23817 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23818 // to (Op EFLAGS Cond)
23819 //
23820 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23821 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23822 // to (Op EFLAGS !Cond)
23823 //
23824 // where Op could be BRCOND or CMOV.
23825 //
      // CC is an in/out parameter. On entry it is the condition the user applies
      // to Cmp; nothing matches unless it is COND_E or COND_NE. On success the
      // flag-producing operand of the matched SETCC / SETCC_CARRY / CMOV is
      // returned and CC is overwritten with the condition to test on it
      // (inverted when required). On failure an empty SDValue is returned and CC
      // is left untouched.
23826 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23827   // Quit unless this is a CMP, or a SUB whose value result (result 0) is unused.
23828   if (Cmp.getOpcode() != X86ISD::CMP &&
23829       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23830       return SDValue();
23831
23832   // Quit if not used as a boolean value.
23833   if (CC != X86::COND_E && CC != X86::COND_NE)
23834     return SDValue();
23835
23836   // Check CMP operands. One of them should be 0 or 1 and the other should be
23837   // an SetCC or extended from it.
23838   SDValue Op1 = Cmp.getOperand(0);
23839   SDValue Op2 = Cmp.getOperand(1);
23840
23841   SDValue SetCC;
23842   const ConstantSDNode* C = nullptr;
        // Testing "== 0" means the original condition must be inverted; comparing
        // against 1 instead of 0 flips this once more (see below).
23843   bool needOppositeCond = (CC == X86::COND_E);
23844   bool checkAgainstTrue = false; // Is it a comparison against 1?
23845
23846   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23847     SetCC = Op2;
23848   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23849     SetCC = Op1;
23850   else // Quit if neither operand is a constant.
23851     return SDValue();
23852
23853   if (C->getZExtValue() == 1) {
23854     needOppositeCond = !needOppositeCond;
23855     checkAgainstTrue = true;
23856   } else if (C->getZExtValue() != 0)
23857     // Quit if the constant is neither 0 or 1.
23858     return SDValue();
23859
        // Remembers whether an explicit (and $x, 1) was peeled off; this is what
        // makes the SETCC_CARRY case below safe when comparing against 1.
23860   bool truncatedToBoolWithAnd = false;
23861   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23862   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23863          SetCC.getOpcode() == ISD::TRUNCATE ||
23864          SetCC.getOpcode() == ISD::AND) {
23865     if (SetCC.getOpcode() == ISD::AND) {
            // Index of the non-constant operand of the AND, or -1 if neither
            // operand is the constant 1 (in which case the AND is not skipped).
23866       int OpIdx = -1;
23867       ConstantSDNode *CS;
23868       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23869           CS->getZExtValue() == 1)
23870         OpIdx = 1;
23871       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23872           CS->getZExtValue() == 1)
23873         OpIdx = 0;
23874       if (OpIdx == -1)
23875         break;
23876       SetCC = SetCC.getOperand(OpIdx);
23877       truncatedToBoolWithAnd = true;
23878     } else
23879       SetCC = SetCC.getOperand(0);
23880   }
23881
23882   switch (SetCC.getOpcode()) {
23883   case X86ISD::SETCC_CARRY:
23884     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23885     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23886     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23887     // truncated to i1 using 'and'.
23888     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23889       break;
23890     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23891            "Invalid use of SETCC_CARRY!");
23892     // FALL THROUGH
23893   case X86ISD::SETCC:
23894     // Set the condition code or opposite one if necessary.
          // Operand 0 is the condition code, operand 1 the value it is tested on.
23895     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23896     if (needOppositeCond)
23897       CC = X86::GetOppositeBranchCondition(CC);
23898     return SetCC.getOperand(1);
23899   case X86ISD::CMOV: {
23900     // Check whether false/true value has canonical one, i.e. 0 or 1.
          // CMOV operands: 0 = false value, 1 = true value, 2 = condition code,
          // 3 = the value the condition is tested on.
23901     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23902     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23903     // Quit if true value is not a constant.
23904     if (!TVal)
23905       return SDValue();
23906     // A non-constant false value is only accepted in the rdrand/rdseed case.
23907     if (!FVal) {
23908       SDValue Op = SetCC.getOperand(0);
23909       // Skip 'zext' or 'trunc' node.
23910       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23911           Op.getOpcode() == ISD::TRUNCATE)
23912         Op = Op.getOperand(0);
23913       // A special case for rdrand/rdseed, where 0 is set if false cond is
23914       // found.
23915       if ((Op.getOpcode() != X86ISD::RDRAND &&
23916            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23917         return SDValue();
23918     }
23919     // Quit if false value is not the constant 0 or 1.
          // (The rdrand/rdseed case above has no constant and is treated as 0.)
23920     bool FValIsFalse = true;
23921     if (FVal && FVal->getZExtValue() != 0) {
23922       if (FVal->getZExtValue() != 1)
23923         return SDValue();
23924       // If FVal is 1, opposite cond is needed.
23925       needOppositeCond = !needOppositeCond;
23926       FValIsFalse = false;
23927     }
23928     // Quit if TVal is not the constant opposite of FVal.
23929     if (FValIsFalse && TVal->getZExtValue() != 1)
23930       return SDValue();
23931     if (!FValIsFalse && TVal->getZExtValue() != 0)
23932       return SDValue();
23933     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23934     if (needOppositeCond)
23935       CC = X86::GetOppositeBranchCondition(CC);
23936     return SetCC.getOperand(3);
23937   }
23938   }
23939
        // No recognised producer (or the unsafe SETCC_CARRY case): no match.
23940   return SDValue();
23941 }
23942
23943 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
      ///
      /// Operand 0 is the value produced when the condition is false, operand 1 the
      /// value produced when it is true, operand 2 the X86 condition code and
      /// operand 3 the value the condition is tested on.
      ///
      /// Returns the replacement value, or an empty SDValue when nothing applies.
      /// When N has a second result, the constant folds below replace N through
      /// DCI.CombineTo instead of returning the new value directly.
23944 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23945                                   TargetLowering::DAGCombinerInfo &DCI,
23946                                   const X86Subtarget *Subtarget) {
23947   SDLoc DL(N);
23948
23949   // If the flag operand isn't dead, don't touch this CMOV.
23950   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23951     return SDValue();
23952
23953   SDValue FalseOp = N->getOperand(0);
23954   SDValue TrueOp = N->getOperand(1);
23955   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23956   SDValue Cond = N->getOperand(3);
23957
23958   if (CC == X86::COND_E || CC == X86::COND_NE) {
23959     switch (Cond.getOpcode()) {
23960     default: break;
23961     case X86ISD::BSR:
23962     case X86ISD::BSF:
23963       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
            // COND_E can then never hold and COND_NE always holds, so the CMOV
            // collapses to one of its inputs.
23964       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23965         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23966     }
23967   }
23968
23969   SDValue Flags;
23970
        // If Cond merely re-tests a boolean produced from flags, test those flags
        // directly. CC is updated in place by the helper on a match.
23971   Flags = checkBoolTestSetCCCombine(Cond, CC);
23972   if (Flags.getNode() &&
23973       // Extra check as FCMOV only supports a subset of X86 cond.
23974       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23975     SDValue Ops[] = { FalseOp, TrueOp,
23976                       DAG.getConstant(CC, MVT::i8), Flags };
23977     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23978   }
23979
23980   // If this is a select between two integer constants, try to do some
23981   // optimizations.  Note that the operands are ordered the opposite of SELECT
23982   // operands.
23983   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23984     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23985       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23986       // larger than FalseC (the false value).
            // The comparison is unsigned; the condition is inverted to compensate.
23987       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23988         CC = X86::GetOppositeBranchCondition(CC);
23989         std::swap(TrueC, FalseC);
23990         std::swap(TrueOp, FalseOp);
23991       }
23992
23993       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23994       // This is efficient for any integer data type (including i8/i16) and
23995       // shift amount.
23996       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23997         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23998                            DAG.getConstant(CC, MVT::i8), Cond);
23999
24000         // Zero extend the condition if needed.
24001         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24002
24003         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24004         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24005                            DAG.getConstant(ShAmt, MVT::i8));
24006         if (N->getNumValues() == 2)  // Dead flag value?
24007           return DCI.CombineTo(N, Cond, SDValue());
24008         return Cond;
24009       }
24010
24011       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
24012       // for any integer data type, including i8/i16.
24013       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24014         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24015                            DAG.getConstant(CC, MVT::i8), Cond);
24016
24017         // Zero extend the condition if needed.
24018         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24019                            FalseC->getValueType(0), Cond);
24020         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24021                            SDValue(FalseC, 0));
24022
24023         if (N->getNumValues() == 2)  // Dead flag value?
24024           return DCI.CombineTo(N, Cond, SDValue());
24025         return Cond;
24026       }
24027
24028       // Optimize cases that will turn into an LEA instruction.  This requires
24029       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24030       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24031         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
              // For i32 only the low 32 bits of the difference are meaningful.
24032         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24033
24034         bool isFastMultiplier = false;
24035         if (Diff < 10) {
24036           switch ((unsigned char)Diff) {
24037           default: break;
24038           case 1:  // result = add base, cond
24039           case 2:  // result = lea base(    , cond*2)
24040           case 3:  // result = lea base(cond, cond*2)
24041           case 4:  // result = lea base(    , cond*4)
24042           case 5:  // result = lea base(cond, cond*4)
24043           case 8:  // result = lea base(    , cond*8)
24044           case 9:  // result = lea base(cond, cond*8)
24045             isFastMultiplier = true;
24046             break;
24047           }
24048         }
24049
24050         if (isFastMultiplier) {
                // Note: this APInt deliberately shadows the uint64_t Diff above so
                // that the constant is built at the width of the operands.
24051           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24052           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24053                              DAG.getConstant(CC, MVT::i8), Cond);
24054           // Zero extend the condition if needed.
24055           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24056                              Cond);
24057           // Scale the condition by the difference.
24058           if (Diff != 1)
24059             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24060                                DAG.getConstant(Diff, Cond.getValueType()));
24061
24062           // Add the base if non-zero.
24063           if (FalseC->getAPIntValue() != 0)
24064             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24065                                SDValue(FalseC, 0));
24066           if (N->getNumValues() == 2)  // Dead flag value?
24067             return DCI.CombineTo(N, Cond, SDValue());
24068           return Cond;
24069         }
24070       }
24071     }
24072   }
24073
24074   // Handle these cases:
24075   //   (select (x != c), e, c) -> (select (x != c), e, x),
24076   //   (select (x == c), c, e) -> (select (x == c), x, e)
24077   // where the c is an integer constant, and the "select" is the combination
24078   // of CMOV and CMP.
24079   //
24080   // The rationale for this change is that the conditional-move from a constant
24081   // needs two instructions, however, conditional-move from a register needs
24082   // only one instruction.
24083   //
24084   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24085   //  some instruction-combining opportunities. This opt needs to be
24086   //  postponed as late as possible.
24087   //
24088   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24089     // the DCI.xxxx conditions are provided to postpone the optimization as
24090     // late as possible.
24091
24092     ConstantSDNode *CmpAgainst = nullptr;
24093     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24094         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24095         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24096
            // Normalise the "!=" form to the "==" form by inverting the condition
            // and swapping the two values. The node identity comparison below
            // requires the CMOV input to be the very same constant node that the
            // compare uses.
24097       if (CC == X86::COND_NE &&
24098           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24099         CC = X86::GetOppositeBranchCondition(CC);
24100         std::swap(TrueOp, FalseOp);
24101       }
24102
24103       if (CC == X86::COND_E &&
24104           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
              // Use the compared value x in place of the constant c as the
              // true value.
24105         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24106                           DAG.getConstant(CC, MVT::i8), Cond };
24107         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24108       }
24109     }
24110   }
24111
24112   return SDValue();
24113 }
24114
24115 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24116                                                 const X86Subtarget *Subtarget) {
24117   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24118   switch (IntNo) {
24119   default: return SDValue();
24120   // SSE/AVX/AVX2 blend intrinsics.
24121   case Intrinsic::x86_avx2_pblendvb:
24122   case Intrinsic::x86_avx2_pblendw:
24123   case Intrinsic::x86_avx2_pblendd_128:
24124   case Intrinsic::x86_avx2_pblendd_256:
24125     // Don't try to simplify this intrinsic if we don't have AVX2.
24126     if (!Subtarget->hasAVX2())
24127       return SDValue();
24128     // FALL-THROUGH
24129   case Intrinsic::x86_avx_blend_pd_256:
24130   case Intrinsic::x86_avx_blend_ps_256:
24131   case Intrinsic::x86_avx_blendv_pd_256:
24132   case Intrinsic::x86_avx_blendv_ps_256:
24133     // Don't try to simplify this intrinsic if we don't have AVX.
24134     if (!Subtarget->hasAVX())
24135       return SDValue();
24136     // FALL-THROUGH
24137   case Intrinsic::x86_sse41_pblendw:
24138   case Intrinsic::x86_sse41_blendpd:
24139   case Intrinsic::x86_sse41_blendps:
24140   case Intrinsic::x86_sse41_blendvps:
24141   case Intrinsic::x86_sse41_blendvpd:
24142   case Intrinsic::x86_sse41_pblendvb: {
24143     SDValue Op0 = N->getOperand(1);
24144     SDValue Op1 = N->getOperand(2);
24145     SDValue Mask = N->getOperand(3);
24146
24147     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24148     if (!Subtarget->hasSSE41())
24149       return SDValue();
24150
24151     // fold (blend A, A, Mask) -> A
24152     if (Op0 == Op1)
24153       return Op0;
24154     // fold (blend A, B, allZeros) -> A
24155     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24156       return Op0;
24157     // fold (blend A, B, allOnes) -> B
24158     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24159       return Op1;
24160
24161     // Simplify the case where the mask is a constant i32 value.
24162     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24163       if (C->isNullValue())
24164         return Op0;
24165       if (C->isAllOnesValue())
24166         return Op1;
24167     }
24168
24169     return SDValue();
24170   }
24171
24172   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24173   case Intrinsic::x86_sse2_psrai_w:
24174   case Intrinsic::x86_sse2_psrai_d:
24175   case Intrinsic::x86_avx2_psrai_w:
24176   case Intrinsic::x86_avx2_psrai_d:
24177   case Intrinsic::x86_sse2_psra_w:
24178   case Intrinsic::x86_sse2_psra_d:
24179   case Intrinsic::x86_avx2_psra_w:
24180   case Intrinsic::x86_avx2_psra_d: {
24181     SDValue Op0 = N->getOperand(1);
24182     SDValue Op1 = N->getOperand(2);
24183     EVT VT = Op0.getValueType();
24184     assert(VT.isVector() && "Expected a vector type!");
24185
24186     if (isa<BuildVectorSDNode>(Op1))
24187       Op1 = Op1.getOperand(0);
24188
24189     if (!isa<ConstantSDNode>(Op1))
24190       return SDValue();
24191
24192     EVT SVT = VT.getVectorElementType();
24193     unsigned SVTBits = SVT.getSizeInBits();
24194
24195     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24196     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24197     uint64_t ShAmt = C.getZExtValue();
24198
24199     // Don't try to convert this shift into a ISD::SRA if the shift
24200     // count is bigger than or equal to the element size.
24201     if (ShAmt >= SVTBits)
24202       return SDValue();
24203
24204     // Trivial case: if the shift count is zero, then fold this
24205     // into the first operand.
24206     if (ShAmt == 0)
24207       return Op0;
24208
24209     // Replace this packed shift intrinsic with a target independent
24210     // shift dag node.
24211     SDValue Splat = DAG.getConstant(C, VT);
24212     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24213   }
24214   }
24215 }
24216
24217 /// PerformMulCombine - Optimize a single multiply with constant into two
24218 /// in order to implement it with two cheaper instructions, e.g.
24219 /// LEA + SHL, LEA + LEA.
24220 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24221                                  TargetLowering::DAGCombinerInfo &DCI) {
24222   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24223     return SDValue();
24224
24225   EVT VT = N->getValueType(0);
24226   if (VT != MVT::i64 && VT != MVT::i32)
24227     return SDValue();
24228
24229   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24230   if (!C)
24231     return SDValue();
24232   uint64_t MulAmt = C->getZExtValue();
24233   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24234     return SDValue();
24235
24236   uint64_t MulAmt1 = 0;
24237   uint64_t MulAmt2 = 0;
24238   if ((MulAmt % 9) == 0) {
24239     MulAmt1 = 9;
24240     MulAmt2 = MulAmt / 9;
24241   } else if ((MulAmt % 5) == 0) {
24242     MulAmt1 = 5;
24243     MulAmt2 = MulAmt / 5;
24244   } else if ((MulAmt % 3) == 0) {
24245     MulAmt1 = 3;
24246     MulAmt2 = MulAmt / 3;
24247   }
24248   if (MulAmt2 &&
24249       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24250     SDLoc DL(N);
24251
24252     if (isPowerOf2_64(MulAmt2) &&
24253         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24254       // If second multiplifer is pow2, issue it first. We want the multiply by
24255       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24256       // is an add.
24257       std::swap(MulAmt1, MulAmt2);
24258
24259     SDValue NewMul;
24260     if (isPowerOf2_64(MulAmt1))
24261       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24262                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24263     else
24264       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24265                            DAG.getConstant(MulAmt1, VT));
24266
24267     if (isPowerOf2_64(MulAmt2))
24268       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24269                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24270     else
24271       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24272                            DAG.getConstant(MulAmt2, VT));
24273
24274     // Do not add new nodes to DAG combiner worklist.
24275     DCI.CombineTo(N, NewMul, false);
24276   }
24277   return SDValue();
24278 }
24279
24280 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24281   SDValue N0 = N->getOperand(0);
24282   SDValue N1 = N->getOperand(1);
24283   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24284   EVT VT = N0.getValueType();
24285
24286   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24287   // since the result of setcc_c is all zero's or all ones.
24288   if (VT.isInteger() && !VT.isVector() &&
24289       N1C && N0.getOpcode() == ISD::AND &&
24290       N0.getOperand(1).getOpcode() == ISD::Constant) {
24291     SDValue N00 = N0.getOperand(0);
24292     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24293         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24294           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24295          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24296       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24297       APInt ShAmt = N1C->getAPIntValue();
24298       Mask = Mask.shl(ShAmt);
24299       if (Mask != 0)
24300         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24301                            N00, DAG.getConstant(Mask, VT));
24302     }
24303   }
24304
24305   // Hardware support for vector shifts is sparse which makes us scalarize the
24306   // vector operations in many cases. Also, on sandybridge ADD is faster than
24307   // shl.
24308   // (shl V, 1) -> add V,V
24309   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24310     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24311       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24312       // We shift all of the values by one. In many cases we do not have
24313       // hardware support for this operation. This is better expressed as an ADD
24314       // of two values.
24315       if (N1SplatC->getZExtValue() == 1)
24316         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24317     }
24318
24319   return SDValue();
24320 }
24321
24322 /// \brief Returns a vector of 0s if the node in input is a vector logical
24323 /// shift by a constant amount which is known to be bigger than or equal
24324 /// to the vector element size in bits.
24325 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24326                                       const X86Subtarget *Subtarget) {
24327   EVT VT = N->getValueType(0);
24328
24329   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24330       (!Subtarget->hasInt256() ||
24331        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24332     return SDValue();
24333
24334   SDValue Amt = N->getOperand(1);
24335   SDLoc DL(N);
24336   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24337     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24338       APInt ShiftAmt = AmtSplat->getAPIntValue();
24339       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24340
24341       // SSE2/AVX2 logical shifts always return a vector of 0s
24342       // if the shift amount is bigger than or equal to
24343       // the element size. The constant shift amount will be
24344       // encoded as a 8-bit immediate.
24345       if (ShiftAmt.trunc(8).uge(MaxAmount))
24346         return getZeroVector(VT, Subtarget, DAG, DL);
24347     }
24348
24349   return SDValue();
24350 }
24351
24352 /// PerformShiftCombine - Combine shifts.
24353 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24354                                    TargetLowering::DAGCombinerInfo &DCI,
24355                                    const X86Subtarget *Subtarget) {
24356   if (N->getOpcode() == ISD::SHL) {
24357     SDValue V = PerformSHLCombine(N, DAG);
24358     if (V.getNode()) return V;
24359   }
24360
24361   if (N->getOpcode() != ISD::SRA) {
24362     // Try to fold this logical shift into a zero vector.
24363     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24364     if (V.getNode()) return V;
24365   }
24366
24367   return SDValue();
24368 }
24369
24370 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24371 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24372 // and friends.  Likewise for OR -> CMPNEQSS.
      //
      // The rewrite is only done when SSE2 is available, the compared values are
      // f32 or f64, and every user of N is a CopyToReg or an integer extension.
      // Returns the replacement value, or an empty SDValue when nothing applies.
24373 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24374                             TargetLowering::DAGCombinerInfo &DCI,
24375                             const X86Subtarget *Subtarget) {
        // Filled in by isAndOrOfSetCCs; not otherwise read in this function.
24376   unsigned opcode;
24377
24378   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24379   // we're requiring SSE2 for both.
24380   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24381     SDValue N0 = N->getOperand(0);
24382     SDValue N1 = N->getOperand(1);
          // Operand 1 of each setcc is the value its condition is tested on.
24383     SDValue CMP0 = N0->getOperand(1);
24384     SDValue CMP1 = N1->getOperand(1);
24385     SDLoc DL(N);
24386
24387     // The SETCCs should both refer to the same CMP.
24388     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24389       return SDValue();
24390
24391     SDValue CMP00 = CMP0->getOperand(0);
24392     SDValue CMP01 = CMP0->getOperand(1);
24393     EVT     VT    = CMP00.getValueType();
24394
24395     if (VT == MVT::f32 || VT == MVT::f64) {
24396       bool ExpectingFlags = false;
24397       // Check for any users that want flags:
            // Any user opcode not explicitly listed as harmless below is
            // conservatively treated as wanting flags.
24398       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24399            !ExpectingFlags && UI != UE; ++UI)
24400         switch (UI->getOpcode()) {
24401         default:
24402         case ISD::BR_CC:
24403         case ISD::BRCOND:
24404         case ISD::SELECT:
24405           ExpectingFlags = true;
24406           break;
24407         case ISD::CopyToReg:
24408         case ISD::SIGN_EXTEND:
24409         case ISD::ZERO_EXTEND:
24410         case ISD::ANY_EXTEND:
24411           break;
24412         }
24413
24414       if (!ExpectingFlags) {
24415         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24416         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24417
              // Canonicalise so that an E/NE condition, if present, ends up in cc0.
24418         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24419           X86::CondCode tmp = cc0;
24420           cc0 = cc1;
24421           cc1 = tmp;
24422         }
24423
              // Only the (E, NP) and (NE, P) pairings are rewritten.
24424         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24425             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24426           // FIXME: need symbolic constants for these magic numbers.
24427           // See X86ATTInstPrinter.cpp:printSSECC().
                // 0 is used for the COND_E pairing and 4 for the COND_NE pairing;
                // presumably the SSE "eq" and "neq" compare predicates
                // (TODO confirm against printSSECC).
24428           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24429           if (Subtarget->hasAVX512()) {
                  // With AVX512 the compare yields an i1 directly; widen it to
                  // N's type if that is not already i1.
24430             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24431                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24432             if (N->getValueType(0) != MVT::i1)
24433               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24434                                  FSetCC);
24435             return FSetCC;
24436           }
                // Otherwise the compare result has the FP type of the operands
                // and is converted to an integer bit below.
24437           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24438                                               CMP00.getValueType(), CMP00, CMP01,
24439                                               DAG.getConstant(x86cc, MVT::i8));
24440
24441           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24442           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24443
24444           if (is64BitFP && !Subtarget->is64Bit()) {
24445             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24446             // 64-bit integer, since that's not a legal type. Since
24447             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
24448             // bits, but can do this little dance to extract the lowest 32 bits
24449             // and work with those going forward.
24450             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24451                                            OnesOrZeroesF);
24452             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24453                                            Vector64);
24454             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24455                                         Vector32, DAG.getIntPtrConstant(0));
24456             IntVT = MVT::i32;
24457           }
24458
                // Reinterpret as an integer, keep only bit 0, and narrow to i8.
24459           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24460           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24461                                       DAG.getConstant(1, IntVT));
24462           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24463           return OneBitOfTruth;
24464         }
24465       }
24466     }
24467   }
24468   return SDValue();
24469 }
24470
24471 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24472 /// so it can be folded inside ANDNP.
24473 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24474   EVT VT = N->getValueType(0);
24475
24476   // Match direct AllOnes for 128 and 256-bit vectors
24477   if (ISD::isBuildVectorAllOnes(N))
24478     return true;
24479
24480   // Look through a bit convert.
24481   if (N->getOpcode() == ISD::BITCAST)
24482     N = N->getOperand(0).getNode();
24483
24484   // Sometimes the operand may come from a insert_subvector building a 256-bit
24485   // allones vector
24486   if (VT.is256BitVector() &&
24487       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24488     SDValue V1 = N->getOperand(0);
24489     SDValue V2 = N->getOperand(1);
24490
24491     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24492         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24493         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24494         ISD::isBuildVectorAllOnes(V2.getNode()))
24495       return true;
24496   }
24497
24498   return false;
24499 }
24500
24501 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24502 // register. In most cases we actually compare or select YMM-sized registers
24503 // and mixing the two types creates horrible code. This method optimizes
24504 // some of the transition sequences.
24505 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24506                                  TargetLowering::DAGCombinerInfo &DCI,
24507                                  const X86Subtarget *Subtarget) {
24508   EVT VT = N->getValueType(0);
24509   if (!VT.is256BitVector())
24510     return SDValue();
24511
24512   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24513           N->getOpcode() == ISD::ZERO_EXTEND ||
24514           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24515
24516   SDValue Narrow = N->getOperand(0);
24517   EVT NarrowVT = Narrow->getValueType(0);
24518   if (!NarrowVT.is128BitVector())
24519     return SDValue();
24520
24521   if (Narrow->getOpcode() != ISD::XOR &&
24522       Narrow->getOpcode() != ISD::AND &&
24523       Narrow->getOpcode() != ISD::OR)
24524     return SDValue();
24525
24526   SDValue N0  = Narrow->getOperand(0);
24527   SDValue N1  = Narrow->getOperand(1);
24528   SDLoc DL(Narrow);
24529
24530   // The Left side has to be a trunc.
24531   if (N0.getOpcode() != ISD::TRUNCATE)
24532     return SDValue();
24533
24534   // The type of the truncated inputs.
24535   EVT WideVT = N0->getOperand(0)->getValueType(0);
24536   if (WideVT != VT)
24537     return SDValue();
24538
24539   // The right side has to be a 'trunc' or a constant vector.
24540   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24541   ConstantSDNode *RHSConstSplat = nullptr;
24542   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24543     RHSConstSplat = RHSBV->getConstantSplatNode();
24544   if (!RHSTrunc && !RHSConstSplat)
24545     return SDValue();
24546
24547   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24548
24549   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24550     return SDValue();
24551
24552   // Set N0 and N1 to hold the inputs to the new wide operation.
24553   N0 = N0->getOperand(0);
24554   if (RHSConstSplat) {
24555     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24556                      SDValue(RHSConstSplat, 0));
24557     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24558     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24559   } else if (RHSTrunc) {
24560     N1 = N1->getOperand(0);
24561   }
24562
24563   // Generate the wide operation.
24564   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24565   unsigned Opcode = N->getOpcode();
24566   switch (Opcode) {
24567   case ISD::ANY_EXTEND:
24568     return Op;
24569   case ISD::ZERO_EXTEND: {
24570     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24571     APInt Mask = APInt::getAllOnesValue(InBits);
24572     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24573     return DAG.getNode(ISD::AND, DL, VT,
24574                        Op, DAG.getConstant(Mask, VT));
24575   }
24576   case ISD::SIGN_EXTEND:
24577     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24578                        Op, DAG.getValueType(NarrowVT));
24579   default:
24580     llvm_unreachable("Unexpected opcode");
24581   }
24582 }
24583
24584 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24585                                  TargetLowering::DAGCombinerInfo &DCI,
24586                                  const X86Subtarget *Subtarget) {
24587   EVT VT = N->getValueType(0);
24588   if (DCI.isBeforeLegalizeOps())
24589     return SDValue();
24590
24591   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24592   if (R.getNode())
24593     return R;
24594
24595   // Create BEXTR instructions
24596   // BEXTR is ((X >> imm) & (2**size-1))
24597   if (VT == MVT::i32 || VT == MVT::i64) {
24598     SDValue N0 = N->getOperand(0);
24599     SDValue N1 = N->getOperand(1);
24600     SDLoc DL(N);
24601
24602     // Check for BEXTR.
24603     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24604         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24605       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24606       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24607       if (MaskNode && ShiftNode) {
24608         uint64_t Mask = MaskNode->getZExtValue();
24609         uint64_t Shift = ShiftNode->getZExtValue();
24610         if (isMask_64(Mask)) {
24611           uint64_t MaskSize = CountPopulation_64(Mask);
24612           if (Shift + MaskSize <= VT.getSizeInBits())
24613             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24614                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24615         }
24616       }
24617     } // BEXTR
24618
24619     return SDValue();
24620   }
24621
24622   // Want to form ANDNP nodes:
24623   // 1) In the hopes of then easily combining them with OR and AND nodes
24624   //    to form PBLEND/PSIGN.
24625   // 2) To match ANDN packed intrinsics
24626   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24627     return SDValue();
24628
24629   SDValue N0 = N->getOperand(0);
24630   SDValue N1 = N->getOperand(1);
24631   SDLoc DL(N);
24632
24633   // Check LHS for vnot
24634   if (N0.getOpcode() == ISD::XOR &&
24635       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24636       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24637     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24638
24639   // Check RHS for vnot
24640   if (N1.getOpcode() == ISD::XOR &&
24641       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24642       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24643     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24644
24645   return SDValue();
24646 }
24647
24648 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24649                                 TargetLowering::DAGCombinerInfo &DCI,
24650                                 const X86Subtarget *Subtarget) {
24651   if (DCI.isBeforeLegalizeOps())
24652     return SDValue();
24653
24654   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24655   if (R.getNode())
24656     return R;
24657
24658   SDValue N0 = N->getOperand(0);
24659   SDValue N1 = N->getOperand(1);
24660   EVT VT = N->getValueType(0);
24661
24662   // look for psign/blend
24663   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24664     if (!Subtarget->hasSSSE3() ||
24665         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24666       return SDValue();
24667
24668     // Canonicalize pandn to RHS
24669     if (N0.getOpcode() == X86ISD::ANDNP)
24670       std::swap(N0, N1);
24671     // or (and (m, y), (pandn m, x))
24672     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24673       SDValue Mask = N1.getOperand(0);
24674       SDValue X    = N1.getOperand(1);
24675       SDValue Y;
24676       if (N0.getOperand(0) == Mask)
24677         Y = N0.getOperand(1);
24678       if (N0.getOperand(1) == Mask)
24679         Y = N0.getOperand(0);
24680
24681       // Check to see if the mask appeared in both the AND and ANDNP and
24682       if (!Y.getNode())
24683         return SDValue();
24684
24685       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24686       // Look through mask bitcast.
24687       if (Mask.getOpcode() == ISD::BITCAST)
24688         Mask = Mask.getOperand(0);
24689       if (X.getOpcode() == ISD::BITCAST)
24690         X = X.getOperand(0);
24691       if (Y.getOpcode() == ISD::BITCAST)
24692         Y = Y.getOperand(0);
24693
24694       EVT MaskVT = Mask.getValueType();
24695
24696       // Validate that the Mask operand is a vector sra node.
24697       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24698       // there is no psrai.b
24699       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24700       unsigned SraAmt = ~0;
24701       if (Mask.getOpcode() == ISD::SRA) {
24702         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24703           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24704             SraAmt = AmtConst->getZExtValue();
24705       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24706         SDValue SraC = Mask.getOperand(1);
24707         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24708       }
24709       if ((SraAmt + 1) != EltBits)
24710         return SDValue();
24711
24712       SDLoc DL(N);
24713
24714       // Now we know we at least have a plendvb with the mask val.  See if
24715       // we can form a psignb/w/d.
24716       // psign = x.type == y.type == mask.type && y = sub(0, x);
24717       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24718           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24719           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24720         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24721                "Unsupported VT for PSIGN");
24722         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24723         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24724       }
24725       // PBLENDVB only available on SSE 4.1
24726       if (!Subtarget->hasSSE41())
24727         return SDValue();
24728
24729       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24730
24731       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24732       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24733       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24734       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24735       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24736     }
24737   }
24738
24739   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24740     return SDValue();
24741
24742   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24743   MachineFunction &MF = DAG.getMachineFunction();
24744   bool OptForSize = MF.getFunction()->getAttributes().
24745     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24746
24747   // SHLD/SHRD instructions have lower register pressure, but on some
24748   // platforms they have higher latency than the equivalent
24749   // series of shifts/or that would otherwise be generated.
24750   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24751   // have higher latencies and we are not optimizing for size.
24752   if (!OptForSize && Subtarget->isSHLDSlow())
24753     return SDValue();
24754
24755   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24756     std::swap(N0, N1);
24757   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24758     return SDValue();
24759   if (!N0.hasOneUse() || !N1.hasOneUse())
24760     return SDValue();
24761
24762   SDValue ShAmt0 = N0.getOperand(1);
24763   if (ShAmt0.getValueType() != MVT::i8)
24764     return SDValue();
24765   SDValue ShAmt1 = N1.getOperand(1);
24766   if (ShAmt1.getValueType() != MVT::i8)
24767     return SDValue();
24768   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24769     ShAmt0 = ShAmt0.getOperand(0);
24770   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24771     ShAmt1 = ShAmt1.getOperand(0);
24772
24773   SDLoc DL(N);
24774   unsigned Opc = X86ISD::SHLD;
24775   SDValue Op0 = N0.getOperand(0);
24776   SDValue Op1 = N1.getOperand(0);
24777   if (ShAmt0.getOpcode() == ISD::SUB) {
24778     Opc = X86ISD::SHRD;
24779     std::swap(Op0, Op1);
24780     std::swap(ShAmt0, ShAmt1);
24781   }
24782
24783   unsigned Bits = VT.getSizeInBits();
24784   if (ShAmt1.getOpcode() == ISD::SUB) {
24785     SDValue Sum = ShAmt1.getOperand(0);
24786     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24787       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24788       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24789         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24790       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24791         return DAG.getNode(Opc, DL, VT,
24792                            Op0, Op1,
24793                            DAG.getNode(ISD::TRUNCATE, DL,
24794                                        MVT::i8, ShAmt0));
24795     }
24796   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24797     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24798     if (ShAmt0C &&
24799         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24800       return DAG.getNode(Opc, DL, VT,
24801                          N0.getOperand(0), N1.getOperand(0),
24802                          DAG.getNode(ISD::TRUNCATE, DL,
24803                                        MVT::i8, ShAmt0));
24804   }
24805
24806   return SDValue();
24807 }
24808
24809 // Generate NEG and CMOV for integer abs.
24810 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24811   EVT VT = N->getValueType(0);
24812
24813   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24814   // 8-bit integer abs to NEG and CMOV.
24815   if (VT.isInteger() && VT.getSizeInBits() == 8)
24816     return SDValue();
24817
24818   SDValue N0 = N->getOperand(0);
24819   SDValue N1 = N->getOperand(1);
24820   SDLoc DL(N);
24821
24822   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24823   // and change it to SUB and CMOV.
24824   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24825       N0.getOpcode() == ISD::ADD &&
24826       N0.getOperand(1) == N1 &&
24827       N1.getOpcode() == ISD::SRA &&
24828       N1.getOperand(0) == N0.getOperand(0))
24829     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24830       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24831         // Generate SUB & CMOV.
24832         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24833                                   DAG.getConstant(0, VT), N0.getOperand(0));
24834
24835         SDValue Ops[] = { N0.getOperand(0), Neg,
24836                           DAG.getConstant(X86::COND_GE, MVT::i8),
24837                           SDValue(Neg.getNode(), 1) };
24838         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24839       }
24840   return SDValue();
24841 }
24842
24843 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24844 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24845                                  TargetLowering::DAGCombinerInfo &DCI,
24846                                  const X86Subtarget *Subtarget) {
24847   if (DCI.isBeforeLegalizeOps())
24848     return SDValue();
24849
24850   if (Subtarget->hasCMov()) {
24851     SDValue RV = performIntegerAbsCombine(N, DAG);
24852     if (RV.getNode())
24853       return RV;
24854   }
24855
24856   return SDValue();
24857 }
24858
24859 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24860 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24861                                   TargetLowering::DAGCombinerInfo &DCI,
24862                                   const X86Subtarget *Subtarget) {
24863   LoadSDNode *Ld = cast<LoadSDNode>(N);
24864   EVT RegVT = Ld->getValueType(0);
24865   EVT MemVT = Ld->getMemoryVT();
24866   SDLoc dl(Ld);
24867   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24868
24869   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24870   // into two 16-byte operations.
24871   ISD::LoadExtType Ext = Ld->getExtensionType();
24872   unsigned Alignment = Ld->getAlignment();
24873   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24874   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24875       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24876     unsigned NumElems = RegVT.getVectorNumElements();
24877     if (NumElems < 2)
24878       return SDValue();
24879
24880     SDValue Ptr = Ld->getBasePtr();
24881     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24882
24883     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24884                                   NumElems/2);
24885     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24886                                 Ld->getPointerInfo(), Ld->isVolatile(),
24887                                 Ld->isNonTemporal(), Ld->isInvariant(),
24888                                 Alignment);
24889     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24890     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24891                                 Ld->getPointerInfo(), Ld->isVolatile(),
24892                                 Ld->isNonTemporal(), Ld->isInvariant(),
24893                                 std::min(16U, Alignment));
24894     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24895                              Load1.getValue(1),
24896                              Load2.getValue(1));
24897
24898     SDValue NewVec = DAG.getUNDEF(RegVT);
24899     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24900     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24901     return DCI.CombineTo(N, NewVec, TF, true);
24902   }
24903
24904   return SDValue();
24905 }
24906
24907 /// PerformMLOADCombine - Resolve extending loads
24908 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24909                                    TargetLowering::DAGCombinerInfo &DCI,
24910                                    const X86Subtarget *Subtarget) {
24911   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24912   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24913     return SDValue();
24914
24915   EVT VT = Mld->getValueType(0);
24916   unsigned NumElems = VT.getVectorNumElements();
24917   EVT LdVT = Mld->getMemoryVT();
24918   SDLoc dl(Mld);
24919
24920   assert(LdVT != VT && "Cannot extend to the same type");
24921   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24922   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24923   // From, To sizes and ElemCount must be pow of two
24924   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24925     "Unexpected size for extending masked load");
24926
24927   unsigned SizeRatio  = ToSz / FromSz;
24928   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24929
24930   // Create a type on which we perform the shuffle
24931   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24932           LdVT.getScalarType(), NumElems*SizeRatio);
24933   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24934
24935   // Convert Src0 value
24936   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24937   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24938     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24939     for (unsigned i = 0; i != NumElems; ++i)
24940       ShuffleVec[i] = i * SizeRatio;
24941
24942     // Can't shuffle using an illegal type.
24943     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24944             && "WideVecVT should be legal");
24945     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24946                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24947   }
24948   // Prepare the new mask
24949   SDValue NewMask;
24950   SDValue Mask = Mld->getMask();
24951   if (Mask.getValueType() == VT) {
24952     // Mask and original value have the same type
24953     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24954     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24955     for (unsigned i = 0; i != NumElems; ++i)
24956       ShuffleVec[i] = i * SizeRatio;
24957     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24958       ShuffleVec[i] = NumElems*SizeRatio;
24959     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24960                                    DAG.getConstant(0, WideVecVT),
24961                                    &ShuffleVec[0]);
24962   }
24963   else {
24964     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24965     unsigned WidenNumElts = NumElems*SizeRatio;
24966     unsigned MaskNumElts = VT.getVectorNumElements();
24967     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24968                                      WidenNumElts);
24969
24970     unsigned NumConcat = WidenNumElts / MaskNumElts;
24971     SmallVector<SDValue, 16> Ops(NumConcat);
24972     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24973     Ops[0] = Mask;
24974     for (unsigned i = 1; i != NumConcat; ++i)
24975       Ops[i] = ZeroVal;
24976
24977     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24978   }
24979
24980   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24981                                      Mld->getBasePtr(), NewMask, WideSrc0,
24982                                      Mld->getMemoryVT(), Mld->getMemOperand(),
24983                                      ISD::NON_EXTLOAD);
24984   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24985   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24986
24987 }
24988 /// PerformMSTORECombine - Resolve truncating stores
24989 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24990                                     const X86Subtarget *Subtarget) {
24991   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24992   if (!Mst->isTruncatingStore())
24993     return SDValue();
24994
24995   EVT VT = Mst->getValue().getValueType();
24996   unsigned NumElems = VT.getVectorNumElements();
24997   EVT StVT = Mst->getMemoryVT();
24998   SDLoc dl(Mst);
24999
25000   assert(StVT != VT && "Cannot truncate to the same type");
25001   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25002   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25003
25004   // From, To sizes and ElemCount must be pow of two
25005   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25006     "Unexpected size for truncating masked store");
25007   // We are going to use the original vector elt for storing.
25008   // Accumulated smaller vector elements must be a multiple of the store size.
25009   assert (((NumElems * FromSz) % ToSz) == 0 &&
25010           "Unexpected ratio for truncating masked store");
25011
25012   unsigned SizeRatio  = FromSz / ToSz;
25013   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25014
25015   // Create a type on which we perform the shuffle
25016   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25017           StVT.getScalarType(), NumElems*SizeRatio);
25018
25019   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25020
25021   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25022   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25023   for (unsigned i = 0; i != NumElems; ++i)
25024     ShuffleVec[i] = i * SizeRatio;
25025
25026   // Can't shuffle using an illegal type.
25027   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25028           && "WideVecVT should be legal");
25029
25030   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25031                                         DAG.getUNDEF(WideVecVT),
25032                                         &ShuffleVec[0]);
25033
25034   SDValue NewMask;
25035   SDValue Mask = Mst->getMask();
25036   if (Mask.getValueType() == VT) {
25037     // Mask and original value have the same type
25038     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25039     for (unsigned i = 0; i != NumElems; ++i)
25040       ShuffleVec[i] = i * SizeRatio;
25041     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25042       ShuffleVec[i] = NumElems*SizeRatio;
25043     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25044                                    DAG.getConstant(0, WideVecVT),
25045                                    &ShuffleVec[0]);
25046   }
25047   else {
25048     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25049     unsigned WidenNumElts = NumElems*SizeRatio;
25050     unsigned MaskNumElts = VT.getVectorNumElements();
25051     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25052                                      WidenNumElts);
25053
25054     unsigned NumConcat = WidenNumElts / MaskNumElts;
25055     SmallVector<SDValue, 16> Ops(NumConcat);
25056     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25057     Ops[0] = Mask;
25058     for (unsigned i = 1; i != NumConcat; ++i)
25059       Ops[i] = ZeroVal;
25060
25061     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25062   }
25063
25064   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25065                             NewMask, StVT, Mst->getMemOperand(), false);
25066 }
25067 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25068 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25069                                    const X86Subtarget *Subtarget) {
25070   StoreSDNode *St = cast<StoreSDNode>(N);
25071   EVT VT = St->getValue().getValueType();
25072   EVT StVT = St->getMemoryVT();
25073   SDLoc dl(St);
25074   SDValue StoredVal = St->getOperand(1);
25075   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25076
25077   // If we are saving a concatenation of two XMM registers and 32-byte stores
25078   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25079   unsigned Alignment = St->getAlignment();
25080   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25081   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25082       StVT == VT && !IsAligned) {
25083     unsigned NumElems = VT.getVectorNumElements();
25084     if (NumElems < 2)
25085       return SDValue();
25086
25087     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25088     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25089
25090     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25091     SDValue Ptr0 = St->getBasePtr();
25092     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25093
25094     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25095                                 St->getPointerInfo(), St->isVolatile(),
25096                                 St->isNonTemporal(), Alignment);
25097     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25098                                 St->getPointerInfo(), St->isVolatile(),
25099                                 St->isNonTemporal(),
25100                                 std::min(16U, Alignment));
25101     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25102   }
25103
25104   // Optimize trunc store (of multiple scalars) to shuffle and store.
25105   // First, pack all of the elements in one place. Next, store to memory
25106   // in fewer chunks.
25107   if (St->isTruncatingStore() && VT.isVector()) {
25108     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25109     unsigned NumElems = VT.getVectorNumElements();
25110     assert(StVT != VT && "Cannot truncate to the same type");
25111     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25112     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25113
25114     // From, To sizes and ElemCount must be pow of two
25115     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25116     // We are going to use the original vector elt for storing.
25117     // Accumulated smaller vector elements must be a multiple of the store size.
25118     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25119
25120     unsigned SizeRatio  = FromSz / ToSz;
25121
25122     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25123
25124     // Create a type on which we perform the shuffle
25125     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25126             StVT.getScalarType(), NumElems*SizeRatio);
25127
25128     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25129
25130     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25131     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25132     for (unsigned i = 0; i != NumElems; ++i)
25133       ShuffleVec[i] = i * SizeRatio;
25134
25135     // Can't shuffle using an illegal type.
25136     if (!TLI.isTypeLegal(WideVecVT))
25137       return SDValue();
25138
25139     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25140                                          DAG.getUNDEF(WideVecVT),
25141                                          &ShuffleVec[0]);
25142     // At this point all of the data is stored at the bottom of the
25143     // register. We now need to save it to mem.
25144
25145     // Find the largest store unit
25146     MVT StoreType = MVT::i8;
25147     for (MVT Tp : MVT::integer_valuetypes()) {
25148       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25149         StoreType = Tp;
25150     }
25151
25152     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25153     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25154         (64 <= NumElems * ToSz))
25155       StoreType = MVT::f64;
25156
25157     // Bitcast the original vector into a vector of store-size units
25158     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25159             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25160     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25161     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25162     SmallVector<SDValue, 8> Chains;
25163     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25164                                         TLI.getPointerTy());
25165     SDValue Ptr = St->getBasePtr();
25166
25167     // Perform one or more big stores into memory.
25168     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25169       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25170                                    StoreType, ShuffWide,
25171                                    DAG.getIntPtrConstant(i));
25172       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25173                                 St->getPointerInfo(), St->isVolatile(),
25174                                 St->isNonTemporal(), St->getAlignment());
25175       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25176       Chains.push_back(Ch);
25177     }
25178
25179     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25180   }
25181
25182   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25183   // the FP state in cases where an emms may be missing.
25184   // A preferable solution to the general problem is to figure out the right
25185   // places to insert EMMS.  This qualifies as a quick hack.
25186
25187   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25188   if (VT.getSizeInBits() != 64)
25189     return SDValue();
25190
25191   const Function *F = DAG.getMachineFunction().getFunction();
25192   bool NoImplicitFloatOps = F->getAttributes().
25193     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25194   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25195                      && Subtarget->hasSSE2();
25196   if ((VT.isVector() ||
25197        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25198       isa<LoadSDNode>(St->getValue()) &&
25199       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25200       St->getChain().hasOneUse() && !St->isVolatile()) {
25201     SDNode* LdVal = St->getValue().getNode();
25202     LoadSDNode *Ld = nullptr;
25203     int TokenFactorIndex = -1;
25204     SmallVector<SDValue, 8> Ops;
25205     SDNode* ChainVal = St->getChain().getNode();
25206     // Must be a store of a load.  We currently handle two cases:  the load
25207     // is a direct child, and it's under an intervening TokenFactor.  It is
25208     // possible to dig deeper under nested TokenFactors.
25209     if (ChainVal == LdVal)
25210       Ld = cast<LoadSDNode>(St->getChain());
25211     else if (St->getValue().hasOneUse() &&
25212              ChainVal->getOpcode() == ISD::TokenFactor) {
25213       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25214         if (ChainVal->getOperand(i).getNode() == LdVal) {
25215           TokenFactorIndex = i;
25216           Ld = cast<LoadSDNode>(St->getValue());
25217         } else
25218           Ops.push_back(ChainVal->getOperand(i));
25219       }
25220     }
25221
25222     if (!Ld || !ISD::isNormalLoad(Ld))
25223       return SDValue();
25224
25225     // If this is not the MMX case, i.e. we are just turning i64 load/store
25226     // into f64 load/store, avoid the transformation if there are multiple
25227     // uses of the loaded value.
25228     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25229       return SDValue();
25230
25231     SDLoc LdDL(Ld);
25232     SDLoc StDL(N);
25233     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25234     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25235     // pair instead.
25236     if (Subtarget->is64Bit() || F64IsLegal) {
25237       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25238       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25239                                   Ld->getPointerInfo(), Ld->isVolatile(),
25240                                   Ld->isNonTemporal(), Ld->isInvariant(),
25241                                   Ld->getAlignment());
25242       SDValue NewChain = NewLd.getValue(1);
25243       if (TokenFactorIndex != -1) {
25244         Ops.push_back(NewChain);
25245         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25246       }
25247       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25248                           St->getPointerInfo(),
25249                           St->isVolatile(), St->isNonTemporal(),
25250                           St->getAlignment());
25251     }
25252
25253     // Otherwise, lower to two pairs of 32-bit loads / stores.
25254     SDValue LoAddr = Ld->getBasePtr();
25255     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25256                                  DAG.getConstant(4, MVT::i32));
25257
25258     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25259                                Ld->getPointerInfo(),
25260                                Ld->isVolatile(), Ld->isNonTemporal(),
25261                                Ld->isInvariant(), Ld->getAlignment());
25262     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25263                                Ld->getPointerInfo().getWithOffset(4),
25264                                Ld->isVolatile(), Ld->isNonTemporal(),
25265                                Ld->isInvariant(),
25266                                MinAlign(Ld->getAlignment(), 4));
25267
25268     SDValue NewChain = LoLd.getValue(1);
25269     if (TokenFactorIndex != -1) {
25270       Ops.push_back(LoLd);
25271       Ops.push_back(HiLd);
25272       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25273     }
25274
25275     LoAddr = St->getBasePtr();
25276     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25277                          DAG.getConstant(4, MVT::i32));
25278
25279     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25280                                 St->getPointerInfo(),
25281                                 St->isVolatile(), St->isNonTemporal(),
25282                                 St->getAlignment());
25283     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25284                                 St->getPointerInfo().getWithOffset(4),
25285                                 St->isVolatile(),
25286                                 St->isNonTemporal(),
25287                                 MinAlign(St->getAlignment(), 4));
25288     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25289   }
25290   return SDValue();
25291 }
25292
25293 /// Return 'true' if this vector operation is "horizontal"
25294 /// and return the operands for the horizontal operation in LHS and RHS.  A
25295 /// horizontal operation performs the binary operation on successive elements
25296 /// of its first operand, then on successive elements of its second operand,
25297 /// returning the resulting values in a vector.  For example, if
25298 ///   A = < float a0, float a1, float a2, float a3 >
25299 /// and
25300 ///   B = < float b0, float b1, float b2, float b3 >
25301 /// then the result of doing a horizontal operation on A and B is
25302 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25303 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25304 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25305 /// set to A, RHS to B, and the routine returns 'true'.
25306 /// Note that the binary operation should have the property that if one of the
25307 /// operands is UNDEF then the result is UNDEF.
25308 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25309   // Look for the following pattern: if
25310   //   A = < float a0, float a1, float a2, float a3 >
25311   //   B = < float b0, float b1, float b2, float b3 >
25312   // and
25313   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25314   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25315   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25316   // which is A horizontal-op B.
25317
25318   // At least one of the operands should be a vector shuffle.
25319   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25320       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25321     return false;
25322
25323   MVT VT = LHS.getSimpleValueType();
25324
25325   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25326          "Unsupported vector type for horizontal add/sub");
25327
25328   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25329   // operate independently on 128-bit lanes.
25330   unsigned NumElts = VT.getVectorNumElements();
25331   unsigned NumLanes = VT.getSizeInBits()/128;
25332   unsigned NumLaneElts = NumElts / NumLanes;
25333   assert((NumLaneElts % 2 == 0) &&
25334          "Vector type should have an even number of elements in each lane");
25335   unsigned HalfLaneElts = NumLaneElts/2;
25336
25337   // View LHS in the form
25338   //   LHS = VECTOR_SHUFFLE A, B, LMask
25339   // If LHS is not a shuffle then pretend it is the shuffle
25340   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25341   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25342   // type VT.
25343   SDValue A, B;
25344   SmallVector<int, 16> LMask(NumElts);
25345   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25346     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25347       A = LHS.getOperand(0);
25348     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25349       B = LHS.getOperand(1);
25350     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25351     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25352   } else {
25353     if (LHS.getOpcode() != ISD::UNDEF)
25354       A = LHS;
25355     for (unsigned i = 0; i != NumElts; ++i)
25356       LMask[i] = i;
25357   }
25358
25359   // Likewise, view RHS in the form
25360   //   RHS = VECTOR_SHUFFLE C, D, RMask
25361   SDValue C, D;
25362   SmallVector<int, 16> RMask(NumElts);
25363   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25364     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25365       C = RHS.getOperand(0);
25366     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25367       D = RHS.getOperand(1);
25368     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25369     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25370   } else {
25371     if (RHS.getOpcode() != ISD::UNDEF)
25372       C = RHS;
25373     for (unsigned i = 0; i != NumElts; ++i)
25374       RMask[i] = i;
25375   }
25376
25377   // Check that the shuffles are both shuffling the same vectors.
25378   if (!(A == C && B == D) && !(A == D && B == C))
25379     return false;
25380
25381   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25382   if (!A.getNode() && !B.getNode())
25383     return false;
25384
25385   // If A and B occur in reverse order in RHS, then "swap" them (which means
25386   // rewriting the mask).
25387   if (A != C)
25388     CommuteVectorShuffleMask(RMask, NumElts);
25389
25390   // At this point LHS and RHS are equivalent to
25391   //   LHS = VECTOR_SHUFFLE A, B, LMask
25392   //   RHS = VECTOR_SHUFFLE A, B, RMask
25393   // Check that the masks correspond to performing a horizontal operation.
25394   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25395     for (unsigned i = 0; i != NumLaneElts; ++i) {
25396       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25397
25398       // Ignore any UNDEF components.
25399       if (LIdx < 0 || RIdx < 0 ||
25400           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25401           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25402         continue;
25403
25404       // Check that successive elements are being operated on.  If not, this is
25405       // not a horizontal operation.
25406       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25407       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25408       if (!(LIdx == Index && RIdx == Index + 1) &&
25409           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25410         return false;
25411     }
25412   }
25413
25414   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25415   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25416   return true;
25417 }
25418
25419 /// Do target-specific dag combines on floating point adds.
25420 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25421                                   const X86Subtarget *Subtarget) {
25422   EVT VT = N->getValueType(0);
25423   SDValue LHS = N->getOperand(0);
25424   SDValue RHS = N->getOperand(1);
25425
25426   // Try to synthesize horizontal adds from adds of shuffles.
25427   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25428        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25429       isHorizontalBinOp(LHS, RHS, true))
25430     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25431   return SDValue();
25432 }
25433
25434 /// Do target-specific dag combines on floating point subs.
25435 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25436                                   const X86Subtarget *Subtarget) {
25437   EVT VT = N->getValueType(0);
25438   SDValue LHS = N->getOperand(0);
25439   SDValue RHS = N->getOperand(1);
25440
25441   // Try to synthesize horizontal subs from subs of shuffles.
25442   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25443        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25444       isHorizontalBinOp(LHS, RHS, false))
25445     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25446   return SDValue();
25447 }
25448
25449 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25450 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25451   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25452   // F[X]OR(0.0, x) -> x
25453   // F[X]OR(x, 0.0) -> x
25454   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25455     if (C->getValueAPF().isPosZero())
25456       return N->getOperand(1);
25457   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25458     if (C->getValueAPF().isPosZero())
25459       return N->getOperand(0);
25460   return SDValue();
25461 }
25462
25463 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25464 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25465   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25466
25467   // Only perform optimizations if UnsafeMath is used.
25468   if (!DAG.getTarget().Options.UnsafeFPMath)
25469     return SDValue();
25470
25471   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25472   // into FMINC and FMAXC, which are Commutative operations.
25473   unsigned NewOp = 0;
25474   switch (N->getOpcode()) {
25475     default: llvm_unreachable("unknown opcode");
25476     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25477     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25478   }
25479
25480   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25481                      N->getOperand(0), N->getOperand(1));
25482 }
25483
25484 /// Do target-specific dag combines on X86ISD::FAND nodes.
25485 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25486   // FAND(0.0, x) -> 0.0
25487   // FAND(x, 0.0) -> 0.0
25488   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25489     if (C->getValueAPF().isPosZero())
25490       return N->getOperand(0);
25491   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25492     if (C->getValueAPF().isPosZero())
25493       return N->getOperand(1);
25494   return SDValue();
25495 }
25496
25497 /// Do target-specific dag combines on X86ISD::FANDN nodes
25498 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25499   // FANDN(x, 0.0) -> 0.0
25500   // FANDN(0.0, x) -> x
25501   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25502     if (C->getValueAPF().isPosZero())
25503       return N->getOperand(1);
25504   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25505     if (C->getValueAPF().isPosZero())
25506       return N->getOperand(1);
25507   return SDValue();
25508 }
25509
25510 static SDValue PerformBTCombine(SDNode *N,
25511                                 SelectionDAG &DAG,
25512                                 TargetLowering::DAGCombinerInfo &DCI) {
25513   // BT ignores high bits in the bit index operand.
25514   SDValue Op1 = N->getOperand(1);
25515   if (Op1.hasOneUse()) {
25516     unsigned BitWidth = Op1.getValueSizeInBits();
25517     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25518     APInt KnownZero, KnownOne;
25519     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25520                                           !DCI.isBeforeLegalizeOps());
25521     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25522     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25523         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25524       DCI.CommitTargetLoweringOpt(TLO);
25525   }
25526   return SDValue();
25527 }
25528
25529 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25530   SDValue Op = N->getOperand(0);
25531   if (Op.getOpcode() == ISD::BITCAST)
25532     Op = Op.getOperand(0);
25533   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25534   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25535       VT.getVectorElementType().getSizeInBits() ==
25536       OpVT.getVectorElementType().getSizeInBits()) {
25537     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25538   }
25539   return SDValue();
25540 }
25541
25542 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25543                                                const X86Subtarget *Subtarget) {
25544   EVT VT = N->getValueType(0);
25545   if (!VT.isVector())
25546     return SDValue();
25547
25548   SDValue N0 = N->getOperand(0);
25549   SDValue N1 = N->getOperand(1);
25550   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25551   SDLoc dl(N);
25552
25553   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25554   // both SSE and AVX2 since there is no sign-extended shift right
25555   // operation on a vector with 64-bit elements.
25556   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25557   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25558   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25559       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25560     SDValue N00 = N0.getOperand(0);
25561
25562     // EXTLOAD has a better solution on AVX2,
25563     // it may be replaced with X86ISD::VSEXT node.
25564     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25565       if (!ISD::isNormalLoad(N00.getNode()))
25566         return SDValue();
25567
25568     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25569         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25570                                   N00, N1);
25571       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25572     }
25573   }
25574   return SDValue();
25575 }
25576
25577 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25578                                   TargetLowering::DAGCombinerInfo &DCI,
25579                                   const X86Subtarget *Subtarget) {
25580   SDValue N0 = N->getOperand(0);
25581   EVT VT = N->getValueType(0);
25582
25583   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25584   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25585   // This exposes the sext to the sdivrem lowering, so that it directly extends
25586   // from AH (which we otherwise need to do contortions to access).
25587   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25588       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25589     SDLoc dl(N);
25590     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25591     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25592                             N0.getOperand(0), N0.getOperand(1));
25593     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25594     return R.getValue(1);
25595   }
25596
25597   if (!DCI.isBeforeLegalizeOps())
25598     return SDValue();
25599
25600   if (!Subtarget->hasFp256())
25601     return SDValue();
25602
25603   if (VT.isVector() && VT.getSizeInBits() == 256) {
25604     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25605     if (R.getNode())
25606       return R;
25607   }
25608
25609   return SDValue();
25610 }
25611
25612 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25613                                  const X86Subtarget* Subtarget) {
25614   SDLoc dl(N);
25615   EVT VT = N->getValueType(0);
25616
25617   // Let legalize expand this if it isn't a legal type yet.
25618   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25619     return SDValue();
25620
25621   EVT ScalarVT = VT.getScalarType();
25622   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25623       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25624     return SDValue();
25625
25626   SDValue A = N->getOperand(0);
25627   SDValue B = N->getOperand(1);
25628   SDValue C = N->getOperand(2);
25629
25630   bool NegA = (A.getOpcode() == ISD::FNEG);
25631   bool NegB = (B.getOpcode() == ISD::FNEG);
25632   bool NegC = (C.getOpcode() == ISD::FNEG);
25633
25634   // Negative multiplication when NegA xor NegB
25635   bool NegMul = (NegA != NegB);
25636   if (NegA)
25637     A = A.getOperand(0);
25638   if (NegB)
25639     B = B.getOperand(0);
25640   if (NegC)
25641     C = C.getOperand(0);
25642
25643   unsigned Opcode;
25644   if (!NegMul)
25645     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25646   else
25647     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25648
25649   return DAG.getNode(Opcode, dl, VT, A, B, C);
25650 }
25651
25652 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25653                                   TargetLowering::DAGCombinerInfo &DCI,
25654                                   const X86Subtarget *Subtarget) {
25655   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25656   //           (and (i32 x86isd::setcc_carry), 1)
25657   // This eliminates the zext. This transformation is necessary because
25658   // ISD::SETCC is always legalized to i8.
25659   SDLoc dl(N);
25660   SDValue N0 = N->getOperand(0);
25661   EVT VT = N->getValueType(0);
25662
25663   if (N0.getOpcode() == ISD::AND &&
25664       N0.hasOneUse() &&
25665       N0.getOperand(0).hasOneUse()) {
25666     SDValue N00 = N0.getOperand(0);
25667     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25668       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25669       if (!C || C->getZExtValue() != 1)
25670         return SDValue();
25671       return DAG.getNode(ISD::AND, dl, VT,
25672                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25673                                      N00.getOperand(0), N00.getOperand(1)),
25674                          DAG.getConstant(1, VT));
25675     }
25676   }
25677
25678   if (N0.getOpcode() == ISD::TRUNCATE &&
25679       N0.hasOneUse() &&
25680       N0.getOperand(0).hasOneUse()) {
25681     SDValue N00 = N0.getOperand(0);
25682     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25683       return DAG.getNode(ISD::AND, dl, VT,
25684                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25685                                      N00.getOperand(0), N00.getOperand(1)),
25686                          DAG.getConstant(1, VT));
25687     }
25688   }
25689   if (VT.is256BitVector()) {
25690     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25691     if (R.getNode())
25692       return R;
25693   }
25694
25695   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25696   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25697   // This exposes the zext to the udivrem lowering, so that it directly extends
25698   // from AH (which we otherwise need to do contortions to access).
25699   if (N0.getOpcode() == ISD::UDIVREM &&
25700       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25701       (VT == MVT::i32 || VT == MVT::i64)) {
25702     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25703     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25704                             N0.getOperand(0), N0.getOperand(1));
25705     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25706     return R.getValue(1);
25707   }
25708
25709   return SDValue();
25710 }
25711
25712 // Optimize x == -y --> x+y == 0
25713 //          x != -y --> x+y != 0
25714 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25715                                       const X86Subtarget* Subtarget) {
25716   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25717   SDValue LHS = N->getOperand(0);
25718   SDValue RHS = N->getOperand(1);
25719   EVT VT = N->getValueType(0);
25720   SDLoc DL(N);
25721
25722   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25723     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25724       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25725         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25726                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25727         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25728                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25729       }
25730   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25731     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25732       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25733         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25734                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25735         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25736                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25737       }
25738
25739   if (VT.getScalarType() == MVT::i1) {
25740     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25741       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25742     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25743     if (!IsSEXT0 && !IsVZero0)
25744       return SDValue();
25745     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25746       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25747     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25748
25749     if (!IsSEXT1 && !IsVZero1)
25750       return SDValue();
25751
25752     if (IsSEXT0 && IsVZero1) {
25753       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25754       if (CC == ISD::SETEQ)
25755         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25756       return LHS.getOperand(0);
25757     }
25758     if (IsSEXT1 && IsVZero0) {
25759       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25760       if (CC == ISD::SETEQ)
25761         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25762       return RHS.getOperand(0);
25763     }
25764   }
25765
25766   return SDValue();
25767 }
25768
25769 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25770                                       const X86Subtarget *Subtarget) {
25771   SDLoc dl(N);
25772   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25773   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25774          "X86insertps is only defined for v4x32");
25775
25776   SDValue Ld = N->getOperand(1);
25777   if (MayFoldLoad(Ld)) {
25778     // Extract the countS bits from the immediate so we can get the proper
25779     // address when narrowing the vector load to a specific element.
25780     // When the second source op is a memory address, interps doesn't use
25781     // countS and just gets an f32 from that address.
25782     unsigned DestIndex =
25783         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25784     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25785   } else
25786     return SDValue();
25787
25788   // Create this as a scalar to vector to match the instruction pattern.
25789   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25790   // countS bits are ignored when loading from memory on insertps, which
25791   // means we don't need to explicitly set them to 0.
25792   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25793                      LoadScalarToVector, N->getOperand(2));
25794 }
25795
25796 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25797 // as "sbb reg,reg", since it can be extended without zext and produces
25798 // an all-ones bit which is more useful than 0/1 in some cases.
25799 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25800                                MVT VT) {
25801   if (VT == MVT::i8)
25802     return DAG.getNode(ISD::AND, DL, VT,
25803                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25804                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25805                        DAG.getConstant(1, VT));
25806   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25807   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25808                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25809                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25810 }
25811
25812 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25813 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25814                                    TargetLowering::DAGCombinerInfo &DCI,
25815                                    const X86Subtarget *Subtarget) {
25816   SDLoc DL(N);
25817   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25818   SDValue EFLAGS = N->getOperand(1);
25819
25820   if (CC == X86::COND_A) {
25821     // Try to convert COND_A into COND_B in an attempt to facilitate
25822     // materializing "setb reg".
25823     //
25824     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25825     // cannot take an immediate as its first operand.
25826     //
25827     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25828         EFLAGS.getValueType().isInteger() &&
25829         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25830       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25831                                    EFLAGS.getNode()->getVTList(),
25832                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25833       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25834       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25835     }
25836   }
25837
25838   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25839   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25840   // cases.
25841   if (CC == X86::COND_B)
25842     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25843
25844   SDValue Flags;
25845
25846   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25847   if (Flags.getNode()) {
25848     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25849     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25850   }
25851
25852   return SDValue();
25853 }
25854
25855 // Optimize branch condition evaluation.
25856 //
25857 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25858                                     TargetLowering::DAGCombinerInfo &DCI,
25859                                     const X86Subtarget *Subtarget) {
25860   SDLoc DL(N);
25861   SDValue Chain = N->getOperand(0);
25862   SDValue Dest = N->getOperand(1);
25863   SDValue EFLAGS = N->getOperand(3);
25864   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25865
25866   SDValue Flags;
25867
25868   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25869   if (Flags.getNode()) {
25870     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25871     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25872                        Flags);
25873   }
25874
25875   return SDValue();
25876 }
25877
25878 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25879                                                          SelectionDAG &DAG) {
25880   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25881   // optimize away operation when it's from a constant.
25882   //
25883   // The general transformation is:
25884   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25885   //       AND(VECTOR_CMP(x,y), constant2)
25886   //    constant2 = UNARYOP(constant)
25887
25888   // Early exit if this isn't a vector operation, the operand of the
25889   // unary operation isn't a bitwise AND, or if the sizes of the operations
25890   // aren't the same.
25891   EVT VT = N->getValueType(0);
25892   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25893       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25894       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25895     return SDValue();
25896
25897   // Now check that the other operand of the AND is a constant. We could
25898   // make the transformation for non-constant splats as well, but it's unclear
25899   // that would be a benefit as it would not eliminate any operations, just
25900   // perform one more step in scalar code before moving to the vector unit.
25901   if (BuildVectorSDNode *BV =
25902           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25903     // Bail out if the vector isn't a constant.
25904     if (!BV->isConstant())
25905       return SDValue();
25906
25907     // Everything checks out. Build up the new and improved node.
25908     SDLoc DL(N);
25909     EVT IntVT = BV->getValueType(0);
25910     // Create a new constant of the appropriate type for the transformed
25911     // DAG.
25912     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25913     // The AND node needs bitcasts to/from an integer vector type around it.
25914     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25915     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25916                                  N->getOperand(0)->getOperand(0), MaskConst);
25917     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25918     return Res;
25919   }
25920
25921   return SDValue();
25922 }
25923
25924 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25925                                         const X86Subtarget *Subtarget) {
25926   // First try to optimize away the conversion entirely when it's
25927   // conditionally from a constant. Vectors only.
25928   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25929   if (Res != SDValue())
25930     return Res;
25931
25932   // Now move on to more general possibilities.
25933   SDValue Op0 = N->getOperand(0);
25934   EVT InVT = Op0->getValueType(0);
25935
25936   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25937   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25938     SDLoc dl(N);
25939     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25940     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25941     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25942   }
25943
25944   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25945   // a 32-bit target where SSE doesn't support i64->FP operations.
25946   if (Op0.getOpcode() == ISD::LOAD) {
25947     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25948     EVT VT = Ld->getValueType(0);
25949     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25950         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25951         !Subtarget->is64Bit() && VT == MVT::i64) {
25952       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25953           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25954       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25955       return FILDChain;
25956     }
25957   }
25958   return SDValue();
25959 }
25960
25961 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25962 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25963                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25964   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25965   // the result is either zero or one (depending on the input carry bit).
25966   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25967   if (X86::isZeroNode(N->getOperand(0)) &&
25968       X86::isZeroNode(N->getOperand(1)) &&
25969       // We don't have a good way to replace an EFLAGS use, so only do this when
25970       // dead right now.
25971       SDValue(N, 1).use_empty()) {
25972     SDLoc DL(N);
25973     EVT VT = N->getValueType(0);
25974     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25975     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25976                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25977                                            DAG.getConstant(X86::COND_B,MVT::i8),
25978                                            N->getOperand(2)),
25979                                DAG.getConstant(1, VT));
25980     return DCI.CombineTo(N, Res1, CarryOut);
25981   }
25982
25983   return SDValue();
25984 }
25985
25986 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25987 //      (add Y, (setne X, 0)) -> sbb -1, Y
25988 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25989 //      (sub (setne X, 0), Y) -> adc -1, Y
25990 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25991   SDLoc DL(N);
25992
25993   // Look through ZExts.
25994   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25995   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25996     return SDValue();
25997
25998   SDValue SetCC = Ext.getOperand(0);
25999   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26000     return SDValue();
26001
26002   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26003   if (CC != X86::COND_E && CC != X86::COND_NE)
26004     return SDValue();
26005
26006   SDValue Cmp = SetCC.getOperand(1);
26007   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26008       !X86::isZeroNode(Cmp.getOperand(1)) ||
26009       !Cmp.getOperand(0).getValueType().isInteger())
26010     return SDValue();
26011
26012   SDValue CmpOp0 = Cmp.getOperand(0);
26013   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26014                                DAG.getConstant(1, CmpOp0.getValueType()));
26015
26016   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26017   if (CC == X86::COND_NE)
26018     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26019                        DL, OtherVal.getValueType(), OtherVal,
26020                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26021   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26022                      DL, OtherVal.getValueType(), OtherVal,
26023                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26024 }
26025
26026 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26027 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26028                                  const X86Subtarget *Subtarget) {
26029   EVT VT = N->getValueType(0);
26030   SDValue Op0 = N->getOperand(0);
26031   SDValue Op1 = N->getOperand(1);
26032
26033   // Try to synthesize horizontal adds from adds of shuffles.
26034   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26035        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26036       isHorizontalBinOp(Op0, Op1, true))
26037     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26038
26039   return OptimizeConditionalInDecrement(N, DAG);
26040 }
26041
26042 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26043                                  const X86Subtarget *Subtarget) {
26044   SDValue Op0 = N->getOperand(0);
26045   SDValue Op1 = N->getOperand(1);
26046
26047   // X86 can't encode an immediate LHS of a sub. See if we can push the
26048   // negation into a preceding instruction.
26049   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26050     // If the RHS of the sub is a XOR with one use and a constant, invert the
26051     // immediate. Then add one to the LHS of the sub so we can turn
26052     // X-Y -> X+~Y+1, saving one register.
26053     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26054         isa<ConstantSDNode>(Op1.getOperand(1))) {
26055       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26056       EVT VT = Op0.getValueType();
26057       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26058                                    Op1.getOperand(0),
26059                                    DAG.getConstant(~XorC, VT));
26060       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26061                          DAG.getConstant(C->getAPIntValue()+1, VT));
26062     }
26063   }
26064
26065   // Try to synthesize horizontal adds from adds of shuffles.
26066   EVT VT = N->getValueType(0);
26067   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26068        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26069       isHorizontalBinOp(Op0, Op1, true))
26070     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26071
26072   return OptimizeConditionalInDecrement(N, DAG);
26073 }
26074
/// performVZEXTCombine - Performs combines on X86ISD::VZEXT nodes.
///
/// Two patterns are recognised; an empty SDValue is returned if neither
/// matches:
///   1. The operand is one or more bitcasts of another VZEXT.
///   2. The operand (looking through bitcasts) is a SCALAR_TO_VECTOR of
///      element 0 extracted from another vector.
/// \p DCI and \p Subtarget are not used by this function.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  MVT VT = N->getSimpleValueType(0);
  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  // Operand element width times the number of result elements.
  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();

  // (vzext (bitcast (vzext (x)) -> (vzext x)
  // Strip any chain of bitcasts from the operand first.
  SDValue V = Op;
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  // V != Op means at least one bitcast was stripped.
  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext. This
    // is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of the
    // inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT,
                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // This requires the extracted scalar to be exactly InputBits wide.
  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    // Only a constant extract index of 0 is handled.
    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
      if (ExtractIdx->getZExtValue() == 0) {
        MVT OrigVT = OrigV.getSimpleValueType();
        // Extract a subvector if necessary...
        // (the source vector is wider than the VZEXT operand type, so take
        // the subvector starting at index 0 that has the operand's width).
        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                    OrigVT.getVectorNumElements() / Ratio);
          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                              DAG.getIntPtrConstant(0));
        }
        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
      }
  }

  return SDValue();
}
26140
/// PerformDAGCombine - Entry point for the X86-specific DAG combines.
///
/// Dispatches on the opcode of \p N to the matching Perform*Combine helper and
/// returns whatever that helper returns.  An empty SDValue is returned for
/// opcodes that have no case here, and for X86ISD::INSERTPS when the
/// optimization level is CodeGenOpt::None.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  // All select-like nodes share one combine.
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND:
    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  // ANY_EXTEND is routed through the zero-extend combine.
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
  // Generic ISD::SETCC and target X86ISD::SETCC have separate combines.
  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
  case X86ISD::INSERTPS: {
    // The INSERTPS combine is skipped entirely when not optimizing.
    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return PerformINSERTPSCombine(N, DAG, Subtarget);
    break;
  }
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
  }

  return SDValue();
}
26217
26218 /// isTypeDesirableForOp - Return true if the target has native support for
26219 /// the specified value type and it is 'desirable' to use the type for the
26220 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26221 /// instruction encodings are longer and some i16 instructions are slow.
26222 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26223   if (!isTypeLegal(VT))
26224     return false;
26225   if (VT != MVT::i16)
26226     return true;
26227
26228   switch (Opc) {
26229   default:
26230     return true;
26231   case ISD::LOAD:
26232   case ISD::SIGN_EXTEND:
26233   case ISD::ZERO_EXTEND:
26234   case ISD::ANY_EXTEND:
26235   case ISD::SHL:
26236   case ISD::SRL:
26237   case ISD::SUB:
26238   case ISD::ADD:
26239   case ISD::MUL:
26240   case ISD::AND:
26241   case ISD::OR:
26242   case ISD::XOR:
26243     return false;
26244   }
26245 }
26246
26247 /// IsDesirableToPromoteOp - This method query the target whether it is
26248 /// beneficial for dag combiner to promote the specified node. If true, it
26249 /// should return the desired promotion type by reference.
26250 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26251   EVT VT = Op.getValueType();
26252   if (VT != MVT::i16)
26253     return false;
26254
26255   bool Promote = false;
26256   bool Commute = false;
26257   switch (Op.getOpcode()) {
26258   default: break;
26259   case ISD::LOAD: {
26260     LoadSDNode *LD = cast<LoadSDNode>(Op);
26261     // If the non-extending load has a single use and it's not live out, then it
26262     // might be folded.
26263     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26264                                                      Op.hasOneUse()*/) {
26265       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26266              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26267         // The only case where we'd want to promote LOAD (rather then it being
26268         // promoted as an operand is when it's only use is liveout.
26269         if (UI->getOpcode() != ISD::CopyToReg)
26270           return false;
26271       }
26272     }
26273     Promote = true;
26274     break;
26275   }
26276   case ISD::SIGN_EXTEND:
26277   case ISD::ZERO_EXTEND:
26278   case ISD::ANY_EXTEND:
26279     Promote = true;
26280     break;
26281   case ISD::SHL:
26282   case ISD::SRL: {
26283     SDValue N0 = Op.getOperand(0);
26284     // Look out for (store (shl (load), x)).
26285     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26286       return false;
26287     Promote = true;
26288     break;
26289   }
26290   case ISD::ADD:
26291   case ISD::MUL:
26292   case ISD::AND:
26293   case ISD::OR:
26294   case ISD::XOR:
26295     Commute = true;
26296     // fallthrough
26297   case ISD::SUB: {
26298     SDValue N0 = Op.getOperand(0);
26299     SDValue N1 = Op.getOperand(1);
26300     if (!Commute && MayFoldLoad(N1))
26301       return false;
26302     // Avoid disabling potential load folding opportunities.
26303     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26304       return false;
26305     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26306       return false;
26307     Promote = true;
26308   }
26309   }
26310
26311   PVT = MVT::i32;
26312   return Promote;
26313 }
26314
26315 //===----------------------------------------------------------------------===//
26316 //                           X86 Inline Assembly Support
26317 //===----------------------------------------------------------------------===//
26318
26319 namespace {
26320   // Helper to match a string separated by whitespace.
26321   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26322     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26323
26324     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26325       StringRef piece(*args[i]);
26326       if (!s.startswith(piece)) // Check if the piece matches.
26327         return false;
26328
26329       s = s.substr(piece.size());
26330       StringRef::size_type pos = s.find_first_not_of(" \t");
26331       if (pos == 0) // We matched a prefix.
26332         return false;
26333
26334       s = s.substr(pos);
26335     }
26336
26337     return s.empty();
26338   }
26339   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26340 }
26341
26342 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26343
26344   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26345     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26346         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26347         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26348
26349       if (AsmPieces.size() == 3)
26350         return true;
26351       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26352         return true;
26353     }
26354   }
26355   return false;
26356 }
26357
/// ExpandInlineAsm - Recognise a handful of inline-asm byte-swap idioms and
/// hand the call to IntrinsicLowering::LowerToByteSwap.
///
/// \p CI must be a call whose callee is an InlineAsm.  Only calls returning an
/// integer whose bit width is a multiple of 16 are considered.  Returns the
/// result of LowerToByteSwap when an idiom matched, and false otherwise.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  // Local copy of the asm text; AsmPieces below holds StringRefs into it.
  std::string AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  // Split the asm text into individual statements at ';' and newlines.
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  // Only one-statement and three-statement sequences are recognised.
  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better.  If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
        matchAsm(AsmPieces[0], "bswapl", "$0") ||
        matchAsm(AsmPieces[0], "bswapq", "$0") ||
        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
      // From here on AsmPieces is reused to hold the constraints that follow
      // the "=r,0," prefix, sorted before being checked.
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    // rorw $$8, ${0:w} ; rorl $$16, $0 ; rorw $$8, ${0:w} on an i32 result.
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
      // As above: AsmPieces now holds the remaining constraints, not the asm
      // statements.
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // This branch requires an i64 result, so it cannot be reached after the
    // i32 branch above has overwritten AsmPieces.
    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      // The first constraint must be exactly "A" and the second exactly "0".
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
            matchAsm(AsmPieces[1], "bswap", "%edx") &&
            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
26433
26434 /// getConstraintType - Given a constraint letter, return the type of
26435 /// constraint it is for this target.
26436 X86TargetLowering::ConstraintType
26437 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26438   if (Constraint.size() == 1) {
26439     switch (Constraint[0]) {
26440     case 'R':
26441     case 'q':
26442     case 'Q':
26443     case 'f':
26444     case 't':
26445     case 'u':
26446     case 'y':
26447     case 'x':
26448     case 'Y':
26449     case 'l':
26450       return C_RegisterClass;
26451     case 'a':
26452     case 'b':
26453     case 'c':
26454     case 'd':
26455     case 'S':
26456     case 'D':
26457     case 'A':
26458       return C_Register;
26459     case 'I':
26460     case 'J':
26461     case 'K':
26462     case 'L':
26463     case 'M':
26464     case 'N':
26465     case 'G':
26466     case 'C':
26467     case 'e':
26468     case 'Z':
26469       return C_Other;
26470     default:
26471       break;
26472     }
26473   }
26474   return TargetLowering::getConstraintType(Constraint);
26475 }
26476
26477 /// Examine constraint type and operand type and determine a weight value.
26478 /// This object must already have been set up with the operand type
26479 /// and the current alternative constraint selected.
26480 TargetLowering::ConstraintWeight
26481   X86TargetLowering::getSingleConstraintMatchWeight(
26482     AsmOperandInfo &info, const char *constraint) const {
26483   ConstraintWeight weight = CW_Invalid;
26484   Value *CallOperandVal = info.CallOperandVal;
26485     // If we don't have a value, we can't do a match,
26486     // but allow it at the lowest weight.
26487   if (!CallOperandVal)
26488     return CW_Default;
26489   Type *type = CallOperandVal->getType();
26490   // Look at the constraint type.
26491   switch (*constraint) {
26492   default:
26493     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26494   case 'R':
26495   case 'q':
26496   case 'Q':
26497   case 'a':
26498   case 'b':
26499   case 'c':
26500   case 'd':
26501   case 'S':
26502   case 'D':
26503   case 'A':
26504     if (CallOperandVal->getType()->isIntegerTy())
26505       weight = CW_SpecificReg;
26506     break;
26507   case 'f':
26508   case 't':
26509   case 'u':
26510     if (type->isFloatingPointTy())
26511       weight = CW_SpecificReg;
26512     break;
26513   case 'y':
26514     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26515       weight = CW_SpecificReg;
26516     break;
26517   case 'x':
26518   case 'Y':
26519     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26520         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26521       weight = CW_Register;
26522     break;
26523   case 'I':
26524     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26525       if (C->getZExtValue() <= 31)
26526         weight = CW_Constant;
26527     }
26528     break;
26529   case 'J':
26530     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26531       if (C->getZExtValue() <= 63)
26532         weight = CW_Constant;
26533     }
26534     break;
26535   case 'K':
26536     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26537       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26538         weight = CW_Constant;
26539     }
26540     break;
26541   case 'L':
26542     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26543       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26544         weight = CW_Constant;
26545     }
26546     break;
26547   case 'M':
26548     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26549       if (C->getZExtValue() <= 3)
26550         weight = CW_Constant;
26551     }
26552     break;
26553   case 'N':
26554     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26555       if (C->getZExtValue() <= 0xff)
26556         weight = CW_Constant;
26557     }
26558     break;
26559   case 'G':
26560   case 'C':
26561     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26562       weight = CW_Constant;
26563     }
26564     break;
26565   case 'e':
26566     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26567       if ((C->getSExtValue() >= -0x80000000LL) &&
26568           (C->getSExtValue() <= 0x7fffffffLL))
26569         weight = CW_Constant;
26570     }
26571     break;
26572   case 'Z':
26573     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26574       if (C->getZExtValue() <= 0xffffffff)
26575         weight = CW_Constant;
26576     }
26577     break;
26578   }
26579   return weight;
26580 }
26581
26582 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26583 /// with another that has more specific requirements based on the type of the
26584 /// corresponding operand.
26585 const char *X86TargetLowering::
26586 LowerXConstraint(EVT ConstraintVT) const {
26587   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26588   // 'f' like normal targets.
26589   if (ConstraintVT.isFloatingPoint()) {
26590     if (Subtarget->hasSSE2())
26591       return "Y";
26592     if (Subtarget->hasSSE1())
26593       return "x";
26594   }
26595
26596   return TargetLowering::LowerXConstraint(ConstraintVT);
26597 }
26598
26599 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26600 /// vector.  If it is invalid, don't add anything to Ops.
26601 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26602                                                      std::string &Constraint,
26603                                                      std::vector<SDValue>&Ops,
26604                                                      SelectionDAG &DAG) const {
26605   SDValue Result;
26606
26607   // Only support length 1 constraints for now.
26608   if (Constraint.length() > 1) return;
26609
26610   char ConstraintLetter = Constraint[0];
26611   switch (ConstraintLetter) {
26612   default: break;
26613   case 'I':
26614     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26615       if (C->getZExtValue() <= 31) {
26616         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26617         break;
26618       }
26619     }
26620     return;
26621   case 'J':
26622     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26623       if (C->getZExtValue() <= 63) {
26624         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26625         break;
26626       }
26627     }
26628     return;
26629   case 'K':
26630     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26631       if (isInt<8>(C->getSExtValue())) {
26632         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26633         break;
26634       }
26635     }
26636     return;
26637   case 'L':
26638     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26639       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26640           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26641         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26642         break;
26643       }
26644     }
26645     return;
26646   case 'M':
26647     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26648       if (C->getZExtValue() <= 3) {
26649         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26650         break;
26651       }
26652     }
26653     return;
26654   case 'N':
26655     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26656       if (C->getZExtValue() <= 255) {
26657         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26658         break;
26659       }
26660     }
26661     return;
26662   case 'O':
26663     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26664       if (C->getZExtValue() <= 127) {
26665         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26666         break;
26667       }
26668     }
26669     return;
26670   case 'e': {
26671     // 32-bit signed value
26672     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26673       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26674                                            C->getSExtValue())) {
26675         // Widen to 64 bits here to get it sign extended.
26676         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26677         break;
26678       }
26679     // FIXME gcc accepts some relocatable values here too, but only in certain
26680     // memory models; it's complicated.
26681     }
26682     return;
26683   }
26684   case 'Z': {
26685     // 32-bit unsigned value
26686     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26687       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26688                                            C->getZExtValue())) {
26689         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26690         break;
26691       }
26692     }
26693     // FIXME gcc accepts some relocatable values here too, but only in certain
26694     // memory models; it's complicated.
26695     return;
26696   }
26697   case 'i': {
26698     // Literal immediates are always ok.
26699     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26700       // Widen to 64 bits here to get it sign extended.
26701       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26702       break;
26703     }
26704
26705     // In any sort of PIC mode addresses need to be computed at runtime by
26706     // adding in a register or some sort of table lookup.  These can't
26707     // be used as immediates.
26708     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26709       return;
26710
26711     // If we are in non-pic codegen mode, we allow the address of a global (with
26712     // an optional displacement) to be used with 'i'.
26713     GlobalAddressSDNode *GA = nullptr;
26714     int64_t Offset = 0;
26715
26716     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26717     while (1) {
26718       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26719         Offset += GA->getOffset();
26720         break;
26721       } else if (Op.getOpcode() == ISD::ADD) {
26722         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26723           Offset += C->getZExtValue();
26724           Op = Op.getOperand(0);
26725           continue;
26726         }
26727       } else if (Op.getOpcode() == ISD::SUB) {
26728         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26729           Offset += -C->getZExtValue();
26730           Op = Op.getOperand(0);
26731           continue;
26732         }
26733       }
26734
26735       // Otherwise, this isn't something we can handle, reject it.
26736       return;
26737     }
26738
26739     const GlobalValue *GV = GA->getGlobal();
26740     // If we require an extra load to get this address, as in PIC mode, we
26741     // can't accept it.
26742     if (isGlobalStubReference(
26743             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26744       return;
26745
26746     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26747                                         GA->getValueType(0), Offset);
26748     break;
26749   }
26750   }
26751
26752   if (Result.getNode()) {
26753     Ops.push_back(Result);
26754     return;
26755   }
26756   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26757 }
26758
26759 std::pair<unsigned, const TargetRegisterClass*>
26760 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26761                                                 MVT VT) const {
26762   // First, see if this is a constraint that directly corresponds to an LLVM
26763   // register class.
26764   if (Constraint.size() == 1) {
26765     // GCC Constraint Letters
26766     switch (Constraint[0]) {
26767     default: break;
26768       // TODO: Slight differences here in allocation order and leaving
26769       // RIP in the class. Do they matter any more here than they do
26770       // in the normal allocation?
26771     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26772       if (Subtarget->is64Bit()) {
26773         if (VT == MVT::i32 || VT == MVT::f32)
26774           return std::make_pair(0U, &X86::GR32RegClass);
26775         if (VT == MVT::i16)
26776           return std::make_pair(0U, &X86::GR16RegClass);
26777         if (VT == MVT::i8 || VT == MVT::i1)
26778           return std::make_pair(0U, &X86::GR8RegClass);
26779         if (VT == MVT::i64 || VT == MVT::f64)
26780           return std::make_pair(0U, &X86::GR64RegClass);
26781         break;
26782       }
26783       // 32-bit fallthrough
26784     case 'Q':   // Q_REGS
26785       if (VT == MVT::i32 || VT == MVT::f32)
26786         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26787       if (VT == MVT::i16)
26788         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26789       if (VT == MVT::i8 || VT == MVT::i1)
26790         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26791       if (VT == MVT::i64)
26792         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26793       break;
26794     case 'r':   // GENERAL_REGS
26795     case 'l':   // INDEX_REGS
26796       if (VT == MVT::i8 || VT == MVT::i1)
26797         return std::make_pair(0U, &X86::GR8RegClass);
26798       if (VT == MVT::i16)
26799         return std::make_pair(0U, &X86::GR16RegClass);
26800       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26801         return std::make_pair(0U, &X86::GR32RegClass);
26802       return std::make_pair(0U, &X86::GR64RegClass);
26803     case 'R':   // LEGACY_REGS
26804       if (VT == MVT::i8 || VT == MVT::i1)
26805         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26806       if (VT == MVT::i16)
26807         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26808       if (VT == MVT::i32 || !Subtarget->is64Bit())
26809         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26810       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26811     case 'f':  // FP Stack registers.
26812       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26813       // value to the correct fpstack register class.
26814       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26815         return std::make_pair(0U, &X86::RFP32RegClass);
26816       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26817         return std::make_pair(0U, &X86::RFP64RegClass);
26818       return std::make_pair(0U, &X86::RFP80RegClass);
26819     case 'y':   // MMX_REGS if MMX allowed.
26820       if (!Subtarget->hasMMX()) break;
26821       return std::make_pair(0U, &X86::VR64RegClass);
26822     case 'Y':   // SSE_REGS if SSE2 allowed
26823       if (!Subtarget->hasSSE2()) break;
26824       // FALL THROUGH.
26825     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26826       if (!Subtarget->hasSSE1()) break;
26827
26828       switch (VT.SimpleTy) {
26829       default: break;
26830       // Scalar SSE types.
26831       case MVT::f32:
26832       case MVT::i32:
26833         return std::make_pair(0U, &X86::FR32RegClass);
26834       case MVT::f64:
26835       case MVT::i64:
26836         return std::make_pair(0U, &X86::FR64RegClass);
26837       // Vector types.
26838       case MVT::v16i8:
26839       case MVT::v8i16:
26840       case MVT::v4i32:
26841       case MVT::v2i64:
26842       case MVT::v4f32:
26843       case MVT::v2f64:
26844         return std::make_pair(0U, &X86::VR128RegClass);
26845       // AVX types.
26846       case MVT::v32i8:
26847       case MVT::v16i16:
26848       case MVT::v8i32:
26849       case MVT::v4i64:
26850       case MVT::v8f32:
26851       case MVT::v4f64:
26852         return std::make_pair(0U, &X86::VR256RegClass);
26853       case MVT::v8f64:
26854       case MVT::v16f32:
26855       case MVT::v16i32:
26856       case MVT::v8i64:
26857         return std::make_pair(0U, &X86::VR512RegClass);
26858       }
26859       break;
26860     }
26861   }
26862
26863   // Use the default implementation in TargetLowering to convert the register
26864   // constraint into a member of a register class.
26865   std::pair<unsigned, const TargetRegisterClass*> Res;
26866   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26867
26868   // Not found as a standard register?
26869   if (!Res.second) {
26870     // Map st(0) -> st(7) -> ST0
26871     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26872         tolower(Constraint[1]) == 's' &&
26873         tolower(Constraint[2]) == 't' &&
26874         Constraint[3] == '(' &&
26875         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26876         Constraint[5] == ')' &&
26877         Constraint[6] == '}') {
26878
26879       Res.first = X86::FP0+Constraint[4]-'0';
26880       Res.second = &X86::RFP80RegClass;
26881       return Res;
26882     }
26883
26884     // GCC allows "st(0)" to be called just plain "st".
26885     if (StringRef("{st}").equals_lower(Constraint)) {
26886       Res.first = X86::FP0;
26887       Res.second = &X86::RFP80RegClass;
26888       return Res;
26889     }
26890
26891     // flags -> EFLAGS
26892     if (StringRef("{flags}").equals_lower(Constraint)) {
26893       Res.first = X86::EFLAGS;
26894       Res.second = &X86::CCRRegClass;
26895       return Res;
26896     }
26897
26898     // 'A' means EAX + EDX.
26899     if (Constraint == "A") {
26900       Res.first = X86::EAX;
26901       Res.second = &X86::GR32_ADRegClass;
26902       return Res;
26903     }
26904     return Res;
26905   }
26906
26907   // Otherwise, check to see if this is a register class of the wrong value
26908   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26909   // turn into {ax},{dx}.
26910   if (Res.second->hasType(VT))
26911     return Res;   // Correct type already, nothing to do.
26912
26913   // All of the single-register GCC register classes map their values onto
26914   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26915   // really want an 8-bit or 32-bit register, map to the appropriate register
26916   // class and return the appropriate register.
26917   if (Res.second == &X86::GR16RegClass) {
26918     if (VT == MVT::i8 || VT == MVT::i1) {
26919       unsigned DestReg = 0;
26920       switch (Res.first) {
26921       default: break;
26922       case X86::AX: DestReg = X86::AL; break;
26923       case X86::DX: DestReg = X86::DL; break;
26924       case X86::CX: DestReg = X86::CL; break;
26925       case X86::BX: DestReg = X86::BL; break;
26926       }
26927       if (DestReg) {
26928         Res.first = DestReg;
26929         Res.second = &X86::GR8RegClass;
26930       }
26931     } else if (VT == MVT::i32 || VT == MVT::f32) {
26932       unsigned DestReg = 0;
26933       switch (Res.first) {
26934       default: break;
26935       case X86::AX: DestReg = X86::EAX; break;
26936       case X86::DX: DestReg = X86::EDX; break;
26937       case X86::CX: DestReg = X86::ECX; break;
26938       case X86::BX: DestReg = X86::EBX; break;
26939       case X86::SI: DestReg = X86::ESI; break;
26940       case X86::DI: DestReg = X86::EDI; break;
26941       case X86::BP: DestReg = X86::EBP; break;
26942       case X86::SP: DestReg = X86::ESP; break;
26943       }
26944       if (DestReg) {
26945         Res.first = DestReg;
26946         Res.second = &X86::GR32RegClass;
26947       }
26948     } else if (VT == MVT::i64 || VT == MVT::f64) {
26949       unsigned DestReg = 0;
26950       switch (Res.first) {
26951       default: break;
26952       case X86::AX: DestReg = X86::RAX; break;
26953       case X86::DX: DestReg = X86::RDX; break;
26954       case X86::CX: DestReg = X86::RCX; break;
26955       case X86::BX: DestReg = X86::RBX; break;
26956       case X86::SI: DestReg = X86::RSI; break;
26957       case X86::DI: DestReg = X86::RDI; break;
26958       case X86::BP: DestReg = X86::RBP; break;
26959       case X86::SP: DestReg = X86::RSP; break;
26960       }
26961       if (DestReg) {
26962         Res.first = DestReg;
26963         Res.second = &X86::GR64RegClass;
26964       }
26965     }
26966   } else if (Res.second == &X86::FR32RegClass ||
26967              Res.second == &X86::FR64RegClass ||
26968              Res.second == &X86::VR128RegClass ||
26969              Res.second == &X86::VR256RegClass ||
26970              Res.second == &X86::FR32XRegClass ||
26971              Res.second == &X86::FR64XRegClass ||
26972              Res.second == &X86::VR128XRegClass ||
26973              Res.second == &X86::VR256XRegClass ||
26974              Res.second == &X86::VR512RegClass) {
26975     // Handle references to XMM physical registers that got mapped into the
26976     // wrong class.  This can happen with constraints like {xmm0} where the
26977     // target independent register mapper will just pick the first match it can
26978     // find, ignoring the required type.
26979
26980     if (VT == MVT::f32 || VT == MVT::i32)
26981       Res.second = &X86::FR32RegClass;
26982     else if (VT == MVT::f64 || VT == MVT::i64)
26983       Res.second = &X86::FR64RegClass;
26984     else if (X86::VR128RegClass.hasType(VT))
26985       Res.second = &X86::VR128RegClass;
26986     else if (X86::VR256RegClass.hasType(VT))
26987       Res.second = &X86::VR256RegClass;
26988     else if (X86::VR512RegClass.hasType(VT))
26989       Res.second = &X86::VR512RegClass;
26990   }
26991
26992   return Res;
26993 }
26994
26995 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26996                                             Type *Ty) const {
26997   // Scaling factors are not free at all.
26998   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26999   // will take 2 allocations in the out of order engine instead of 1
27000   // for plain addressing mode, i.e. inst (reg1).
27001   // E.g.,
27002   // vaddps (%rsi,%drx), %ymm0, %ymm1
27003   // Requires two allocations (one for the load, one for the computation)
27004   // whereas:
27005   // vaddps (%rsi), %ymm0, %ymm1
27006   // Requires just 1 allocation, i.e., freeing allocations for other operations
27007   // and having less micro operations to execute.
27008   //
27009   // For some X86 architectures, this is even worse because for instance for
27010   // stores, the complex addressing mode forces the instruction to use the
27011   // "load" ports instead of the dedicated "store" port.
27012   // E.g., on Haswell:
27013   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27014   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27015   if (isLegalAddressingMode(AM, Ty))
27016     // Scale represents reg2 * scale, thus account for 1
27017     // as soon as we use a second register.
27018     return AM.Scale != 0;
27019   return -1;
27020 }
27021
27022 bool X86TargetLowering::isTargetFTOL() const {
27023   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27024 }