lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 206                                      const X86Subtarget &STI)
 207     : TargetLowering(TM), Subtarget(&STI) {
 208   X86ScalarSSEf64 = Subtarget->hasSSE2();
 209   X86ScalarSSEf32 = Subtarget->hasSSE1();
 210   TD = getDataLayout();
 211
 212   // Set up the TargetLowering object.
 213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 214
 215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 216   setBooleanContents(ZeroOrOneBooleanContent);
 217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 219
 220   // For 64-bit, since we have so many registers, use the ILP scheduler.
 221   // For 32-bit, use the register pressure specific scheduling.
 222   // For Atom, always use ILP scheduling.
 223   if (Subtarget->isAtom())
 224     setSchedulingPreference(Sched::ILP);
 225   else if (Subtarget->is64Bit())
 226     setSchedulingPreference(Sched::ILP);
 227   else
 228     setSchedulingPreference(Sched::RegPressure);
 229   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
 230   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 231
 232   // Bypass expensive divides on Atom when compiling with O2.
 233   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 234     if (Subtarget->hasSlowDivide32())
 235       addBypassSlowDiv(32, 8);
 236     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 237       addBypassSlowDiv(64, 16);
 238   }
 239
 240   if (Subtarget->isTargetKnownWindowsMSVC()) {
 241     // Setup Windows compiler runtime calls.
 242     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 243     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 244     setLibcallName(RTLIB::SREM_I64, "_allrem");
 245     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 246     setLibcallName(RTLIB::MUL_I64, "_allmul");
 247     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 248     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 249     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 250     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 251     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 252
 253     // The _ftol2 runtime function has an unusual calling conv, which
 254     // is modeled by a special pseudo-instruction.
 255     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 256     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 257     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 258     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 259   }
 260
 261   if (Subtarget->isTargetDarwin()) {
 262     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 263     setUseUnderscoreSetJmp(false);
 264     setUseUnderscoreLongJmp(false);
 265   } else if (Subtarget->isTargetWindowsGNU()) {
 266     // MS runtime is weird: it exports _setjmp, but longjmp!
 267     setUseUnderscoreSetJmp(true);
 268     setUseUnderscoreLongJmp(false);
 269   } else {
 270     setUseUnderscoreSetJmp(true);
 271     setUseUnderscoreLongJmp(true);
 272   }
 273
 274   // Set up the register classes.
 275   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 276   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 277   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 278   if (Subtarget->is64Bit())
 279     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 280
 281   for (MVT VT : MVT::integer_valuetypes())
 282     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 283
 284   // We don't accept any truncstore of integer registers.
 285   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 286   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 287   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 288   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 289   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 290   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 291
 292   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 293
 294   // SETOEQ and SETUNE require checking two conditions.
 295   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 296   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 297   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 298   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 299   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 300   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 301
 302   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 303   // operation.
 304   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 305   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 306   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 307
 308   if (Subtarget->is64Bit()) {
 309     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 310     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 311   } else if (!TM.Options.UseSoftFloat) {
 312     // We have an algorithm for SSE2->double, and we turn this into a
 313     // 64-bit FILD followed by conditional FADD for other targets.
 314     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 315     // We have an algorithm for SSE2, and we turn this into a 64-bit
 316     // FILD for other targets.
 317     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 318   }
 319
 320   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 321   // this operation.
 322   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 323   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 324
 325   if (!TM.Options.UseSoftFloat) {
 326     // SSE has no i16 to fp conversion, only i32
 327     if (X86ScalarSSEf32) {
 328       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 329       // f32 and f64 cases are Legal, f80 case is not
 330       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 331     } else {
 332       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 333       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 334     }
 335   } else {
 336     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 337     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 338   }
 339
 340   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 341   // are Legal, f80 is custom lowered.
 342   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 343   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 344
 345   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 346   // this operation.
 347   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 348   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 349
 350   if (X86ScalarSSEf32) {
 351     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 352     // f32 and f64 cases are Legal, f80 case is not
 353     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 354   } else {
 355     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 356     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 357   }
 358
 359   // Handle FP_TO_UINT by promoting the destination to a larger signed
 360   // conversion.
 361   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 362   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 363   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 364
 365   if (Subtarget->is64Bit()) {
 366     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 367     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 368   } else if (!TM.Options.UseSoftFloat) {
 369     // Since AVX is a superset of SSE3, only check for SSE here.
 370     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 371       // Expand FP_TO_UINT into a select.
 372       // FIXME: We would like to use a Custom expander here eventually to do
 373       // the optimal thing for SSE vs. the default expansion in the legalizer.
 374       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 375     else
 376       // With SSE3 we can use fisttpll to convert to a signed i64; without
 377       // SSE, we're stuck with a fistpll.
 378       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 379   }
 380
 381   if (isTargetFTOL()) {
 382     // Use the _ftol2 runtime function, which has a pseudo-instruction
 383     // to handle its weird calling convention.
 384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 385   }
 386
 387   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 388   if (!X86ScalarSSEf64) {
 389     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 390     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 391     if (Subtarget->is64Bit()) {
 392       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 393       // Without SSE, i64->f64 goes through memory.
 394       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 395     }
 396   }
 397
 398   // Scalar integer divide and remainder are lowered to use operations that
 399   // produce two results, to match the available instructions. This exposes
 400   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 401   // into a single instruction.
 402   //
 403   // Scalar integer multiply-high is also lowered to use two-result
 404   // operations, to match the available instructions. However, plain multiply
 405   // (low) operations are left as Legal, as there are single-result
 406   // instructions for this in x86. Using the two-result multiply instructions
 407   // when both high and low results are needed must be arranged by dagcombine.
 408   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 409     MVT VT = IntVTs[i];
 410     setOperationAction(ISD::MULHS, VT, Expand);
 411     setOperationAction(ISD::MULHU, VT, Expand);
 412     setOperationAction(ISD::SDIV, VT, Expand);
 413     setOperationAction(ISD::UDIV, VT, Expand);
 414     setOperationAction(ISD::SREM, VT, Expand);
 415     setOperationAction(ISD::UREM, VT, Expand);
 416
 417     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 418     setOperationAction(ISD::ADDC, VT, Custom);
 419     setOperationAction(ISD::ADDE, VT, Custom);
 420     setOperationAction(ISD::SUBC, VT, Custom);
 421     setOperationAction(ISD::SUBE, VT, Custom);
 422   }
 423
 424   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 425   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 426   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 427   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 428   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 429   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 430   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 431   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 432   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 433   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 434   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 435   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 436   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 437   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 438   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 439   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 440   if (Subtarget->is64Bit())
 441     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 442   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 445   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 446   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 447   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 448   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 449   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 450
 451   // Promote the i8 variants and force them on up to i32 which has a shorter
 452   // encoding.
 453   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 454   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 455   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 456   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 457   if (Subtarget->hasBMI()) {
 458     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 460     if (Subtarget->is64Bit())
 461       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 462   } else {
 463     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 464     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 465     if (Subtarget->is64Bit())
 466       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 467   }
 468
 469   if (Subtarget->hasLZCNT()) {
 470     // When promoting the i8 variants, force them to i32 for a shorter
 471     // encoding.
 472     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 473     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 474     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 475     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 476     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 478     if (Subtarget->is64Bit())
 479       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 480   } else {
 481     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 482     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 483     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 484     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 487     if (Subtarget->is64Bit()) {
 488       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 489       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 490     }
 491   }
 492
 493   // Special handling for half-precision floating point conversions.
 494   // If we don't have F16C support, then lower half float conversions
 495   // into library calls.
 496   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 497     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 498     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 499   }
 500
 501   // There's never any support for operations beyond MVT::f32.
 502   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 503   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 504   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 505   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 506
 507   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 508   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 509   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 513
 514   if (Subtarget->hasPOPCNT()) {
 515     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 516   } else {
 517     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 518     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 519     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 520     if (Subtarget->is64Bit())
 521       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 522   }
 523
 524   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 525
 526   if (!Subtarget->hasMOVBE())
 527     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 528
 529   // These should be promoted to a larger select which is supported.
 530   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 531   // X86 wants to expand cmov itself.
 532   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 533   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 534   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 535   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 536   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 537   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 538   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 539   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 540   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 541   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 542   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 543   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 544   if (Subtarget->is64Bit()) {
 545     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 546     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 547   }
 548   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 549   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 550   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 551   // support continuation, user-level threading, and etc.. As a result, no
 552   // other SjLj exception interfaces are implemented and please don't build
 553   // your own exception handling based on them.
 554   // LLVM/Clang supports zero-cost DWARF exception handling.
 555   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 556   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 557
 558   // Darwin ABI issue.
 559   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 560   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 561   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 562   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 563   if (Subtarget->is64Bit())
 564     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 565   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 566   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 567   if (Subtarget->is64Bit()) {
 568     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 569     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 570     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 571     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 572     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 573   }
 574   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
 575   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 576   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 577   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 578   if (Subtarget->is64Bit()) {
 579     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 580     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 581     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 582   }
 583
 584   if (Subtarget->hasSSE1())
 585     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 586
 587   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 588
 589   // Expand certain atomics
 590   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 591     MVT VT = IntVTs[i];
 592     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 593     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 594     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 595   }
 596
 597   if (Subtarget->hasCmpxchg16b()) {
 598     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 599   }
 600
 601   // FIXME - use subtarget debug flags
 602   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 603       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 604     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 605   }
 606
 607   if (Subtarget->is64Bit()) {
 608     setExceptionPointerRegister(X86::RAX);
 609     setExceptionSelectorRegister(X86::RDX);
 610   } else {
 611     setExceptionPointerRegister(X86::EAX);
 612     setExceptionSelectorRegister(X86::EDX);
 613   }
 614   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 616
 617   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 618   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 619
 620   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 621   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 622
 623   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 624   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 625   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 626   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 627     // TargetInfo::X86_64ABIBuiltinVaList
 628     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 629     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 630   } else {
 631     // TargetInfo::CharPtrBuiltinVaList
 632     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 633     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 634   }
 635
 636   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 637   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 638
 639   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 640
 641   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 642     // f32 and f64 use SSE.
 643     // Set up the FP register classes.
 644     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 645     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 646
 647     // Use ANDPD to simulate FABS.
 648     setOperationAction(ISD::FABS , MVT::f64, Custom);
 649     setOperationAction(ISD::FABS , MVT::f32, Custom);
 650
 651     // Use XORP to simulate FNEG.
 652     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 653     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 654
 655     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 656     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 657     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 658
 659     // Lower this to FGETSIGNx86 plus an AND.
 660     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 661     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 662
 663     // We don't support sin/cos/fmod
 664     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 665     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 666     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 667     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 668     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 669     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 670
 671     // Expand FP immediates into loads from the stack, except for the special
 672     // cases we handle.
 673     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 674     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 675   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 676     // Use SSE for f32, x87 for f64.
 677     // Set up the FP register classes.
 678     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 679     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 680
 681     // Use ANDPS to simulate FABS.
 682     setOperationAction(ISD::FABS , MVT::f32, Custom);
 683
 684     // Use XORP to simulate FNEG.
 685     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 686
 687     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 688
 689     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 690     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 691     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 692
 693     // We don't support sin/cos/fmod
 694     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 695     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 696     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 697
 698     // Special cases we handle for FP constants.
 699     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 700     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 701     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 702     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 703     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 704
 705     if (!TM.Options.UnsafeFPMath) {
 706       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 707       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 708       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 709     }
 710   } else if (!TM.Options.UseSoftFloat) {
 711     // f32 and f64 in x87.
 712     // Set up the FP register classes.
 713     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 714     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 715
 716     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 717     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 718     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 719     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 720
 721     if (!TM.Options.UnsafeFPMath) {
 722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 723       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 724       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 725       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 726       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 727       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 728     }
 729     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 730     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 731     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 732     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 733     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 734     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 735     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 736     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 737   }
 738
 739   // We don't support FMA.
 740   setOperationAction(ISD::FMA, MVT::f64, Expand);
 741   setOperationAction(ISD::FMA, MVT::f32, Expand);
 742
 743   // Long double always uses X87.
 744   if (!TM.Options.UseSoftFloat) {
 745     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 746     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 747     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 748     {
 749       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 750       addLegalFPImmediate(TmpFlt);  // FLD0
 751       TmpFlt.changeSign();
 752       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 753
 754       bool ignored;
 755       APFloat TmpFlt2(+1.0);
 756       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 757                       &ignored);
 758       addLegalFPImmediate(TmpFlt2);  // FLD1
 759       TmpFlt2.changeSign();
 760       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 761     }
 762
 763     if (!TM.Options.UnsafeFPMath) {
 764       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 765       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 766       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 767     }
 768
 769     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 770     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 771     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 772     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 773     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 774     setOperationAction(ISD::FMA, MVT::f80, Expand);
 775   }
 776
 777   // Always use a library call for pow.
 778   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 779   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 780   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 781
 782   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 783   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 784   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 785   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 786   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 787   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 788   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 789
 790   // First set operation action for all vector types to either promote
 791   // (for widening) or expand (for scalarization). Then we will selectively
 792   // turn on ones that can be effectively codegen'd.
 793   for (MVT VT : MVT::vector_valuetypes()) {
 794     setOperationAction(ISD::ADD , VT, Expand);
 795     setOperationAction(ISD::SUB , VT, Expand);
 796     setOperationAction(ISD::FADD, VT, Expand);
 797     setOperationAction(ISD::FNEG, VT, Expand);
 798     setOperationAction(ISD::FSUB, VT, Expand);
 799     setOperationAction(ISD::MUL , VT, Expand);
 800     setOperationAction(ISD::FMUL, VT, Expand);
 801     setOperationAction(ISD::SDIV, VT, Expand);
 802     setOperationAction(ISD::UDIV, VT, Expand);
 803     setOperationAction(ISD::FDIV, VT, Expand);
 804     setOperationAction(ISD::SREM, VT, Expand);
 805     setOperationAction(ISD::UREM, VT, Expand);
 806     setOperationAction(ISD::LOAD, VT, Expand);
 807     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 808     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 809     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 810     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 811     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 812     setOperationAction(ISD::FABS, VT, Expand);
 813     setOperationAction(ISD::FSIN, VT, Expand);
 814     setOperationAction(ISD::FSINCOS, VT, Expand);
 815     setOperationAction(ISD::FCOS, VT, Expand);
 816     setOperationAction(ISD::FSINCOS, VT, Expand);
 817     setOperationAction(ISD::FREM, VT, Expand);
 818     setOperationAction(ISD::FMA,  VT, Expand);
 819     setOperationAction(ISD::FPOWI, VT, Expand);
 820     setOperationAction(ISD::FSQRT, VT, Expand);
 821     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 822     setOperationAction(ISD::FFLOOR, VT, Expand);
 823     setOperationAction(ISD::FCEIL, VT, Expand);
 824     setOperationAction(ISD::FTRUNC, VT, Expand);
 825     setOperationAction(ISD::FRINT, VT, Expand);
 826     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 827     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 828     setOperationAction(ISD::MULHS, VT, Expand);
 829     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 830     setOperationAction(ISD::MULHU, VT, Expand);
 831     setOperationAction(ISD::SDIVREM, VT, Expand);
 832     setOperationAction(ISD::UDIVREM, VT, Expand);
 833     setOperationAction(ISD::FPOW, VT, Expand);
 834     setOperationAction(ISD::CTPOP, VT, Expand);
 835     setOperationAction(ISD::CTTZ, VT, Expand);
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 837     setOperationAction(ISD::CTLZ, VT, Expand);
 838     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 839     setOperationAction(ISD::SHL, VT, Expand);
 840     setOperationAction(ISD::SRA, VT, Expand);
 841     setOperationAction(ISD::SRL, VT, Expand);
 842     setOperationAction(ISD::ROTL, VT, Expand);
 843     setOperationAction(ISD::ROTR, VT, Expand);
 844     setOperationAction(ISD::BSWAP, VT, Expand);
 845     setOperationAction(ISD::SETCC, VT, Expand);
 846     setOperationAction(ISD::FLOG, VT, Expand);
 847     setOperationAction(ISD::FLOG2, VT, Expand);
 848     setOperationAction(ISD::FLOG10, VT, Expand);
 849     setOperationAction(ISD::FEXP, VT, Expand);
 850     setOperationAction(ISD::FEXP2, VT, Expand);
 851     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 852     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 853     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 854     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 855     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 856     setOperationAction(ISD::TRUNCATE, VT, Expand);
 857     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 858     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 859     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 860     setOperationAction(ISD::VSELECT, VT, Expand);
 861     setOperationAction(ISD::SELECT_CC, VT, Expand);
 862     for (MVT InnerVT : MVT::vector_valuetypes()) {
 863       setTruncStoreAction(InnerVT, VT, Expand);
 864
 865       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 866       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 867
 868       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 869       // types, we have to deal with them whether we ask for Expansion or not.
 870       // Setting Expand causes its own optimisation problems though, so leave
 871       // them legal.
 872       if (VT.getVectorElementType() == MVT::i1)
 873         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 874     }
 875   }
 876
 877   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 878   // with -msoft-float, disable use of MMX as well.
 879   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 880     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 881     // No operations on x86mmx supported, everything uses intrinsics.
 882   }
 883
 884   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 885   // into smaller operations.
 886   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 887   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 888   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 889   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 890   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 891   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 892   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 893   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 894   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 895   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 896   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 897   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 898   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 899   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 900   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 901   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 902   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 906   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 907   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 908   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 909   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 910   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 911   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 912   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 913   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 914   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 915
 916   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 917     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 918
 919     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 920     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 921     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 922     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 923     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 924     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 925     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 926     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 927     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 928     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 929     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 930     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 931     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 932   }
 933
 934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 935     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 936
 937     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 938     // registers cannot be used even for integer operations.
 939     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 940     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 941     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 942     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 943
 944     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 945     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 946     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 947     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 948     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 949     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 950     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 951     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 952     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 953     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 954     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 955     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 956     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 957     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 958     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 959     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 960     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 961     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 962     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 963     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 964     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 965     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 966
 967     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 968     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 969     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 970     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 971
 972     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 974     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 977
 978     // Only provide customized ctpop vector bit twiddling for vector types we
 979     // know to perform better than using the popcnt instructions on each vector
 980     // element. If popcnt isn't supported, always provide the custom version.
 981     if (!Subtarget->hasPOPCNT()) {
 982       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
 983       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
 984     }
 985
 986     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 987     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
 988       MVT VT = (MVT::SimpleValueType)i;
 989       // Do not attempt to custom lower non-power-of-2 vectors
 990       if (!isPowerOf2_32(VT.getVectorNumElements()))
 991         continue;
 992       // Do not attempt to custom lower non-128-bit vectors
 993       if (!VT.is128BitVector())
 994         continue;
 995       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 996       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 997       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 998     }
 999
1000     // We support custom legalizing of sext and anyext loads for specific
1001     // memory vector types which we can load as a scalar (or sequence of
1002     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1003     // loads these must work with a single scalar load.
1004     for (MVT VT : MVT::integer_vector_valuetypes()) {
1005       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1008       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1014     }
1015
1016     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1018     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1020     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1021     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1022
1023     if (Subtarget->is64Bit()) {
1024       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1025       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1026     }
1027
1028     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1029     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1030       MVT VT = (MVT::SimpleValueType)i;
1031
1032       // Do not attempt to promote non-128-bit vectors
1033       if (!VT.is128BitVector())
1034         continue;
1035
1036       setOperationAction(ISD::AND,    VT, Promote);
1037       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1038       setOperationAction(ISD::OR,     VT, Promote);
1039       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1040       setOperationAction(ISD::XOR,    VT, Promote);
1041       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1042       setOperationAction(ISD::LOAD,   VT, Promote);
1043       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1044       setOperationAction(ISD::SELECT, VT, Promote);
1045       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1046     }
1047
1048     // Custom lower v2i64 and v2f64 selects.
1049     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1050     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1051     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1052     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1053
1054     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1055     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1056
1057     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1059     // As there is no 64-bit GPR available, we need to build a special custom
1060     // sequence to convert from v2i32 to v2f32.
1061     if (!Subtarget->is64Bit())
1062       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1063
1064     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1065     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1066
1067     for (MVT VT : MVT::fp_vector_valuetypes())
1068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1069
1070     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1071     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1072     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1073   }
1074
1075   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1076     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1077     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1078     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1079     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1080     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1081     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1082     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1083     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1084     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1085     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1086
1087     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1088     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1089     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1090     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1091     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1092     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1093     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1094     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1095     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1096     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1097
1098     // FIXME: Do we need to handle scalar-to-vector here?
1099     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1100
1101     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1102     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1103     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1104     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1105     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1106     // There is no BLENDI for byte vectors. We don't need to custom lower
1107     // some vselects for now.
1108     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1109
1110     // SSE41 brings specific instructions for doing vector sign extend even in
1111     // cases where we don't have SRA.
1112     for (MVT VT : MVT::integer_vector_valuetypes()) {
1113       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1116     }
1117
1118     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1119     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1120     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1121     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1122     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1123     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1125
1126     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
1127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1132
1133     // i8 and i16 vectors are custom because the source register and source
1134     // memory operand types are not the same width.  f32 vectors are
1135     // custom since the immediate controlling the insert encodes additional
1136     // information.
1137     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1141
1142     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1146
1147     // FIXME: these should be Legal, but that's only for the case where
1148     // the index is constant.  For now custom expand to deal with that.
1149     if (Subtarget->is64Bit()) {
1150       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1151       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1152     }
1153   }
1154
1155   if (Subtarget->hasSSE2()) {
1156     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1157     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1158
1159     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1160     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1161
1162     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1164
1165     // In the customized shift lowering, the legal cases in AVX2 will be
1166     // recognized.
1167     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1168     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1169
1170     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1171     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1172
1173     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1174   }
1175
1176   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1177     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1178     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1179     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1180     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1181     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1182     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1183
1184     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1185     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1186     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1187
1188     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1189     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1190     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1193     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1194     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1195     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1196     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1197     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1198     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1199     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1200
1201     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1202     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1203     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1204     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1205     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1206     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1207     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1208     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1209     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1210     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1211     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1212     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1213
1214     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1215     // even though v8i16 is a legal type.
1216     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1217     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1218     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1219
1220     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1222     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1223
1224     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1226
1227     for (MVT VT : MVT::fp_vector_valuetypes())
1228       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1231     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1232
1233     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1234     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1235
1236     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1240     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1241     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1242     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1243
1244     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1245     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1246     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1247
1248     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1249     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1250     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1251     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1252
1253     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1256     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1259     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1260     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1261     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1262     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1263     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1264     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1265
1266     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1267       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1268       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1269       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1270       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1271       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1272       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1273     }
1274
1275     if (Subtarget->hasInt256()) {
1276       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1277       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1278       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1279       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1280
1281       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1282       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1283       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1284       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1285
1286       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1287       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1288       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1289       // Don't lower v32i8 because there is no 128-bit byte mul
1290
1291       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1292       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1293       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1294       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1295
1296       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1297       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1298
1299       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1300       // when we have a 256bit-wide blend with immediate.
1301       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1302
1303       // Only provide customized ctpop vector bit twiddling for vector types we
1304       // know to perform better than using the popcnt instructions on each
1305       // vector element. If popcnt isn't supported, always provide the custom
1306       // version.
1307       if (!Subtarget->hasPOPCNT())
1308         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1309
1310       // Custom CTPOP always performs better on natively supported v8i32
1311       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1312
1313       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1314       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1315       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1316       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1317       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1318       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1320
1321       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1327     } else {
1328       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1329       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1330       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1331       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1332
1333       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1334       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1335       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1336       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1337
1338       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1339       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1340       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1341       // Don't lower v32i8 because there is no 128-bit byte mul
1342     }
1343
1344     // In the customized shift lowering, the legal cases in AVX2 will be
1345     // recognized.
1346     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1347     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1348
1349     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1350     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1351
1352     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1353
1354     // Custom lower several nodes for 256-bit types.
1355     for (MVT VT : MVT::vector_valuetypes()) {
1356       if (VT.getScalarSizeInBits() >= 32) {
1357         setOperationAction(ISD::MLOAD,  VT, Legal);
1358         setOperationAction(ISD::MSTORE, VT, Legal);
1359       }
1360       // Extract subvector is special because the value type
1361       // (result) is 128-bit but the source is 256-bit wide.
1362       if (VT.is128BitVector()) {
1363         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1364       }
1365       // Do not attempt to custom lower other non-256-bit vectors
1366       if (!VT.is256BitVector())
1367         continue;
1368
1369       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1370       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1371       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1372       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1373       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1374       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1375       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1376     }
1377
1378     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1379     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1380       MVT VT = (MVT::SimpleValueType)i;
1381
1382       // Do not attempt to promote non-256-bit vectors
1383       if (!VT.is256BitVector())
1384         continue;
1385
1386       setOperationAction(ISD::AND,    VT, Promote);
1387       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1388       setOperationAction(ISD::OR,     VT, Promote);
1389       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1390       setOperationAction(ISD::XOR,    VT, Promote);
1391       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1392       setOperationAction(ISD::LOAD,   VT, Promote);
1393       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1394       setOperationAction(ISD::SELECT, VT, Promote);
1395       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1396     }
1397   }
1398
1399   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1400     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1401     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1402     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1403     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1404
1405     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1406     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1407     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1408
1409     for (MVT VT : MVT::fp_vector_valuetypes())
1410       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1411
1412     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1413     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1414     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1415     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1416     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1417     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1419     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1420     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1421     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1422
1423     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1424     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1425     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1426     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1427     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1428     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1429
1430     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1431     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1432     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1433     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1434     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1435     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1436     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1437     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1438
1439     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1440     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1441     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1442     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1443     if (Subtarget->is64Bit()) {
1444       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1445       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1446       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1447       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1448     }
1449     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1454     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1455     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1458     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1459     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1460     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1461     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1462     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1463
1464     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1468     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1469     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1470     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1472     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1477
1478     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1484
1485     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1486     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1487
1488     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1489
1490     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1492     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1494     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1496     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1497     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1498     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1499
1500     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1501     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1502
1503     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1504     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1505
1506     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1507
1508     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1509     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1510
1511     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1512     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1513
1514     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1515     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1516
1517     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1518     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1519     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1520     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1521     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1522     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1523
1524     if (Subtarget->hasCDI()) {
1525       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1526       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1527     }
1528
1529     // Custom lower several nodes.
1530     for (MVT VT : MVT::vector_valuetypes()) {
1531       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1532       // Extract subvector is special because the value type
1533       // (result) is 256/128-bit but the source is 512-bit wide.
1534       if (VT.is128BitVector() || VT.is256BitVector()) {
1535         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1536       }
1537       if (VT.getVectorElementType() == MVT::i1)
1538         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1539
1540       // Do not attempt to custom lower other non-512-bit vectors
1541       if (!VT.is512BitVector())
1542         continue;
1543
1544       if ( EltSize >= 32) {
1545         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1546         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1547         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1548         setOperationAction(ISD::VSELECT,             VT, Legal);
1549         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1550         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1551         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1552         setOperationAction(ISD::MLOAD,               VT, Legal);
1553         setOperationAction(ISD::MSTORE,              VT, Legal);
1554       }
1555     }
1556     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1557       MVT VT = (MVT::SimpleValueType)i;
1558
1559       // Do not attempt to promote non-512-bit vectors.
1560       if (!VT.is512BitVector())
1561         continue;
1562
1563       setOperationAction(ISD::SELECT, VT, Promote);
1564       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1565     }
1566   }// has  AVX-512
1567
1568   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1569     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1570     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1571
1572     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1573     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1574
1575     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1576     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1577     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1578     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1579     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1580     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1581     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1582     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1583     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1584
1585     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1586       const MVT VT = (MVT::SimpleValueType)i;
1587
1588       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1589
1590       // Do not attempt to promote non-512-bit vectors.
1591       if (!VT.is512BitVector())
1592         continue;
1593
1594       if (EltSize < 32) {
1595         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1596         setOperationAction(ISD::VSELECT,             VT, Legal);
1597       }
1598     }
1599   }
1600
1601   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1602     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1603     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1604
1605     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1606     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1607     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1608
1609     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1610     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1611     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1612     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1613     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1614     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1615   }
1616
1617   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1618   // of this type with custom code.
1619   for (MVT VT : MVT::vector_valuetypes())
1620     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1621
1622   // We want to custom lower some of our intrinsics.
1623   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1624   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1625   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1626   if (!Subtarget->is64Bit())
1627     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1628
1629   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1630   // handle type legalization for these operations here.
1631   //
1632   // FIXME: We really should do custom legalization for addition and
1633   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1634   // than generic legalization for 64-bit multiplication-with-overflow, though.
1635   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1636     // Add/Sub/Mul with overflow operations are custom lowered.
1637     MVT VT = IntVTs[i];
1638     setOperationAction(ISD::SADDO, VT, Custom);
1639     setOperationAction(ISD::UADDO, VT, Custom);
1640     setOperationAction(ISD::SSUBO, VT, Custom);
1641     setOperationAction(ISD::USUBO, VT, Custom);
1642     setOperationAction(ISD::SMULO, VT, Custom);
1643     setOperationAction(ISD::UMULO, VT, Custom);
1644   }
1645
1646
1647   if (!Subtarget->is64Bit()) {
1648     // These libcalls are not available in 32-bit.
1649     setLibcallName(RTLIB::SHL_I128, nullptr);
1650     setLibcallName(RTLIB::SRL_I128, nullptr);
1651     setLibcallName(RTLIB::SRA_I128, nullptr);
1652   }
1653
1654   // Combine sin / cos into one node or libcall if possible.
1655   if (Subtarget->hasSinCos()) {
1656     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1657     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1658     if (Subtarget->isTargetDarwin()) {
1659       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1660       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1661       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1662       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1663     }
1664   }
1665
1666   if (Subtarget->isTargetWin64()) {
1667     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1668     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1669     setOperationAction(ISD::SREM, MVT::i128, Custom);
1670     setOperationAction(ISD::UREM, MVT::i128, Custom);
1671     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1672     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1673   }
1674
1675   // We have target-specific dag combine patterns for the following nodes:
1676   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1677   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1678   setTargetDAGCombine(ISD::BITCAST);
1679   setTargetDAGCombine(ISD::VSELECT);
1680   setTargetDAGCombine(ISD::SELECT);
1681   setTargetDAGCombine(ISD::SHL);
1682   setTargetDAGCombine(ISD::SRA);
1683   setTargetDAGCombine(ISD::SRL);
1684   setTargetDAGCombine(ISD::OR);
1685   setTargetDAGCombine(ISD::AND);
1686   setTargetDAGCombine(ISD::ADD);
1687   setTargetDAGCombine(ISD::FADD);
1688   setTargetDAGCombine(ISD::FSUB);
1689   setTargetDAGCombine(ISD::FMA);
1690   setTargetDAGCombine(ISD::SUB);
1691   setTargetDAGCombine(ISD::LOAD);
1692   setTargetDAGCombine(ISD::MLOAD);
1693   setTargetDAGCombine(ISD::STORE);
1694   setTargetDAGCombine(ISD::MSTORE);
1695   setTargetDAGCombine(ISD::ZERO_EXTEND);
1696   setTargetDAGCombine(ISD::ANY_EXTEND);
1697   setTargetDAGCombine(ISD::SIGN_EXTEND);
1698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699   setTargetDAGCombine(ISD::TRUNCATE);
1700   setTargetDAGCombine(ISD::SINT_TO_FP);
1701   setTargetDAGCombine(ISD::SETCC);
1702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1703   setTargetDAGCombine(ISD::BUILD_VECTOR);
1704   setTargetDAGCombine(ISD::MUL);
1705   setTargetDAGCombine(ISD::XOR);
1706
1707   computeRegisterProperties();
1708
1709   // On Darwin, -Os means optimize for size without hurting performance,
1710   // do not reduce the limit.
1711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1717   setPrefLoopAlignment(4); // 2^4 bytes.
1718
1719   // Predictable cmov don't hurt on atom because it's in-order.
1720   PredictableSelectIsExpensive = !Subtarget->isAtom();
1721   EnableExtLdPromotion = true;
1722   setPrefFunctionAlignment(4); // 2^4 bytes.
1723
1724   verifyIntrinsicTables();
1725 }
1726
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1730 }
1731
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734   if (ExperimentalVectorWideningLegalization &&
1735       VT.getVectorNumElements() != 1 &&
1736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737     return TypeWidenVector;
1738
1739   return TargetLoweringBase::getPreferredVectorAction(VT);
1740 }
1741
1742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1743   if (!VT.isVector())
1744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1745
1746   const unsigned NumElts = VT.getVectorNumElements();
1747   const EVT EltVT = VT.getVectorElementType();
1748   if (VT.is512BitVector()) {
1749     if (Subtarget->hasAVX512())
1750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1751           EltVT == MVT::f32 || EltVT == MVT::f64)
1752         switch(NumElts) {
1753         case  8: return MVT::v8i1;
1754         case 16: return MVT::v16i1;
1755       }
1756     if (Subtarget->hasBWI())
1757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1758         switch(NumElts) {
1759         case 32: return MVT::v32i1;
1760         case 64: return MVT::v64i1;
1761       }
1762   }
1763
1764   if (VT.is256BitVector() || VT.is128BitVector()) {
1765     if (Subtarget->hasVLX())
1766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1767           EltVT == MVT::f32 || EltVT == MVT::f64)
1768         switch(NumElts) {
1769         case 2: return MVT::v2i1;
1770         case 4: return MVT::v4i1;
1771         case 8: return MVT::v8i1;
1772       }
1773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1775         switch(NumElts) {
1776         case  8: return MVT::v8i1;
1777         case 16: return MVT::v16i1;
1778         case 32: return MVT::v32i1;
1779       }
1780   }
1781
1782   return VT.changeVectorElementTypeToInteger();
1783 }
1784
1785 /// Helper for getByValTypeAlignment to determine
1786 /// the desired ByVal argument alignment.
1787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1788   if (MaxAlign == 16)
1789     return;
1790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1791     if (VTy->getBitWidth() == 128)
1792       MaxAlign = 16;
1793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1794     unsigned EltAlign = 0;
1795     getMaxByValAlign(ATy->getElementType(), EltAlign);
1796     if (EltAlign > MaxAlign)
1797       MaxAlign = EltAlign;
1798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1800       unsigned EltAlign = 0;
1801       getMaxByValAlign(STy->getElementType(i), EltAlign);
1802       if (EltAlign > MaxAlign)
1803         MaxAlign = EltAlign;
1804       if (MaxAlign == 16)
1805         break;
1806     }
1807   }
1808 }
1809
1810 /// Return the desired alignment for ByVal aggregate
1811 /// function arguments in the caller parameter area. For X86, aggregates
1812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1813 /// are at 4-byte boundaries.
1814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1815   if (Subtarget->is64Bit()) {
1816     // Max of 8 and alignment of type.
1817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1818     if (TyAlign > 8)
1819       return TyAlign;
1820     return 8;
1821   }
1822
1823   unsigned Align = 4;
1824   if (Subtarget->hasSSE1())
1825     getMaxByValAlign(Ty, Align);
1826   return Align;
1827 }
1828
1829 /// Returns the target specific optimal type for load
1830 /// and store operations as a result of memset, memcpy, and memmove
1831 /// lowering. If DstAlign is zero that means it's safe to destination
1832 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1833 /// means there isn't a need to check it against alignment requirement,
1834 /// probably because the source does not need to be loaded. If 'IsMemset' is
1835 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1836 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1837 /// source is constant so it does not need to be loaded.
1838 /// It returns EVT::Other if the type should be determined using generic
1839 /// target-independent logic.
1840 EVT
1841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1842                                        unsigned DstAlign, unsigned SrcAlign,
1843                                        bool IsMemset, bool ZeroMemset,
1844                                        bool MemcpyStrSrc,
1845                                        MachineFunction &MF) const {
1846   const Function *F = MF.getFunction();
1847   if ((!IsMemset || ZeroMemset) &&
1848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1849                                        Attribute::NoImplicitFloat)) {
1850     if (Size >= 16 &&
1851         (Subtarget->isUnalignedMemAccessFast() ||
1852          ((DstAlign == 0 || DstAlign >= 16) &&
1853           (SrcAlign == 0 || SrcAlign >= 16)))) {
1854       if (Size >= 32) {
1855         if (Subtarget->hasInt256())
1856           return MVT::v8i32;
1857         if (Subtarget->hasFp256())
1858           return MVT::v8f32;
1859       }
1860       if (Subtarget->hasSSE2())
1861         return MVT::v4i32;
1862       if (Subtarget->hasSSE1())
1863         return MVT::v4f32;
1864     } else if (!MemcpyStrSrc && Size >= 8 &&
1865                !Subtarget->is64Bit() &&
1866                Subtarget->hasSSE2()) {
1867       // Do not use f64 to lower memcpy if source is string constant. It's
1868       // better to use i32 to avoid the loads.
1869       return MVT::f64;
1870     }
1871   }
1872   if (Subtarget->is64Bit() && Size >= 8)
1873     return MVT::i64;
1874   return MVT::i32;
1875 }
1876
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878   if (VT == MVT::f32)
1879     return X86ScalarSSEf32;
1880   else if (VT == MVT::f64)
1881     return X86ScalarSSEf64;
1882   return true;
1883 }
1884
1885 bool
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1887                                                   unsigned,
1888                                                   unsigned,
1889                                                   bool *Fast) const {
1890   if (Fast)
1891     *Fast = Subtarget->isUnalignedMemAccessFast();
1892   return true;
1893 }
1894
1895 /// Return the entry encoding for a jump table in the
1896 /// current function.  The returned value is a member of the
1897 /// MachineJumpTableInfo::JTEntryKind enum.
1898 unsigned X86TargetLowering::getJumpTableEncoding() const {
1899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1900   // symbol.
1901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1902       Subtarget->isPICStyleGOT())
1903     return MachineJumpTableInfo::EK_Custom32;
1904
1905   // Otherwise, use the normal jump table encoding heuristics.
1906   return TargetLowering::getJumpTableEncoding();
1907 }
1908
1909 const MCExpr *
1910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1911                                              const MachineBasicBlock *MBB,
1912                                              unsigned uid,MCContext &Ctx) const{
1913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1914          Subtarget->isPICStyleGOT());
1915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916   // entries.
1917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1919 }
1920
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923                                                     SelectionDAG &DAG) const {
1924   if (!Subtarget->is64Bit())
1925     // This doesn't have SDLoc associated with it, but is not really the
1926     // same as a Register.
1927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1928   return Table;
1929 }
1930
1931 /// This returns the relocation base for the given PIC jumptable,
1932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1933 const MCExpr *X86TargetLowering::
1934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1935                              MCContext &Ctx) const {
1936   // X86-64 uses RIP relative addressing based on the jump table label.
1937   if (Subtarget->isPICStyleRIPRel())
1938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1939
1940   // Otherwise, the reference is relative to the PIC base.
1941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1942 }
1943
1944 // FIXME: Why this routine is here? Move to RegInfo!
1945 std::pair<const TargetRegisterClass*, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1947   const TargetRegisterClass *RRC = nullptr;
1948   uint8_t Cost = 1;
1949   switch (VT.SimpleTy) {
1950   default:
1951     return TargetLowering::findRepresentativeClass(VT);
1952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1954     break;
1955   case MVT::x86mmx:
1956     RRC = &X86::VR64RegClass;
1957     break;
1958   case MVT::f32: case MVT::f64:
1959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1960   case MVT::v4f32: case MVT::v2f64:
1961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1962   case MVT::v4f64:
1963     RRC = &X86::VR128RegClass;
1964     break;
1965   }
1966   return std::make_pair(RRC, Cost);
1967 }
1968
1969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1970                                                unsigned &Offset) const {
1971   if (!Subtarget->isTargetLinux())
1972     return false;
1973
1974   if (Subtarget->is64Bit()) {
1975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1976     Offset = 0x28;
1977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1978       AddressSpace = 256;
1979     else
1980       AddressSpace = 257;
1981   } else {
1982     // %gs:0x14 on i386
1983     Offset = 0x14;
1984     AddressSpace = 256;
1985   }
1986   return true;
1987 }
1988
1989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1990                                             unsigned DestAS) const {
1991   assert(SrcAS != DestAS && "Expected different address spaces!");
1992
1993   return SrcAS < 256 && DestAS < 256;
1994 }
1995
1996 //===----------------------------------------------------------------------===//
1997 //               Return Value Calling Convention Implementation
1998 //===----------------------------------------------------------------------===//
1999
2000 #include "X86GenCallingConv.inc"
2001
2002 bool
2003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2004                                   MachineFunction &MF, bool isVarArg,
2005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
2006                         LLVMContext &Context) const {
2007   SmallVector<CCValAssign, 16> RVLocs;
2008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2009   return CCInfo.CheckReturn(Outs, RetCC_X86);
2010 }
2011
2012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2014   return ScratchRegs;
2015 }
2016
2017 SDValue
2018 X86TargetLowering::LowerReturn(SDValue Chain,
2019                                CallingConv::ID CallConv, bool isVarArg,
2020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2021                                const SmallVectorImpl<SDValue> &OutVals,
2022                                SDLoc dl, SelectionDAG &DAG) const {
2023   MachineFunction &MF = DAG.getMachineFunction();
2024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2025
2026   SmallVector<CCValAssign, 16> RVLocs;
2027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2029
2030   SDValue Flag;
2031   SmallVector<SDValue, 6> RetOps;
2032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2033   // Operand #1 = Bytes To Pop
2034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2035                    MVT::i16));
2036
2037   // Copy the result values into the output registers.
2038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2039     CCValAssign &VA = RVLocs[i];
2040     assert(VA.isRegLoc() && "Can only return in registers!");
2041     SDValue ValToCopy = OutVals[i];
2042     EVT ValVT = ValToCopy.getValueType();
2043
2044     // Promote values to the appropriate types.
2045     if (VA.getLocInfo() == CCValAssign::SExt)
2046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2047     else if (VA.getLocInfo() == CCValAssign::ZExt)
2048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2049     else if (VA.getLocInfo() == CCValAssign::AExt)
2050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2051     else if (VA.getLocInfo() == CCValAssign::BCvt)
2052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2053
2054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2055            "Unexpected FP-extend for return value.");
2056
2057     // If this is x86-64, and we disabled SSE, we can't return FP values,
2058     // or SSE or MMX vectors.
2059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2062       report_fatal_error("SSE register return with SSE disabled");
2063     }
2064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2065     // llvm-gcc has never done it right and no one has noticed, so this
2066     // should be OK for now.
2067     if (ValVT == MVT::f64 &&
2068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2069       report_fatal_error("SSE2 register return with SSE2 disabled");
2070
2071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2072     // the RET instruction and handled by the FP Stackifier.
2073     if (VA.getLocReg() == X86::FP0 ||
2074         VA.getLocReg() == X86::FP1) {
2075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2076       // change the value to the FP stack register class.
2077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2079       RetOps.push_back(ValToCopy);
2080       // Don't emit a copytoreg.
2081       continue;
2082     }
2083
2084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2085     // which is returned in RAX / RDX.
2086     if (Subtarget->is64Bit()) {
2087       if (ValVT == MVT::x86mmx) {
2088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2091                                   ValToCopy);
2092           // If we don't have SSE2 available, convert to v4f32 so the generated
2093           // register is legal.
2094           if (!Subtarget->hasSSE2())
2095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2096         }
2097       }
2098     }
2099
2100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2101     Flag = Chain.getValue(1);
2102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2103   }
2104
2105   // The x86-64 ABIs require that for returning structs by value we copy
2106   // the sret argument into %rax/%eax (depending on ABI) for the return.
2107   // Win32 requires us to put the sret argument to %eax as well.
2108   // We saved the argument into a virtual register in the entry block,
2109   // so now we copy the value out and into %rax/%eax.
2110   //
2111   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2112   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2113   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2114   // either case FuncInfo->setSRetReturnReg() will have been called.
2115   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2116     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
2117            "No need for an sret register");
2118     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
2119
2120     unsigned RetValReg
2121         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2122           X86::RAX : X86::EAX;
2123     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2124     Flag = Chain.getValue(1);
2125
2126     // RAX/EAX now acts like a return value.
2127     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2128   }
2129
2130   RetOps[0] = Chain;  // Update chain.
2131
2132   // Add the flag if we have it.
2133   if (Flag.getNode())
2134     RetOps.push_back(Flag);
2135
2136   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2137 }
2138
2139 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2140   if (N->getNumValues() != 1)
2141     return false;
2142   if (!N->hasNUsesOfValue(1, 0))
2143     return false;
2144
2145   SDValue TCChain = Chain;
2146   SDNode *Copy = *N->use_begin();
2147   if (Copy->getOpcode() == ISD::CopyToReg) {
2148     // If the copy has a glue operand, we conservatively assume it isn't safe to
2149     // perform a tail call.
2150     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2151       return false;
2152     TCChain = Copy->getOperand(0);
2153   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2154     return false;
2155
2156   bool HasRet = false;
2157   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2158        UI != UE; ++UI) {
2159     if (UI->getOpcode() != X86ISD::RET_FLAG)
2160       return false;
2161     // If we are returning more than one value, we can definitely
2162     // not make a tail call see PR19530
2163     if (UI->getNumOperands() > 4)
2164       return false;
2165     if (UI->getNumOperands() == 4 &&
2166         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2167       return false;
2168     HasRet = true;
2169   }
2170
2171   if (!HasRet)
2172     return false;
2173
2174   Chain = TCChain;
2175   return true;
2176 }
2177
2178 EVT
2179 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2180                                             ISD::NodeType ExtendKind) const {
2181   MVT ReturnMVT;
2182   // TODO: Is this also valid on 32-bit?
2183   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2184     ReturnMVT = MVT::i8;
2185   else
2186     ReturnMVT = MVT::i32;
2187
2188   EVT MinVT = getRegisterType(Context, ReturnMVT);
2189   return VT.bitsLT(MinVT) ? MinVT : VT;
2190 }
2191
2192 /// Lower the result values of a call into the
2193 /// appropriate copies out of appropriate physical registers.
2194 ///
2195 SDValue
2196 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2197                                    CallingConv::ID CallConv, bool isVarArg,
2198                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2199                                    SDLoc dl, SelectionDAG &DAG,
2200                                    SmallVectorImpl<SDValue> &InVals) const {
2201
2202   // Assign locations to each value returned by this call.
2203   SmallVector<CCValAssign, 16> RVLocs;
2204   bool Is64Bit = Subtarget->is64Bit();
2205   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2206                  *DAG.getContext());
2207   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2208
2209   // Copy all of the result registers out of their specified physreg.
2210   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2211     CCValAssign &VA = RVLocs[i];
2212     EVT CopyVT = VA.getValVT();
2213
2214     // If this is x86-64, and we disabled SSE, we can't return FP values
2215     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2216         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2217       report_fatal_error("SSE register return with SSE disabled");
2218     }
2219
2220     // If we prefer to use the value in xmm registers, copy it out as f80 and
2221     // use a truncate to move it from fp stack reg to xmm reg.
2222     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2223         isScalarFPTypeInSSEReg(VA.getValVT()))
2224       CopyVT = MVT::f80;
2225
2226     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2227                                CopyVT, InFlag).getValue(1);
2228     SDValue Val = Chain.getValue(0);
2229
2230     if (CopyVT != VA.getValVT())
2231       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2232                         // This truncation won't change the value.
2233                         DAG.getIntPtrConstant(1));
2234
2235     InFlag = Chain.getValue(2);
2236     InVals.push_back(Val);
2237   }
2238
2239   return Chain;
2240 }
2241
2242 //===----------------------------------------------------------------------===//
2243 //                C & StdCall & Fast Calling Convention implementation
2244 //===----------------------------------------------------------------------===//
//  The StdCall calling convention appears to be the standard for many Windows
//  API routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack. Symbols are also decorated in
//  a special way. It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2251
2252 /// CallIsStructReturn - Determines whether a call uses struct return
2253 /// semantics.
2254 enum StructReturnType {
2255   NotStructReturn,
2256   RegStructReturn,
2257   StackStructReturn
2258 };
2259 static StructReturnType
2260 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2261   if (Outs.empty())
2262     return NotStructReturn;
2263
2264   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2265   if (!Flags.isSRet())
2266     return NotStructReturn;
2267   if (Flags.isInReg())
2268     return RegStructReturn;
2269   return StackStructReturn;
2270 }
2271
2272 /// Determines whether a function uses struct return semantics.
2273 static StructReturnType
2274 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2275   if (Ins.empty())
2276     return NotStructReturn;
2277
2278   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2279   if (!Flags.isSRet())
2280     return NotStructReturn;
2281   if (Flags.isInReg())
2282     return RegStructReturn;
2283   return StackStructReturn;
2284 }
2285
2286 /// Make a copy of an aggregate at address specified by "Src" to address
2287 /// "Dst" with size and alignment information specified by the specific
2288 /// parameter attribute. The copy will be passed as a byval function parameter.
2289 static SDValue
2290 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2291                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2292                           SDLoc dl) {
2293   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2294
2295   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2296                        /*isVolatile*/false, /*AlwaysInline=*/true,
2297                        MachinePointerInfo(), MachinePointerInfo());
2298 }
2299
2300 /// Return true if the calling convention is one that
2301 /// supports tail call optimization.
2302 static bool IsTailCallConvention(CallingConv::ID CC) {
2303   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2304           CC == CallingConv::HiPE);
2305 }
2306
2307 /// \brief Return true if the calling convention is a C calling convention.
2308 static bool IsCCallConvention(CallingConv::ID CC) {
2309   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2310           CC == CallingConv::X86_64_SysV);
2311 }
2312
2313 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2314   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2315     return false;
2316
2317   CallSite CS(CI);
2318   CallingConv::ID CalleeCC = CS.getCallingConv();
2319   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2320     return false;
2321
2322   return true;
2323 }
2324
2325 /// Return true if the function is being made into
2326 /// a tailcall target by changing its ABI.
2327 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2328                                    bool GuaranteedTailCallOpt) {
2329   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2330 }
2331
2332 SDValue
2333 X86TargetLowering::LowerMemArgument(SDValue Chain,
2334                                     CallingConv::ID CallConv,
2335                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2336                                     SDLoc dl, SelectionDAG &DAG,
2337                                     const CCValAssign &VA,
2338                                     MachineFrameInfo *MFI,
2339                                     unsigned i) const {
2340   // Create the nodes corresponding to a load from this parameter slot.
2341   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2342   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2343       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2344   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2345   EVT ValVT;
2346
2347   // If value is passed by pointer we have address passed instead of the value
2348   // itself.
2349   if (VA.getLocInfo() == CCValAssign::Indirect)
2350     ValVT = VA.getLocVT();
2351   else
2352     ValVT = VA.getValVT();
2353
2354   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2355   // changed with more analysis.
2356   // In case of tail call optimization mark all arguments mutable. Since they
2357   // could be overwritten by lowering of arguments in case of a tail call.
2358   if (Flags.isByVal()) {
2359     unsigned Bytes = Flags.getByValSize();
2360     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2361     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2362     return DAG.getFrameIndex(FI, getPointerTy());
2363   } else {
2364     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2365                                     VA.getLocMemOffset(), isImmutable);
2366     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2367     return DAG.getLoad(ValVT, dl, Chain, FIN,
2368                        MachinePointerInfo::getFixedStack(FI),
2369                        false, false, false, 0);
2370   }
2371 }
2372
2373 // FIXME: Get this from tablegen.
2374 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2375                                                 const X86Subtarget *Subtarget) {
2376   assert(Subtarget->is64Bit());
2377
2378   if (Subtarget->isCallingConvWin64(CallConv)) {
2379     static const MCPhysReg GPR64ArgRegsWin64[] = {
2380       X86::RCX, X86::RDX, X86::R8,  X86::R9
2381     };
2382     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2383   }
2384
2385   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2386     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2387   };
2388   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2389 }
2390
2391 // FIXME: Get this from tablegen.
2392 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2393                                                 CallingConv::ID CallConv,
2394                                                 const X86Subtarget *Subtarget) {
2395   assert(Subtarget->is64Bit());
2396   if (Subtarget->isCallingConvWin64(CallConv)) {
2397     // The XMM registers which might contain var arg parameters are shadowed
2398     // in their paired GPR.  So we only need to save the GPR to their home
2399     // slots.
2400     // TODO: __vectorcall will change this.
2401     return None;
2402   }
2403
2404   const Function *Fn = MF.getFunction();
2405   bool NoImplicitFloatOps = Fn->getAttributes().
2406       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2407   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2408          "SSE register cannot be used when SSE is disabled!");
2409   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2410       !Subtarget->hasSSE1())
2411     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2412     // registers.
2413     return None;
2414
2415   static const MCPhysReg XMMArgRegs64Bit[] = {
2416     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2417     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2418   };
2419   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2420 }
2421
2422 SDValue
2423 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2424                                         CallingConv::ID CallConv,
2425                                         bool isVarArg,
2426                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2427                                         SDLoc dl,
2428                                         SelectionDAG &DAG,
2429                                         SmallVectorImpl<SDValue> &InVals)
2430                                           const {
2431   MachineFunction &MF = DAG.getMachineFunction();
2432   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2433
2434   const Function* Fn = MF.getFunction();
2435   if (Fn->hasExternalLinkage() &&
2436       Subtarget->isTargetCygMing() &&
2437       Fn->getName() == "main")
2438     FuncInfo->setForceFramePointer(true);
2439
2440   MachineFrameInfo *MFI = MF.getFrameInfo();
2441   bool Is64Bit = Subtarget->is64Bit();
2442   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2443
2444   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2445          "Var args not supported with calling convention fastcc, ghc or hipe");
2446
2447   // Assign locations to all of the incoming arguments.
2448   SmallVector<CCValAssign, 16> ArgLocs;
2449   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2450
2451   // Allocate shadow area for Win64
2452   if (IsWin64)
2453     CCInfo.AllocateStack(32, 8);
2454
2455   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2456
2457   unsigned LastVal = ~0U;
2458   SDValue ArgValue;
2459   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2460     CCValAssign &VA = ArgLocs[i];
2461     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2462     // places.
2463     assert(VA.getValNo() != LastVal &&
2464            "Don't support value assigned to multiple locs yet");
2465     (void)LastVal;
2466     LastVal = VA.getValNo();
2467
2468     if (VA.isRegLoc()) {
2469       EVT RegVT = VA.getLocVT();
2470       const TargetRegisterClass *RC;
2471       if (RegVT == MVT::i32)
2472         RC = &X86::GR32RegClass;
2473       else if (Is64Bit && RegVT == MVT::i64)
2474         RC = &X86::GR64RegClass;
2475       else if (RegVT == MVT::f32)
2476         RC = &X86::FR32RegClass;
2477       else if (RegVT == MVT::f64)
2478         RC = &X86::FR64RegClass;
2479       else if (RegVT.is512BitVector())
2480         RC = &X86::VR512RegClass;
2481       else if (RegVT.is256BitVector())
2482         RC = &X86::VR256RegClass;
2483       else if (RegVT.is128BitVector())
2484         RC = &X86::VR128RegClass;
2485       else if (RegVT == MVT::x86mmx)
2486         RC = &X86::VR64RegClass;
2487       else if (RegVT == MVT::i1)
2488         RC = &X86::VK1RegClass;
2489       else if (RegVT == MVT::v8i1)
2490         RC = &X86::VK8RegClass;
2491       else if (RegVT == MVT::v16i1)
2492         RC = &X86::VK16RegClass;
2493       else if (RegVT == MVT::v32i1)
2494         RC = &X86::VK32RegClass;
2495       else if (RegVT == MVT::v64i1)
2496         RC = &X86::VK64RegClass;
2497       else
2498         llvm_unreachable("Unknown argument type!");
2499
2500       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2501       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2502
2503       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2504       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2505       // right size.
2506       if (VA.getLocInfo() == CCValAssign::SExt)
2507         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2508                                DAG.getValueType(VA.getValVT()));
2509       else if (VA.getLocInfo() == CCValAssign::ZExt)
2510         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2511                                DAG.getValueType(VA.getValVT()));
2512       else if (VA.getLocInfo() == CCValAssign::BCvt)
2513         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2514
2515       if (VA.isExtInLoc()) {
2516         // Handle MMX values passed in XMM regs.
2517         if (RegVT.isVector())
2518           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2519         else
2520           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2521       }
2522     } else {
2523       assert(VA.isMemLoc());
2524       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2525     }
2526
2527     // If value is passed via pointer - do a load.
2528     if (VA.getLocInfo() == CCValAssign::Indirect)
2529       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2530                              MachinePointerInfo(), false, false, false, 0);
2531
2532     InVals.push_back(ArgValue);
2533   }
2534
2535   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2536     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2537       // The x86-64 ABIs require that for returning structs by value we copy
2538       // the sret argument into %rax/%eax (depending on ABI) for the return.
2539       // Win32 requires us to put the sret argument to %eax as well.
2540       // Save the argument into a virtual register so that we can access it
2541       // from the return points.
2542       if (Ins[i].Flags.isSRet()) {
2543         unsigned Reg = FuncInfo->getSRetReturnReg();
2544         if (!Reg) {
2545           MVT PtrTy = getPointerTy();
2546           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2547           FuncInfo->setSRetReturnReg(Reg);
2548         }
2549         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2550         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2551         break;
2552       }
2553     }
2554   }
2555
2556   unsigned StackSize = CCInfo.getNextStackOffset();
2557   // Align stack specially for tail calls.
2558   if (FuncIsMadeTailCallSafe(CallConv,
2559                              MF.getTarget().Options.GuaranteedTailCallOpt))
2560     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2561
2562   // If the function takes variable number of arguments, make a frame index for
2563   // the start of the first vararg value... for expansion of llvm.va_start. We
2564   // can skip this if there are no va_start calls.
2565   if (MFI->hasVAStart() &&
2566       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2567                    CallConv != CallingConv::X86_ThisCall))) {
2568     FuncInfo->setVarArgsFrameIndex(
2569         MFI->CreateFixedObject(1, StackSize, true));
2570   }
2571
2572   // Figure out if XMM registers are in use.
2573   assert(!(MF.getTarget().Options.UseSoftFloat &&
2574            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2575                                             Attribute::NoImplicitFloat)) &&
2576          "SSE register cannot be used when SSE is disabled!");
2577
2578   // 64-bit calling conventions support varargs and register parameters, so we
2579   // have to do extra work to spill them in the prologue.
2580   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2581     // Find the first unallocated argument registers.
2582     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2583     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2584     unsigned NumIntRegs =
2585         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2586     unsigned NumXMMRegs =
2587         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2588     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2589            "SSE register cannot be used when SSE is disabled!");
2590
2591     // Gather all the live in physical registers.
2592     SmallVector<SDValue, 6> LiveGPRs;
2593     SmallVector<SDValue, 8> LiveXMMRegs;
2594     SDValue ALVal;
2595     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2596       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2597       LiveGPRs.push_back(
2598           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2599     }
2600     if (!ArgXMMs.empty()) {
2601       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2602       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2603       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2604         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2605         LiveXMMRegs.push_back(
2606             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2607       }
2608     }
2609
2610     if (IsWin64) {
2611       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
2612       // Get to the caller-allocated home save location.  Add 8 to account
2613       // for the return address.
2614       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2615       FuncInfo->setRegSaveFrameIndex(
2616           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2617       // Fixup to set vararg frame on shadow area (4 x i64).
2618       if (NumIntRegs < 4)
2619         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2620     } else {
2621       // For X86-64, if there are vararg parameters that are passed via
2622       // registers, then we must store them to their spots on the stack so
2623       // they may be loaded by deferencing the result of va_next.
2624       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2625       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2626       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2627           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2628     }
2629
2630     // Store the integer parameter registers.
2631     SmallVector<SDValue, 8> MemOps;
2632     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2633                                       getPointerTy());
2634     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2635     for (SDValue Val : LiveGPRs) {
2636       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2637                                 DAG.getIntPtrConstant(Offset));
2638       SDValue Store =
2639         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2640                      MachinePointerInfo::getFixedStack(
2641                        FuncInfo->getRegSaveFrameIndex(), Offset),
2642                      false, false, 0);
2643       MemOps.push_back(Store);
2644       Offset += 8;
2645     }
2646
2647     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2648       // Now store the XMM (fp + vector) parameter registers.
2649       SmallVector<SDValue, 12> SaveXMMOps;
2650       SaveXMMOps.push_back(Chain);
2651       SaveXMMOps.push_back(ALVal);
2652       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2653                              FuncInfo->getRegSaveFrameIndex()));
2654       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2655                              FuncInfo->getVarArgsFPOffset()));
2656       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2657                         LiveXMMRegs.end());
2658       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2659                                    MVT::Other, SaveXMMOps));
2660     }
2661
2662     if (!MemOps.empty())
2663       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2664   }
2665
2666   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2667     // Find the largest legal vector type.
2668     MVT VecVT = MVT::Other;
2669     // FIXME: Only some x86_32 calling conventions support AVX512.
2670     if (Subtarget->hasAVX512() &&
2671         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2672                      CallConv == CallingConv::Intel_OCL_BI)))
2673       VecVT = MVT::v16f32;
2674     else if (Subtarget->hasAVX())
2675       VecVT = MVT::v8f32;
2676     else if (Subtarget->hasSSE2())
2677       VecVT = MVT::v4f32;
2678
2679     // We forward some GPRs and some vector types.
2680     SmallVector<MVT, 2> RegParmTypes;
2681     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2682     RegParmTypes.push_back(IntVT);
2683     if (VecVT != MVT::Other)
2684       RegParmTypes.push_back(VecVT);
2685
2686     // Compute the set of forwarded registers. The rest are scratch.
2687     SmallVectorImpl<ForwardedRegister> &Forwards =
2688         FuncInfo->getForwardedMustTailRegParms();
2689     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2690
2691     // Conservatively forward AL on x86_64, since it might be used for varargs.
2692     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2693       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2694       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2695     }
2696
2697     // Copy all forwards from physical to virtual registers.
2698     for (ForwardedRegister &F : Forwards) {
2699       // FIXME: Can we use a less constrained schedule?
2700       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2701       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2702       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2703     }
2704   }
2705
2706   // Some CCs need callee pop.
2707   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2708                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2709     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2710   } else {
2711     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2712     // If this is an sret function, the return should pop the hidden pointer.
2713     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2714         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2715         argsAreStructReturn(Ins) == StackStructReturn)
2716       FuncInfo->setBytesToPopOnReturn(4);
2717   }
2718
2719   if (!Is64Bit) {
2720     // RegSaveFrameIndex is X86-64 only.
2721     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2722     if (CallConv == CallingConv::X86_FastCall ||
2723         CallConv == CallingConv::X86_ThisCall)
2724       // fastcc functions can't have varargs.
2725       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2726   }
2727
2728   FuncInfo->setArgumentStackSize(StackSize);
2729
2730   return Chain;
2731 }
2732
2733 SDValue
2734 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2735                                     SDValue StackPtr, SDValue Arg,
2736                                     SDLoc dl, SelectionDAG &DAG,
2737                                     const CCValAssign &VA,
2738                                     ISD::ArgFlagsTy Flags) const {
2739   unsigned LocMemOffset = VA.getLocMemOffset();
2740   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2741   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2742   if (Flags.isByVal())
2743     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2744
2745   return DAG.getStore(Chain, dl, Arg, PtrOff,
2746                       MachinePointerInfo::getStack(LocMemOffset),
2747                       false, false, 0);
2748 }
2749
2750 /// Emit a load of return address if tail call
2751 /// optimization is performed and it is required.
2752 SDValue
2753 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2754                                            SDValue &OutRetAddr, SDValue Chain,
2755                                            bool IsTailCall, bool Is64Bit,
2756                                            int FPDiff, SDLoc dl) const {
2757   // Adjust the Return address stack slot.
2758   EVT VT = getPointerTy();
2759   OutRetAddr = getReturnAddressFrameIndex(DAG);
2760
2761   // Load the "old" Return address.
2762   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2763                            false, false, false, 0);
2764   return SDValue(OutRetAddr.getNode(), 1);
2765 }
2766
2767 /// Emit a store of the return address if tail call
2768 /// optimization is performed and it is required (FPDiff!=0).
2769 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2770                                         SDValue Chain, SDValue RetAddrFrIdx,
2771                                         EVT PtrVT, unsigned SlotSize,
2772                                         int FPDiff, SDLoc dl) {
2773   // Store the return address to the appropriate stack slot.
2774   if (!FPDiff) return Chain;
2775   // Calculate the new stack slot for the return address.
2776   int NewReturnAddrFI =
2777     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2778                                          false);
2779   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2780   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2781                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2782                        false, false, 0);
2783   return Chain;
2784 }
2785
2786 SDValue
2787 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2788                              SmallVectorImpl<SDValue> &InVals) const {
2789   SelectionDAG &DAG                     = CLI.DAG;
2790   SDLoc &dl                             = CLI.DL;
2791   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2792   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2793   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2794   SDValue Chain                         = CLI.Chain;
2795   SDValue Callee                        = CLI.Callee;
2796   CallingConv::ID CallConv              = CLI.CallConv;
2797   bool &isTailCall                      = CLI.IsTailCall;
2798   bool isVarArg                         = CLI.IsVarArg;
2799
2800   MachineFunction &MF = DAG.getMachineFunction();
2801   bool Is64Bit        = Subtarget->is64Bit();
2802   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2803   StructReturnType SR = callIsStructReturn(Outs);
2804   bool IsSibcall      = false;
2805   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2806
2807   if (MF.getTarget().Options.DisableTailCalls)
2808     isTailCall = false;
2809
2810   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2811   if (IsMustTail) {
2812     // Force this to be a tail call.  The verifier rules are enough to ensure
2813     // that we can lower this successfully without moving the return address
2814     // around.
2815     isTailCall = true;
2816   } else if (isTailCall) {
2817     // Check if it's really possible to do a tail call.
2818     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2819                     isVarArg, SR != NotStructReturn,
2820                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2821                     Outs, OutVals, Ins, DAG);
2822
2823     // Sibcalls are automatically detected tailcalls which do not require
2824     // ABI changes.
2825     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2826       IsSibcall = true;
2827
2828     if (isTailCall)
2829       ++NumTailCalls;
2830   }
2831
2832   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2833          "Var args not supported with calling convention fastcc, ghc or hipe");
2834
2835   // Analyze operands of the call, assigning locations to each operand.
2836   SmallVector<CCValAssign, 16> ArgLocs;
2837   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2838
2839   // Allocate shadow area for Win64
2840   if (IsWin64)
2841     CCInfo.AllocateStack(32, 8);
2842
2843   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2844
2845   // Get a count of how many bytes are to be pushed on the stack.
2846   unsigned NumBytes = CCInfo.getNextStackOffset();
2847   if (IsSibcall)
2848     // This is a sibcall. The memory operands are available in caller's
2849     // own caller's stack.
2850     NumBytes = 0;
2851   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2852            IsTailCallConvention(CallConv))
2853     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2854
2855   int FPDiff = 0;
2856   if (isTailCall && !IsSibcall && !IsMustTail) {
2857     // Lower arguments at fp - stackoffset + fpdiff.
2858     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2859
2860     FPDiff = NumBytesCallerPushed - NumBytes;
2861
2862     // Set the delta of movement of the returnaddr stackslot.
2863     // But only set if delta is greater than previous delta.
2864     if (FPDiff < X86Info->getTCReturnAddrDelta())
2865       X86Info->setTCReturnAddrDelta(FPDiff);
2866   }
2867
2868   unsigned NumBytesToPush = NumBytes;
2869   unsigned NumBytesToPop = NumBytes;
2870
2871   // If we have an inalloca argument, all stack space has already been allocated
2872   // for us and be right at the top of the stack.  We don't support multiple
2873   // arguments passed in memory when using inalloca.
2874   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2875     NumBytesToPush = 0;
2876     if (!ArgLocs.back().isMemLoc())
2877       report_fatal_error("cannot use inalloca attribute on a register "
2878                          "parameter");
2879     if (ArgLocs.back().getLocMemOffset() != 0)
2880       report_fatal_error("any parameter with the inalloca attribute must be "
2881                          "the only memory argument");
2882   }
2883
2884   if (!IsSibcall)
2885     Chain = DAG.getCALLSEQ_START(
2886         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2887
2888   SDValue RetAddrFrIdx;
2889   // Load return address for tail calls.
2890   if (isTailCall && FPDiff)
2891     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2892                                     Is64Bit, FPDiff, dl);
2893
2894   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2895   SmallVector<SDValue, 8> MemOpChains;
2896   SDValue StackPtr;
2897
2898   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2899   // of tail call optimization arguments are handle later.
2900   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2901   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2902     // Skip inalloca arguments, they have already been written.
2903     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2904     if (Flags.isInAlloca())
2905       continue;
2906
2907     CCValAssign &VA = ArgLocs[i];
2908     EVT RegVT = VA.getLocVT();
2909     SDValue Arg = OutVals[i];
2910     bool isByVal = Flags.isByVal();
2911
2912     // Promote the value if needed.
2913     switch (VA.getLocInfo()) {
2914     default: llvm_unreachable("Unknown loc info!");
2915     case CCValAssign::Full: break;
2916     case CCValAssign::SExt:
2917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2918       break;
2919     case CCValAssign::ZExt:
2920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2921       break;
2922     case CCValAssign::AExt:
2923       if (RegVT.is128BitVector()) {
2924         // Special case: passing MMX values in XMM registers.
2925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2928       } else
2929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2930       break;
2931     case CCValAssign::BCvt:
2932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2933       break;
2934     case CCValAssign::Indirect: {
2935       // Store the argument.
2936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2939                            MachinePointerInfo::getFixedStack(FI),
2940                            false, false, 0);
2941       Arg = SpillSlot;
2942       break;
2943     }
2944     }
2945
2946     if (VA.isRegLoc()) {
2947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2948       if (isVarArg && IsWin64) {
2949         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2950         // shadow reg if callee is a varargs function.
2951         unsigned ShadowReg = 0;
2952         switch (VA.getLocReg()) {
2953         case X86::XMM0: ShadowReg = X86::RCX; break;
2954         case X86::XMM1: ShadowReg = X86::RDX; break;
2955         case X86::XMM2: ShadowReg = X86::R8; break;
2956         case X86::XMM3: ShadowReg = X86::R9; break;
2957         }
2958         if (ShadowReg)
2959           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2960       }
2961     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2962       assert(VA.isMemLoc());
2963       if (!StackPtr.getNode())
2964         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2965                                       getPointerTy());
2966       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2967                                              dl, DAG, VA, Flags));
2968     }
2969   }
2970
2971   if (!MemOpChains.empty())
2972     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2973
2974   if (Subtarget->isPICStyleGOT()) {
2975     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2976     // GOT pointer.
2977     if (!isTailCall) {
2978       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2979                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2980     } else {
2981       // If we are tail calling and generating PIC/GOT style code load the
2982       // address of the callee into ECX. The value in ecx is used as target of
2983       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2984       // for tail calls on PIC/GOT architectures. Normally we would just put the
2985       // address of GOT into ebx and then call target@PLT. But for tail calls
2986       // ebx would be restored (since ebx is callee saved) before jumping to the
2987       // target@PLT.
2988
2989       // Note: The actual moving to ECX is done further down.
2990       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2991       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2992           !G->getGlobal()->hasProtectedVisibility())
2993         Callee = LowerGlobalAddress(Callee, DAG);
2994       else if (isa<ExternalSymbolSDNode>(Callee))
2995         Callee = LowerExternalSymbol(Callee, DAG);
2996     }
2997   }
2998
2999   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3000     // From AMD64 ABI document:
3001     // For calls that may call functions that use varargs or stdargs
3002     // (prototype-less calls or calls to functions containing ellipsis (...) in
3003     // the declaration) %al is used as hidden argument to specify the number
3004     // of SSE registers used. The contents of %al do not need to match exactly
3005     // the number of registers, but must be an ubound on the number of SSE
3006     // registers used and is in the range 0 - 8 inclusive.
3007
3008     // Count the number of XMM registers allocated.
3009     static const MCPhysReg XMMArgRegs[] = {
3010       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3011       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3012     };
3013     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3014     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3015            && "SSE registers cannot be used when SSE is disabled");
3016
3017     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3018                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3019   }
3020
3021   if (isVarArg && IsMustTail) {
3022     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3023     for (const auto &F : Forwards) {
3024       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3025       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3026     }
3027   }
3028
3029   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3030   // don't need this because the eligibility check rejects calls that require
3031   // shuffling arguments passed in memory.
3032   if (!IsSibcall && isTailCall) {
3033     // Force all the incoming stack arguments to be loaded from the stack
3034     // before any new outgoing arguments are stored to the stack, because the
3035     // outgoing stack slots may alias the incoming argument stack slots, and
3036     // the alias isn't otherwise explicit. This is slightly more conservative
3037     // than necessary, because it means that each store effectively depends
3038     // on every argument instead of just those arguments it would clobber.
3039     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3040
3041     SmallVector<SDValue, 8> MemOpChains2;
3042     SDValue FIN;
3043     int FI = 0;
3044     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3045       CCValAssign &VA = ArgLocs[i];
3046       if (VA.isRegLoc())
3047         continue;
3048       assert(VA.isMemLoc());
3049       SDValue Arg = OutVals[i];
3050       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3051       // Skip inalloca arguments.  They don't require any work.
3052       if (Flags.isInAlloca())
3053         continue;
3054       // Create frame index.
3055       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3056       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3057       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3058       FIN = DAG.getFrameIndex(FI, getPointerTy());
3059
3060       if (Flags.isByVal()) {
3061         // Copy relative to framepointer.
3062         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3063         if (!StackPtr.getNode())
3064           StackPtr = DAG.getCopyFromReg(Chain, dl,
3065                                         RegInfo->getStackRegister(),
3066                                         getPointerTy());
3067         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3068
3069         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3070                                                          ArgChain,
3071                                                          Flags, DAG, dl));
3072       } else {
3073         // Store relative to framepointer.
3074         MemOpChains2.push_back(
3075           DAG.getStore(ArgChain, dl, Arg, FIN,
3076                        MachinePointerInfo::getFixedStack(FI),
3077                        false, false, 0));
3078       }
3079     }
3080
3081     if (!MemOpChains2.empty())
3082       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3083
3084     // Store the return address to the appropriate stack slot.
3085     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3086                                      getPointerTy(), RegInfo->getSlotSize(),
3087                                      FPDiff, dl);
3088   }
3089
3090   // Build a sequence of copy-to-reg nodes chained together with token chain
3091   // and flag operands which copy the outgoing args into registers.
3092   SDValue InFlag;
3093   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3094     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3095                              RegsToPass[i].second, InFlag);
3096     InFlag = Chain.getValue(1);
3097   }
3098
3099   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3100     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3101     // In the 64-bit large code model, we have to make all calls
3102     // through a register, since the call instruction's 32-bit
3103     // pc-relative offset may not be large enough to hold the whole
3104     // address.
3105   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3106     // If the callee is a GlobalAddress node (quite common, every direct call
3107     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3108     // it.
3109     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3110
3111     // We should use extra load for direct calls to dllimported functions in
3112     // non-JIT mode.
3113     const GlobalValue *GV = G->getGlobal();
3114     if (!GV->hasDLLImportStorageClass()) {
3115       unsigned char OpFlags = 0;
3116       bool ExtraLoad = false;
3117       unsigned WrapperKind = ISD::DELETED_NODE;
3118
3119       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3120       // external symbols most go through the PLT in PIC mode.  If the symbol
3121       // has hidden or protected visibility, or if it is static or local, then
3122       // we don't need to use the PLT - we can directly call it.
3123       if (Subtarget->isTargetELF() &&
3124           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3125           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3126         OpFlags = X86II::MO_PLT;
3127       } else if (Subtarget->isPICStyleStubAny() &&
3128                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3129                  (!Subtarget->getTargetTriple().isMacOSX() ||
3130                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3131         // PC-relative references to external symbols should go through $stub,
3132         // unless we're building with the leopard linker or later, which
3133         // automatically synthesizes these stubs.
3134         OpFlags = X86II::MO_DARWIN_STUB;
3135       } else if (Subtarget->isPICStyleRIPRel() &&
3136                  isa<Function>(GV) &&
3137                  cast<Function>(GV)->getAttributes().
3138                    hasAttribute(AttributeSet::FunctionIndex,
3139                                 Attribute::NonLazyBind)) {
3140         // If the function is marked as non-lazy, generate an indirect call
3141         // which loads from the GOT directly. This avoids runtime overhead
3142         // at the cost of eager binding (and one extra byte of encoding).
3143         OpFlags = X86II::MO_GOTPCREL;
3144         WrapperKind = X86ISD::WrapperRIP;
3145         ExtraLoad = true;
3146       }
3147
3148       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3149                                           G->getOffset(), OpFlags);
3150
3151       // Add a wrapper if needed.
3152       if (WrapperKind != ISD::DELETED_NODE)
3153         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3154       // Add extra indirection if needed.
3155       if (ExtraLoad)
3156         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3157                              MachinePointerInfo::getGOT(),
3158                              false, false, false, 0);
3159     }
3160   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3161     unsigned char OpFlags = 0;
3162
3163     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3164     // external symbols should go through the PLT.
3165     if (Subtarget->isTargetELF() &&
3166         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3167       OpFlags = X86II::MO_PLT;
3168     } else if (Subtarget->isPICStyleStubAny() &&
3169                (!Subtarget->getTargetTriple().isMacOSX() ||
3170                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3171       // PC-relative references to external symbols should go through $stub,
3172       // unless we're building with the leopard linker or later, which
3173       // automatically synthesizes these stubs.
3174       OpFlags = X86II::MO_DARWIN_STUB;
3175     }
3176
3177     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3178                                          OpFlags);
3179   } else if (Subtarget->isTarget64BitILP32() &&
3180              Callee->getValueType(0) == MVT::i32) {
3181     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3182     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3183   }
3184
3185   // Returns a chain & a flag for retval copy to use.
3186   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3187   SmallVector<SDValue, 8> Ops;
3188
3189   if (!IsSibcall && isTailCall) {
3190     Chain = DAG.getCALLSEQ_END(Chain,
3191                                DAG.getIntPtrConstant(NumBytesToPop, true),
3192                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3193     InFlag = Chain.getValue(1);
3194   }
3195
3196   Ops.push_back(Chain);
3197   Ops.push_back(Callee);
3198
3199   if (isTailCall)
3200     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3201
3202   // Add argument registers to the end of the list so that they are known live
3203   // into the call.
3204   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3205     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3206                                   RegsToPass[i].second.getValueType()));
3207
3208   // Add a register mask operand representing the call-preserved registers.
3209   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
3210   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3211   assert(Mask && "Missing call preserved mask for calling convention");
3212   Ops.push_back(DAG.getRegisterMask(Mask));
3213
3214   if (InFlag.getNode())
3215     Ops.push_back(InFlag);
3216
3217   if (isTailCall) {
3218     // We used to do:
3219     //// If this is the first return lowered for this function, add the regs
3220     //// to the liveout set for the function.
3221     // This isn't right, although it's probably harmless on x86; liveouts
3222     // should be computed from returns not tail calls.  Consider a void
3223     // function making a tail call to a function returning int.
3224     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3225   }
3226
3227   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3228   InFlag = Chain.getValue(1);
3229
3230   // Create the CALLSEQ_END node.
3231   unsigned NumBytesForCalleeToPop;
3232   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3233                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3234     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3235   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3236            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3237            SR == StackStructReturn)
3238     // If this is a call to a struct-return function, the callee
3239     // pops the hidden struct pointer, so we have to push it back.
3240     // This is common for Darwin/X86, Linux & Mingw32 targets.
3241     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3242     NumBytesForCalleeToPop = 4;
3243   else
3244     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3245
3246   // Returns a flag for retval copy to use.
3247   if (!IsSibcall) {
3248     Chain = DAG.getCALLSEQ_END(Chain,
3249                                DAG.getIntPtrConstant(NumBytesToPop, true),
3250                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3251                                                      true),
3252                                InFlag, dl);
3253     InFlag = Chain.getValue(1);
3254   }
3255
3256   // Handle result values, copying them out of physregs into vregs that we
3257   // return.
3258   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3259                          Ins, dl, DAG, InVals);
3260 }
3261
3262 //===----------------------------------------------------------------------===//
3263 //                Fast Calling Convention (tail call) implementation
3264 //===----------------------------------------------------------------------===//
3265
//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the tail called function address. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
3270 //                * tailcallopt is enabled
3271 //                * caller/callee are fastcc
3272 //  On X86_64 architecture with GOT-style position independent code only local
3273 //  (within module) calls are supported at the moment.
3274 //  To keep the stack aligned according to platform abi the function
3275 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3276 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3277 //  If a tail called function callee has more arguments than the caller the
3278 //  caller needs to make sure that there is room to move the RETADDR to. This is
3279 //  achieved by reserving an area the size of the argument delta right after the
3280 //  original RETADDR, but before the saved framepointer or the spilled registers
3281 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3282 //  stack layout:
3283 //    arg1
3284 //    arg2
3285 //    RETADDR
3286 //    [ new RETADDR
3287 //      move area ]
3288 //    (possible EBP)
3289 //    ESI
3290 //    EDI
3291 //    local1 ..
3292
3293 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3294 /// for a 16 byte align requirement.
3295 unsigned
3296 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3297                                                SelectionDAG& DAG) const {
3298   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3299   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
3300   unsigned StackAlignment = TFI.getStackAlignment();
3301   uint64_t AlignMask = StackAlignment - 1;
3302   int64_t Offset = StackSize;
3303   unsigned SlotSize = RegInfo->getSlotSize();
3304   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3305     // Number smaller than 12 so just add the difference.
3306     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3307   } else {
3308     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3309     Offset = ((~AlignMask) & Offset) + StackAlignment +
3310       (StackAlignment-SlotSize);
3311   }
3312   return Offset;
3313 }
3314
3315 /// MatchingStackOffset - Return true if the given stack call argument is
3316 /// already available in the same position (relatively) of the caller's
3317 /// incoming argument stack.
3318 static
3319 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3320                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3321                          const X86InstrInfo *TII) {
3322   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3323   int FI = INT_MAX;
3324   if (Arg.getOpcode() == ISD::CopyFromReg) {
3325     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3326     if (!TargetRegisterInfo::isVirtualRegister(VR))
3327       return false;
3328     MachineInstr *Def = MRI->getVRegDef(VR);
3329     if (!Def)
3330       return false;
3331     if (!Flags.isByVal()) {
3332       if (!TII->isLoadFromStackSlot(Def, FI))
3333         return false;
3334     } else {
3335       unsigned Opcode = Def->getOpcode();
3336       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3337            Opcode == X86::LEA64_32r) &&
3338           Def->getOperand(1).isFI()) {
3339         FI = Def->getOperand(1).getIndex();
3340         Bytes = Flags.getByValSize();
3341       } else
3342         return false;
3343     }
3344   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3345     if (Flags.isByVal())
3346       // ByVal argument is passed in as a pointer but it's now being
3347       // dereferenced. e.g.
3348       // define @foo(%struct.X* %A) {
3349       //   tail call @bar(%struct.X* byval %A)
3350       // }
3351       return false;
3352     SDValue Ptr = Ld->getBasePtr();
3353     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3354     if (!FINode)
3355       return false;
3356     FI = FINode->getIndex();
3357   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3358     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3359     FI = FINode->getIndex();
3360     Bytes = Flags.getByValSize();
3361   } else
3362     return false;
3363
3364   assert(FI != INT_MAX);
3365   if (!MFI->isFixedObjectIndex(FI))
3366     return false;
3367   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3368 }
3369
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// With GuaranteedTailCallOpt set, the call is eligible exactly when the
/// callee uses a tail-call convention that matches the caller's convention.
/// Otherwise the call is only accepted as a sibcall, which requires every
/// conservative check in the body below to pass.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only tail-call conventions and the C calling convention are considered.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  // Look up the calling function; its return type and calling convention are
  // compared against the callee's below.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // If guaranteed tail call optimization is requested, the decision depends
  // only on the calling conventions: the callee must use a tail-call
  // convention and the caller must use the same one.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: reject the sibcall if any result is
    // assigned to FP0 or FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Result locations under the callee's convention.
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // Result locations under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Both assignments must agree element by element: same kind of location,
    // same extension info, and same register or memory offset.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // The remaining checks concern outgoing arguments only, so they are skipped
  // when the callee takes no arguments.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      // Some argument is passed on the stack. Note that this non-const MF
      // shadows the const MF declared at the top of the function.
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count the arguments assigned to EAX/EDX/ECX and give up as soon as
      // the count reaches MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3573
/// createFastISel - Return the result of X86::createFastISel for the given
/// function lowering info and target library info; this hook only forwards
/// its arguments.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
3579
3580 //===----------------------------------------------------------------------===//
3581 //                           Other Lowering Hooks
3582 //===----------------------------------------------------------------------===//
3583
3584 static bool MayFoldLoad(SDValue Op) {
3585   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3586 }
3587
3588 static bool MayFoldIntoStore(SDValue Op) {
3589   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3590 }
3591
3592 static bool isTargetShuffle(unsigned Opcode) {
3593   switch(Opcode) {
3594   default: return false;
3595   case X86ISD::BLENDI:
3596   case X86ISD::PSHUFB:
3597   case X86ISD::PSHUFD:
3598   case X86ISD::PSHUFHW:
3599   case X86ISD::PSHUFLW:
3600   case X86ISD::SHUFP:
3601   case X86ISD::PALIGNR:
3602   case X86ISD::MOVLHPS:
3603   case X86ISD::MOVLHPD:
3604   case X86ISD::MOVHLPS:
3605   case X86ISD::MOVLPS:
3606   case X86ISD::MOVLPD:
3607   case X86ISD::MOVSHDUP:
3608   case X86ISD::MOVSLDUP:
3609   case X86ISD::MOVDDUP:
3610   case X86ISD::MOVSS:
3611   case X86ISD::MOVSD:
3612   case X86ISD::UNPCKL:
3613   case X86ISD::UNPCKH:
3614   case X86ISD::VPERMILPI:
3615   case X86ISD::VPERM2X128:
3616   case X86ISD::VPERMI:
3617     return true;
3618   }
3619 }
3620
3621 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3622                                     SDValue V1, SelectionDAG &DAG) {
3623   switch(Opc) {
3624   default: llvm_unreachable("Unknown x86 shuffle node");
3625   case X86ISD::MOVSHDUP:
3626   case X86ISD::MOVSLDUP:
3627   case X86ISD::MOVDDUP:
3628     return DAG.getNode(Opc, dl, VT, V1);
3629   }
3630 }
3631
3632 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3633                                     SDValue V1, unsigned TargetMask,
3634                                     SelectionDAG &DAG) {
3635   switch(Opc) {
3636   default: llvm_unreachable("Unknown x86 shuffle node");
3637   case X86ISD::PSHUFD:
3638   case X86ISD::PSHUFHW:
3639   case X86ISD::PSHUFLW:
3640   case X86ISD::VPERMILPI:
3641   case X86ISD::VPERMI:
3642     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3643   }
3644 }
3645
3646 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3647                                     SDValue V1, SDValue V2, unsigned TargetMask,
3648                                     SelectionDAG &DAG) {
3649   switch(Opc) {
3650   default: llvm_unreachable("Unknown x86 shuffle node");
3651   case X86ISD::PALIGNR:
3652   case X86ISD::VALIGN:
3653   case X86ISD::SHUFP:
3654   case X86ISD::VPERM2X128:
3655     return DAG.getNode(Opc, dl, VT, V1, V2,
3656                        DAG.getConstant(TargetMask, MVT::i8));
3657   }
3658 }
3659
3660 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3661                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3662   switch(Opc) {
3663   default: llvm_unreachable("Unknown x86 shuffle node");
3664   case X86ISD::MOVLHPS:
3665   case X86ISD::MOVLHPD:
3666   case X86ISD::MOVHLPS:
3667   case X86ISD::MOVLPS:
3668   case X86ISD::MOVLPD:
3669   case X86ISD::MOVSS:
3670   case X86ISD::MOVSD:
3671   case X86ISD::UNPCKL:
3672   case X86ISD::UNPCKH:
3673     return DAG.getNode(Opc, dl, VT, V1, V2);
3674   }
3675 }
3676
3677 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3678   MachineFunction &MF = DAG.getMachineFunction();
3679   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
3680   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3681   int ReturnAddrIndex = FuncInfo->getRAIndex();
3682
3683   if (ReturnAddrIndex == 0) {
3684     // Set up a frame object for the return address.
3685     unsigned SlotSize = RegInfo->getSlotSize();
3686     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3687                                                            -(int64_t)SlotSize,
3688                                                            false);
3689     FuncInfo->setRAIndex(ReturnAddrIndex);
3690   }
3691
3692   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3693 }
3694
3695 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3696                                        bool hasSymbolicDisplacement) {
3697   // Offset should fit into 32 bit immediate field.
3698   if (!isInt<32>(Offset))
3699     return false;
3700
3701   // If we don't have a symbolic displacement - we don't have any extra
3702   // restrictions.
3703   if (!hasSymbolicDisplacement)
3704     return true;
3705
3706   // FIXME: Some tweaks might be needed for medium code model.
3707   if (M != CodeModel::Small && M != CodeModel::Kernel)
3708     return false;
3709
3710   // For small code model we assume that latest object is 16MB before end of 31
3711   // bits boundary. We may also accept pretty large negative constants knowing
3712   // that all objects are in the positive half of address space.
3713   if (M == CodeModel::Small && Offset < 16*1024*1024)
3714     return true;
3715
3716   // For kernel code model we know that all object resist in the negative half
3717   // of 32bits address space. We may not accept negative offsets, since they may
3718   // be just off and we may accept pretty large positive ones.
3719   if (M == CodeModel::Kernel && Offset >= 0)
3720     return true;
3721
3722   return false;
3723 }
3724
3725 /// isCalleePop - Determines whether the callee is required to pop its
3726 /// own arguments. Callee pop is necessary to support tail calls.
3727 bool X86::isCalleePop(CallingConv::ID CallingConv,
3728                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3729   switch (CallingConv) {
3730   default:
3731     return false;
3732   case CallingConv::X86_StdCall:
3733   case CallingConv::X86_FastCall:
3734   case CallingConv::X86_ThisCall:
3735     return !is64Bit;
3736   case CallingConv::Fast:
3737   case CallingConv::GHC:
3738   case CallingConv::HiPE:
3739     if (IsVarArg)
3740       return false;
3741     return TailCallOpt;
3742   }
3743 }
3744
3745 /// \brief Return true if the condition is an unsigned comparison operation.
3746 static bool isX86CCUnsigned(unsigned X86CC) {
3747   switch (X86CC) {
3748   default: llvm_unreachable("Invalid integer condition!");
3749   case X86::COND_E:     return true;
3750   case X86::COND_G:     return false;
3751   case X86::COND_GE:    return false;
3752   case X86::COND_L:     return false;
3753   case X86::COND_LE:    return false;
3754   case X86::COND_NE:    return true;
3755   case X86::COND_B:     return true;
3756   case X86::COND_A:     return true;
3757   case X86::COND_BE:    return true;
3758   case X86::COND_AE:    return true;
3759   }
3760   llvm_unreachable("covered switch fell through?!");
3761 }
3762
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// LHS and RHS are in/out parameters: for integer compares RHS may be
/// replaced by a zero constant, and for floating point compares the two
/// operands may be swapped. For floating point SETOEQ and SETUNE the result
/// is X86::COND_INVALID.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    // Integer compares against certain constants are rewritten first.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> compare X with 0, jump on !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> compare X with 0 (RHS is already 0), jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    // Plain one to one mapping for the remaining integer conditions.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // Floating point compares from here on.
  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These four conditions are handled by swapping the operands and then using
  // the condition code of the mirrored comparison (the cases marked "flipped"
  // in the switch below).
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
3850
3851 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3852 /// code. Current x86 isa includes the following FP cmov instructions:
3853 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3854 static bool hasFPCMov(unsigned X86CC) {
3855   switch (X86CC) {
3856   default:
3857     return false;
3858   case X86::COND_B:
3859   case X86::COND_BE:
3860   case X86::COND_E:
3861   case X86::COND_P:
3862   case X86::COND_A:
3863   case X86::COND_AE:
3864   case X86::COND_NE:
3865   case X86::COND_NP:
3866     return true;
3867   }
3868 }
3869
3870 /// isFPImmLegal - Returns true if the target can instruction select the
3871 /// specified FP immediate natively. If false, the legalizer will
3872 /// materialize the FP immediate as a load from a constant pool.
3873 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3874   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3875     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3876       return true;
3877   }
3878   return false;
3879 }
3880
3881 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3882                                               ISD::LoadExtType ExtTy,
3883                                               EVT NewVT) const {
3884   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3885   // relocation target a movq or addq instruction: don't let the load shrink.
3886   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3887   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3888     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3889       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3890   return true;
3891 }
3892
3893 /// \brief Returns true if it is beneficial to convert a load of a constant
3894 /// to just the constant itself.
3895 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3896                                                           Type *Ty) const {
3897   assert(Ty->isIntegerTy());
3898
3899   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3900   if (BitSize == 0 || BitSize > 64)
3901     return false;
3902   return true;
3903 }
3904
3905 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3906                                                 unsigned Index) const {
3907   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3908     return false;
3909
3910   return (Index == 0 || Index == ResVT.getVectorNumElements());
3911 }
3912
/// \brief Report whether cttz is cheap to speculate; the answer is the
/// subtarget's hasBMI() result.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget->hasBMI();
}
3917
/// \brief Report whether ctlz is cheap to speculate; the answer is the
/// subtarget's hasLZCNT() result.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget->hasLZCNT();
}
3922
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val < 0)
    return true;
  return Low <= Val && Val < Hi;
}
3928
/// isUndefOrEqual - Return true if Val is either less than zero (undef) or
/// equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3934
3935 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3936 /// from position Pos and ending in Pos+Size, falls within the specified
3937 /// sequential range (Low, Low+Size]. or is undef.
3938 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3939                                        unsigned Pos, unsigned Size, int Low) {
3940   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3941     if (!isUndefOrEqual(Mask[i], Low))
3942       return false;
3943   return true;
3944 }
3945
3946 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3947 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3948 /// operand - by default will match for first operand.
3949 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3950                          bool TestSecondOperand = false) {
3951   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3952       VT != MVT::v2f64 && VT != MVT::v2i64)
3953     return false;
3954
3955   unsigned NumElems = VT.getVectorNumElements();
3956   unsigned Lo = TestSecondOperand ? NumElems : 0;
3957   unsigned Hi = Lo + NumElems;
3958
3959   for (unsigned i = 0; i < NumElems; ++i)
3960     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3961       return false;
3962
3963   return true;
3964 }
3965
3966 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3967 /// is suitable for input to PSHUFHW.
3968 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3969   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3970     return false;
3971
3972   // Lower quadword copied in order or undef.
3973   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3974     return false;
3975
3976   // Upper quadword shuffled.
3977   for (unsigned i = 4; i != 8; ++i)
3978     if (!isUndefOrInRange(Mask[i], 4, 8))
3979       return false;
3980
3981   if (VT == MVT::v16i16) {
3982     // Lower quadword copied in order or undef.
3983     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3984       return false;
3985
3986     // Upper quadword shuffled.
3987     for (unsigned i = 12; i != 16; ++i)
3988       if (!isUndefOrInRange(Mask[i], 12, 16))
3989         return false;
3990   }
3991
3992   return true;
3993 }
3994
3995 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3996 /// is suitable for input to PSHUFLW.
3997 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3998   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3999     return false;
4000
4001   // Upper quadword copied in order.
4002   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
4003     return false;
4004
4005   // Lower quadword shuffled.
4006   for (unsigned i = 0; i != 4; ++i)
4007     if (!isUndefOrInRange(Mask[i], 0, 4))
4008       return false;
4009
4010   if (VT == MVT::v16i16) {
4011     // Upper quadword copied in order.
4012     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4013       return false;
4014
4015     // Lower quadword shuffled.
4016     for (unsigned i = 8; i != 12; ++i)
4017       if (!isUndefOrInRange(Mask[i], 8, 12))
4018         return false;
4019   }
4020
4021   return true;
4022 }
4023
/// \brief Return true if the mask specifies a shuffle of elements that is
/// suitable for input to intralane (palignr) or interlane (valign) vector
/// right-shift.
///
/// \param Mask      shuffle mask; negative entries are undef.
/// \param VT        vector type being shuffled.
/// \param InterLane if true the whole vector is treated as a single lane,
///                  otherwise each 128-bit chunk is checked as its own lane.
static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  // l is the index of the first element of the lane being checked.
  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first defined (non-undef) element of this lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    // Rebase Start so that position j of this lane is expected to hold
    // Start + j.
    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      // Apply the same second-source correction as was applied to Start.
      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}
4092
4093 /// \brief Return true if the node specifies a shuffle of elements that is
4094 /// suitable for input to PALIGNR.
4095 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4096                           const X86Subtarget *Subtarget) {
4097   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4098       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4099       VT.is512BitVector())
4100     // FIXME: Add AVX512BW.
4101     return false;
4102
4103   return isAlignrMask(Mask, VT, false);
4104 }
4105
4106 /// \brief Return true if the node specifies a shuffle of elements that is
4107 /// suitable for input to VALIGN.
4108 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4109                           const X86Subtarget *Subtarget) {
4110   // FIXME: Add AVX512VL.
4111   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4112     return false;
4113   return isAlignrMask(Mask, VT, true);
4114 }
4115
4116 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4117 /// the two vector operands have swapped position.
4118 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4119                                      unsigned NumElems) {
4120   for (unsigned i = 0; i != NumElems; ++i) {
4121     int idx = Mask[i];
4122     if (idx < 0)
4123       continue;
4124     else if (idx < (int)NumElems)
4125       Mask[i] = idx + NumElems;
4126     else
4127       Mask[i] = idx - NumElems;
4128   }
4129 }
4130
4131 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4132 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4133 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4134 /// reverse of what x86 shuffles want.
4135 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4136
4137   unsigned NumElems = VT.getVectorNumElements();
4138   unsigned NumLanes = VT.getSizeInBits()/128;
4139   unsigned NumLaneElems = NumElems/NumLanes;
4140
4141   if (NumLaneElems != 2 && NumLaneElems != 4)
4142     return false;
4143
4144   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4145   bool symetricMaskRequired =
4146     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4147
4148   // VSHUFPSY divides the resulting vector into 4 chunks.
4149   // The sources are also splitted into 4 chunks, and each destination
4150   // chunk must come from a different source chunk.
4151   //
4152   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4153   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4154   //
4155   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4156   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4157   //
4158   // VSHUFPDY divides the resulting vector into 4 chunks.
4159   // The sources are also splitted into 4 chunks, and each destination
4160   // chunk must come from a different source chunk.
4161   //
4162   //  SRC1 =>      X3       X2       X1       X0
4163   //  SRC2 =>      Y3       Y2       Y1       Y0
4164   //
4165   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4166   //
4167   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4168   unsigned HalfLaneElems = NumLaneElems/2;
4169   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4170     for (unsigned i = 0; i != NumLaneElems; ++i) {
4171       int Idx = Mask[i+l];
4172       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4173       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4174         return false;
4175       // For VSHUFPSY, the mask of the second half must be the same as the
4176       // first but with the appropriate offsets. This works in the same way as
4177       // VPERMILPS works with masks.
4178       if (!symetricMaskRequired || Idx < 0)
4179         continue;
4180       if (MaskVal[i] < 0) {
4181         MaskVal[i] = Idx - l;
4182         continue;
4183       }
4184       if ((signed)(Idx - l) != MaskVal[i])
4185         return false;
4186     }
4187   }
4188
4189   return true;
4190 }
4191
4192 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4193 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4194 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4195   if (!VT.is128BitVector())
4196     return false;
4197
4198   unsigned NumElems = VT.getVectorNumElements();
4199
4200   if (NumElems != 4)
4201     return false;
4202
4203   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4204   return isUndefOrEqual(Mask[0], 6) &&
4205          isUndefOrEqual(Mask[1], 7) &&
4206          isUndefOrEqual(Mask[2], 2) &&
4207          isUndefOrEqual(Mask[3], 3);
4208 }
4209
4210 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4211 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4212 /// <2, 3, 2, 3>
4213 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4214   if (!VT.is128BitVector())
4215     return false;
4216
4217   unsigned NumElems = VT.getVectorNumElements();
4218
4219   if (NumElems != 4)
4220     return false;
4221
4222   return isUndefOrEqual(Mask[0], 2) &&
4223          isUndefOrEqual(Mask[1], 3) &&
4224          isUndefOrEqual(Mask[2], 2) &&
4225          isUndefOrEqual(Mask[3], 3);
4226 }
4227
4228 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4229 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4230 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4231   if (!VT.is128BitVector())
4232     return false;
4233
4234   unsigned NumElems = VT.getVectorNumElements();
4235
4236   if (NumElems != 2 && NumElems != 4)
4237     return false;
4238
4239   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4240     if (!isUndefOrEqual(Mask[i], i + NumElems))
4241       return false;
4242
4243   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4244     if (!isUndefOrEqual(Mask[i], i))
4245       return false;
4246
4247   return true;
4248 }
4249
4250 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4251 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4252 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4253   if (!VT.is128BitVector())
4254     return false;
4255
4256   unsigned NumElems = VT.getVectorNumElements();
4257
4258   if (NumElems != 2 && NumElems != 4)
4259     return false;
4260
4261   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4262     if (!isUndefOrEqual(Mask[i], i))
4263       return false;
4264
4265   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4266     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4267       return false;
4268
4269   return true;
4270 }
4271
4272 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4273 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4274 /// i. e: If all but one element come from the same vector.
4275 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4276   // TODO: Deal with AVX's VINSERTPS
4277   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4278     return false;
4279
4280   unsigned CorrectPosV1 = 0;
4281   unsigned CorrectPosV2 = 0;
4282   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4283     if (Mask[i] == -1) {
4284       ++CorrectPosV1;
4285       ++CorrectPosV2;
4286       continue;
4287     }
4288
4289     if (Mask[i] == i)
4290       ++CorrectPosV1;
4291     else if (Mask[i] == i + 4)
4292       ++CorrectPosV2;
4293   }
4294
4295   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4296     // We have 3 elements (undefs count as elements from any vector) from one
4297     // vector, and one from another.
4298     return true;
4299
4300   return false;
4301 }
4302
4303 //
4304 // Some special combinations that can be optimized.
4305 //
4306 static
4307 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4308                                SelectionDAG &DAG) {
4309   MVT VT = SVOp->getSimpleValueType(0);
4310   SDLoc dl(SVOp);
4311
4312   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4313     return SDValue();
4314
4315   ArrayRef<int> Mask = SVOp->getMask();
4316
4317   // These are the special masks that may be optimized.
4318   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4319   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4320   bool MatchEvenMask = true;
4321   bool MatchOddMask  = true;
4322   for (int i=0; i<8; ++i) {
4323     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4324       MatchEvenMask = false;
4325     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4326       MatchOddMask = false;
4327   }
4328
4329   if (!MatchEvenMask && !MatchOddMask)
4330     return SDValue();
4331
4332   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4333
4334   SDValue Op0 = SVOp->getOperand(0);
4335   SDValue Op1 = SVOp->getOperand(1);
4336
4337   if (MatchEvenMask) {
4338     // Shift the second operand right to 32 bits.
4339     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4340     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4341   } else {
4342     // Shift the first operand left to 32 bits.
4343     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4344     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4345   }
4346   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4347   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4348 }
4349
4350 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4351 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4352 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4353                          bool HasInt256, bool V2IsSplat = false) {
4354
4355   assert(VT.getSizeInBits() >= 128 &&
4356          "Unsupported vector type for unpckl");
4357
4358   unsigned NumElts = VT.getVectorNumElements();
4359   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4360       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4361     return false;
4362
4363   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4364          "Unsupported vector type for unpckh");
4365
4366   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4367   unsigned NumLanes = VT.getSizeInBits()/128;
4368   unsigned NumLaneElts = NumElts/NumLanes;
4369
4370   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4371     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4372       int BitI  = Mask[l+i];
4373       int BitI1 = Mask[l+i+1];
4374       if (!isUndefOrEqual(BitI, j))
4375         return false;
4376       if (V2IsSplat) {
4377         if (!isUndefOrEqual(BitI1, NumElts))
4378           return false;
4379       } else {
4380         if (!isUndefOrEqual(BitI1, j + NumElts))
4381           return false;
4382       }
4383     }
4384   }
4385
4386   return true;
4387 }
4388
4389 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4390 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4391 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4392                          bool HasInt256, bool V2IsSplat = false) {
4393   assert(VT.getSizeInBits() >= 128 &&
4394          "Unsupported vector type for unpckh");
4395
4396   unsigned NumElts = VT.getVectorNumElements();
4397   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4398       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4399     return false;
4400
4401   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4402          "Unsupported vector type for unpckh");
4403
4404   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4405   unsigned NumLanes = VT.getSizeInBits()/128;
4406   unsigned NumLaneElts = NumElts/NumLanes;
4407
4408   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4409     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4410       int BitI  = Mask[l+i];
4411       int BitI1 = Mask[l+i+1];
4412       if (!isUndefOrEqual(BitI, j))
4413         return false;
4414       if (V2IsSplat) {
4415         if (isUndefOrEqual(BitI1, NumElts))
4416           return false;
4417       } else {
4418         if (!isUndefOrEqual(BitI1, j+NumElts))
4419           return false;
4420       }
4421     }
4422   }
4423   return true;
4424 }
4425
4426 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4427 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4428 /// <0, 0, 1, 1>
4429 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4430   unsigned NumElts = VT.getVectorNumElements();
4431   bool Is256BitVec = VT.is256BitVector();
4432
4433   if (VT.is512BitVector())
4434     return false;
4435   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4436          "Unsupported vector type for unpckh");
4437
4438   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4439       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4440     return false;
4441
4442   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4443   // FIXME: Need a better way to get rid of this, there's no latency difference
4444   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4445   // the former later. We should also remove the "_undef" special mask.
4446   if (NumElts == 4 && Is256BitVec)
4447     return false;
4448
4449   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4450   // independently on 128-bit lanes.
4451   unsigned NumLanes = VT.getSizeInBits()/128;
4452   unsigned NumLaneElts = NumElts/NumLanes;
4453
4454   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4455     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4456       int BitI  = Mask[l+i];
4457       int BitI1 = Mask[l+i+1];
4458
4459       if (!isUndefOrEqual(BitI, j))
4460         return false;
4461       if (!isUndefOrEqual(BitI1, j))
4462         return false;
4463     }
4464   }
4465
4466   return true;
4467 }
4468
4469 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4470 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4471 /// <2, 2, 3, 3>
4472 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4473   unsigned NumElts = VT.getVectorNumElements();
4474
4475   if (VT.is512BitVector())
4476     return false;
4477
4478   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4479          "Unsupported vector type for unpckh");
4480
4481   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4482       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4483     return false;
4484
4485   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4486   // independently on 128-bit lanes.
4487   unsigned NumLanes = VT.getSizeInBits()/128;
4488   unsigned NumLaneElts = NumElts/NumLanes;
4489
4490   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4491     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4492       int BitI  = Mask[l+i];
4493       int BitI1 = Mask[l+i+1];
4494       if (!isUndefOrEqual(BitI, j))
4495         return false;
4496       if (!isUndefOrEqual(BitI1, j))
4497         return false;
4498     }
4499   }
4500   return true;
4501 }
4502
4503 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4504 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4505 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4506   if (!VT.is512BitVector())
4507     return false;
4508
4509   unsigned NumElts = VT.getVectorNumElements();
4510   unsigned HalfSize = NumElts/2;
4511   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4512     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4513       *Imm = 1;
4514       return true;
4515     }
4516   }
4517   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4518     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4519       *Imm = 0;
4520       return true;
4521     }
4522   }
4523   return false;
4524 }
4525
4526 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4527 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4528 /// MOVSD, and MOVD, i.e. setting the lowest element.
4529 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4530   if (VT.getVectorElementType().getSizeInBits() < 32)
4531     return false;
4532   if (!VT.is128BitVector())
4533     return false;
4534
4535   unsigned NumElts = VT.getVectorNumElements();
4536
4537   if (!isUndefOrEqual(Mask[0], NumElts))
4538     return false;
4539
4540   for (unsigned i = 1; i != NumElts; ++i)
4541     if (!isUndefOrEqual(Mask[i], i))
4542       return false;
4543
4544   return true;
4545 }
4546
4547 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4548 /// as permutations between 128-bit chunks or halves. As an example: this
4549 /// shuffle bellow:
4550 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4551 /// The first half comes from the second half of V1 and the second half from the
4552 /// the second half of V2.
4553 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4554   if (!HasFp256 || !VT.is256BitVector())
4555     return false;
4556
4557   // The shuffle result is divided into half A and half B. In total the two
4558   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4559   // B must come from C, D, E or F.
4560   unsigned HalfSize = VT.getVectorNumElements()/2;
4561   bool MatchA = false, MatchB = false;
4562
4563   // Check if A comes from one of C, D, E, F.
4564   for (unsigned Half = 0; Half != 4; ++Half) {
4565     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4566       MatchA = true;
4567       break;
4568     }
4569   }
4570
4571   // Check if B comes from one of C, D, E, F.
4572   for (unsigned Half = 0; Half != 4; ++Half) {
4573     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4574       MatchB = true;
4575       break;
4576     }
4577   }
4578
4579   return MatchA && MatchB;
4580 }
4581
4582 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4583 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4584 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4585   MVT VT = SVOp->getSimpleValueType(0);
4586
4587   unsigned HalfSize = VT.getVectorNumElements()/2;
4588
4589   unsigned FstHalf = 0, SndHalf = 0;
4590   for (unsigned i = 0; i < HalfSize; ++i) {
4591     if (SVOp->getMaskElt(i) > 0) {
4592       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4593       break;
4594     }
4595   }
4596   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4597     if (SVOp->getMaskElt(i) > 0) {
4598       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4599       break;
4600     }
4601   }
4602
4603   return (FstHalf | (SndHalf << 4));
4604 }
4605
4606 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4607 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4608   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4609   if (EltSize < 32)
4610     return false;
4611
4612   unsigned NumElts = VT.getVectorNumElements();
4613   Imm8 = 0;
4614   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4615     for (unsigned i = 0; i != NumElts; ++i) {
4616       if (Mask[i] < 0)
4617         continue;
4618       Imm8 |= Mask[i] << (i*2);
4619     }
4620     return true;
4621   }
4622
4623   unsigned LaneSize = 4;
4624   SmallVector<int, 4> MaskVal(LaneSize, -1);
4625
4626   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4627     for (unsigned i = 0; i != LaneSize; ++i) {
4628       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4629         return false;
4630       if (Mask[i+l] < 0)
4631         continue;
4632       if (MaskVal[i] < 0) {
4633         MaskVal[i] = Mask[i+l] - l;
4634         Imm8 |= MaskVal[i] << (i*2);
4635         continue;
4636       }
4637       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4638         return false;
4639     }
4640   }
4641   return true;
4642 }
4643
4644 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4645 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4646 /// Note that VPERMIL mask matching is different depending whether theunderlying
4647 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4648 /// to the same elements of the low, but to the higher half of the source.
4649 /// In VPERMILPD the two lanes could be shuffled independently of each other
4650 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4651 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4652   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4653   if (VT.getSizeInBits() < 256 || EltSize < 32)
4654     return false;
4655   bool symetricMaskRequired = (EltSize == 32);
4656   unsigned NumElts = VT.getVectorNumElements();
4657
4658   unsigned NumLanes = VT.getSizeInBits()/128;
4659   unsigned LaneSize = NumElts/NumLanes;
4660   // 2 or 4 elements in one lane
4661
4662   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4663   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4664     for (unsigned i = 0; i != LaneSize; ++i) {
4665       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4666         return false;
4667       if (symetricMaskRequired) {
4668         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4669           ExpectedMaskVal[i] = Mask[i+l] - l;
4670           continue;
4671         }
4672         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4673           return false;
4674       }
4675     }
4676   }
4677   return true;
4678 }
4679
4680 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4681 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4682 /// element of vector 2 and the other elements to come from vector 1 in order.
4683 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4684                                bool V2IsSplat = false, bool V2IsUndef = false) {
4685   if (!VT.is128BitVector())
4686     return false;
4687
4688   unsigned NumOps = VT.getVectorNumElements();
4689   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4690     return false;
4691
4692   if (!isUndefOrEqual(Mask[0], 0))
4693     return false;
4694
4695   for (unsigned i = 1; i != NumOps; ++i)
4696     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4697           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4698           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4699       return false;
4700
4701   return true;
4702 }
4703
4704 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4705 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4706 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4707 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4708                            const X86Subtarget *Subtarget) {
4709   if (!Subtarget->hasSSE3())
4710     return false;
4711
4712   unsigned NumElems = VT.getVectorNumElements();
4713
4714   if ((VT.is128BitVector() && NumElems != 4) ||
4715       (VT.is256BitVector() && NumElems != 8) ||
4716       (VT.is512BitVector() && NumElems != 16))
4717     return false;
4718
4719   // "i+1" is the value the indexed mask element must have
4720   for (unsigned i = 0; i != NumElems; i += 2)
4721     if (!isUndefOrEqual(Mask[i], i+1) ||
4722         !isUndefOrEqual(Mask[i+1], i+1))
4723       return false;
4724
4725   return true;
4726 }
4727
4728 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4729 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4730 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4731 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4732                            const X86Subtarget *Subtarget) {
4733   if (!Subtarget->hasSSE3())
4734     return false;
4735
4736   unsigned NumElems = VT.getVectorNumElements();
4737
4738   if ((VT.is128BitVector() && NumElems != 4) ||
4739       (VT.is256BitVector() && NumElems != 8) ||
4740       (VT.is512BitVector() && NumElems != 16))
4741     return false;
4742
4743   // "i" is the value the indexed mask element must have
4744   for (unsigned i = 0; i != NumElems; i += 2)
4745     if (!isUndefOrEqual(Mask[i], i) ||
4746         !isUndefOrEqual(Mask[i+1], i))
4747       return false;
4748
4749   return true;
4750 }
4751
4752 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4753 /// specifies a shuffle of elements that is suitable for input to 256-bit
4754 /// version of MOVDDUP.
4755 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4756   if (!HasFp256 || !VT.is256BitVector())
4757     return false;
4758
4759   unsigned NumElts = VT.getVectorNumElements();
4760   if (NumElts != 4)
4761     return false;
4762
4763   for (unsigned i = 0; i != NumElts/2; ++i)
4764     if (!isUndefOrEqual(Mask[i], 0))
4765       return false;
4766   for (unsigned i = NumElts/2; i != NumElts; ++i)
4767     if (!isUndefOrEqual(Mask[i], NumElts/2))
4768       return false;
4769   return true;
4770 }
4771
4772 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4773 /// specifies a shuffle of elements that is suitable for input to 128-bit
4774 /// version of MOVDDUP.
4775 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4776   if (!VT.is128BitVector())
4777     return false;
4778
4779   unsigned e = VT.getVectorNumElements() / 2;
4780   for (unsigned i = 0; i != e; ++i)
4781     if (!isUndefOrEqual(Mask[i], i))
4782       return false;
4783   for (unsigned i = 0; i != e; ++i)
4784     if (!isUndefOrEqual(Mask[e+i], i))
4785       return false;
4786   return true;
4787 }
4788
4789 /// isVEXTRACTIndex - Return true if the specified
4790 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4791 /// suitable for instruction that extract 128 or 256 bit vectors
4792 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4793   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4794   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4795     return false;
4796
4797   // The index should be aligned on a vecWidth-bit boundary.
4798   uint64_t Index =
4799     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4800
4801   MVT VT = N->getSimpleValueType(0);
4802   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4803   bool Result = (Index * ElSize) % vecWidth == 0;
4804
4805   return Result;
4806 }
4807
4808 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4809 /// operand specifies a subvector insert that is suitable for input to
4810 /// insertion of 128 or 256-bit subvectors
4811 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4812   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4813   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4814     return false;
4815   // The index should be aligned on a vecWidth-bit boundary.
4816   uint64_t Index =
4817     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4818
4819   MVT VT = N->getSimpleValueType(0);
4820   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4821   bool Result = (Index * ElSize) % vecWidth == 0;
4822
4823   return Result;
4824 }
4825
4826 bool X86::isVINSERT128Index(SDNode *N) {
4827   return isVINSERTIndex(N, 128);
4828 }
4829
4830 bool X86::isVINSERT256Index(SDNode *N) {
4831   return isVINSERTIndex(N, 256);
4832 }
4833
4834 bool X86::isVEXTRACT128Index(SDNode *N) {
4835   return isVEXTRACTIndex(N, 128);
4836 }
4837
4838 bool X86::isVEXTRACT256Index(SDNode *N) {
4839   return isVEXTRACTIndex(N, 256);
4840 }
4841
4842 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4843 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4844 /// Handles 128-bit and 256-bit.
4845 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4846   MVT VT = N->getSimpleValueType(0);
4847
4848   assert((VT.getSizeInBits() >= 128) &&
4849          "Unsupported vector type for PSHUF/SHUFP");
4850
4851   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4852   // independently on 128-bit lanes.
4853   unsigned NumElts = VT.getVectorNumElements();
4854   unsigned NumLanes = VT.getSizeInBits()/128;
4855   unsigned NumLaneElts = NumElts/NumLanes;
4856
4857   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4858          "Only supports 2, 4 or 8 elements per lane");
4859
4860   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4861   unsigned Mask = 0;
4862   for (unsigned i = 0; i != NumElts; ++i) {
4863     int Elt = N->getMaskElt(i);
4864     if (Elt < 0) continue;
4865     Elt &= NumLaneElts - 1;
4866     unsigned ShAmt = (i << Shift) % 8;
4867     Mask |= Elt << ShAmt;
4868   }
4869
4870   return Mask;
4871 }
4872
4873 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4874 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4875 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4876   MVT VT = N->getSimpleValueType(0);
4877
4878   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4879          "Unsupported vector type for PSHUFHW");
4880
4881   unsigned NumElts = VT.getVectorNumElements();
4882
4883   unsigned Mask = 0;
4884   for (unsigned l = 0; l != NumElts; l += 8) {
4885     // 8 nodes per lane, but we only care about the last 4.
4886     for (unsigned i = 0; i < 4; ++i) {
4887       int Elt = N->getMaskElt(l+i+4);
4888       if (Elt < 0) continue;
4889       Elt &= 0x3; // only 2-bits.
4890       Mask |= Elt << (i * 2);
4891     }
4892   }
4893
4894   return Mask;
4895 }
4896
4897 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4898 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4899 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4900   MVT VT = N->getSimpleValueType(0);
4901
4902   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4903          "Unsupported vector type for PSHUFHW");
4904
4905   unsigned NumElts = VT.getVectorNumElements();
4906
4907   unsigned Mask = 0;
4908   for (unsigned l = 0; l != NumElts; l += 8) {
4909     // 8 nodes per lane, but we only care about the first 4.
4910     for (unsigned i = 0; i < 4; ++i) {
4911       int Elt = N->getMaskElt(l+i);
4912       if (Elt < 0) continue;
4913       Elt &= 0x3; // only 2-bits
4914       Mask |= Elt << (i * 2);
4915     }
4916   }
4917
4918   return Mask;
4919 }
4920
4921 /// \brief Return the appropriate immediate to shuffle the specified
4922 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4923 /// VALIGN (if Interlane is true) instructions.
4924 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4925                                            bool InterLane) {
4926   MVT VT = SVOp->getSimpleValueType(0);
4927   unsigned EltSize = InterLane ? 1 :
4928     VT.getVectorElementType().getSizeInBits() >> 3;
4929
4930   unsigned NumElts = VT.getVectorNumElements();
4931   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4932   unsigned NumLaneElts = NumElts/NumLanes;
4933
4934   int Val = 0;
4935   unsigned i;
4936   for (i = 0; i != NumElts; ++i) {
4937     Val = SVOp->getMaskElt(i);
4938     if (Val >= 0)
4939       break;
4940   }
4941   if (Val >= (int)NumElts)
4942     Val -= NumElts - NumLaneElts;
4943
4944   assert(Val - i > 0 && "PALIGNR imm should be positive");
4945   return (Val - i) * EltSize;
4946 }
4947
4948 /// \brief Return the appropriate immediate to shuffle the specified
4949 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4950 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4951   return getShuffleAlignrImmediate(SVOp, false);
4952 }
4953
4954 /// \brief Return the appropriate immediate to shuffle the specified
4955 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4956 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4957   return getShuffleAlignrImmediate(SVOp, true);
4958 }
4959
4960
4961 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4962   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4963   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4964     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4965
4966   uint64_t Index =
4967     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4968
4969   MVT VecVT = N->getOperand(0).getSimpleValueType();
4970   MVT ElVT = VecVT.getVectorElementType();
4971
4972   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4973   return Index / NumElemsPerChunk;
4974 }
4975
4976 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4977   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4978   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4979     llvm_unreachable("Illegal insert subvector for VINSERT");
4980
4981   uint64_t Index =
4982     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4983
4984   MVT VecVT = N->getSimpleValueType(0);
4985   MVT ElVT = VecVT.getVectorElementType();
4986
4987   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4988   return Index / NumElemsPerChunk;
4989 }
4990
4991 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4992 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4993 /// and VINSERTI128 instructions.
4994 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4995   return getExtractVEXTRACTImmediate(N, 128);
4996 }
4997
4998 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4999 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
5000 /// and VINSERTI64x4 instructions.
5001 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
5002   return getExtractVEXTRACTImmediate(N, 256);
5003 }
5004
5005 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5006 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5007 /// and VINSERTI128 instructions.
5008 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5009   return getInsertVINSERTImmediate(N, 128);
5010 }
5011
5012 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5013 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5014 /// and VINSERTI64x4 instructions.
5015 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5016   return getInsertVINSERTImmediate(N, 256);
5017 }
5018
5019 /// isZero - Returns true if Elt is a constant integer zero
5020 static bool isZero(SDValue V) {
5021   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5022   return C && C->isNullValue();
5023 }
5024
5025 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5026 /// constant +0.0.
5027 bool X86::isZeroNode(SDValue Elt) {
5028   if (isZero(Elt))
5029     return true;
5030   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5031     return CFP->getValueAPF().isPosZero();
5032   return false;
5033 }
5034
5035 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5036 /// match movhlps. The lower half elements should come from upper half of
5037 /// V1 (and in order), and the upper half elements should come from the upper
5038 /// half of V2 (and in order).
5039 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5040   if (!VT.is128BitVector())
5041     return false;
5042   if (VT.getVectorNumElements() != 4)
5043     return false;
5044   for (unsigned i = 0, e = 2; i != e; ++i)
5045     if (!isUndefOrEqual(Mask[i], i+2))
5046       return false;
5047   for (unsigned i = 2; i != 4; ++i)
5048     if (!isUndefOrEqual(Mask[i], i+4))
5049       return false;
5050   return true;
5051 }
5052
5053 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5054 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5055 /// required.
5056 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5057   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5058     return false;
5059   N = N->getOperand(0).getNode();
5060   if (!ISD::isNON_EXTLoad(N))
5061     return false;
5062   if (LD)
5063     *LD = cast<LoadSDNode>(N);
5064   return true;
5065 }
5066
5067 // Test whether the given value is a vector value which will be legalized
5068 // into a load.
5069 static bool WillBeConstantPoolLoad(SDNode *N) {
5070   if (N->getOpcode() != ISD::BUILD_VECTOR)
5071     return false;
5072
5073   // Check for any non-constant elements.
5074   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5075     switch (N->getOperand(i).getNode()->getOpcode()) {
5076     case ISD::UNDEF:
5077     case ISD::ConstantFP:
5078     case ISD::Constant:
5079       break;
5080     default:
5081       return false;
5082     }
5083
5084   // Vectors of all-zeros and all-ones are materialized with special
5085   // instructions rather than being loaded.
5086   return !ISD::isBuildVectorAllZeros(N) &&
5087          !ISD::isBuildVectorAllOnes(N);
5088 }
5089
5090 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5091 /// match movlp{s|d}. The lower half elements should come from lower half of
5092 /// V1 (and in order), and the upper half elements should come from the upper
5093 /// half of V2 (and in order). And since V1 will become the source of the
5094 /// MOVLP, it must be either a vector load or a scalar load to vector.
5095 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5096                                ArrayRef<int> Mask, MVT VT) {
5097   if (!VT.is128BitVector())
5098     return false;
5099
5100   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5101     return false;
5102   // Is V2 is a vector load, don't do this transformation. We will try to use
5103   // load folding shufps op.
5104   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5105     return false;
5106
5107   unsigned NumElems = VT.getVectorNumElements();
5108
5109   if (NumElems != 2 && NumElems != 4)
5110     return false;
5111   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5112     if (!isUndefOrEqual(Mask[i], i))
5113       return false;
5114   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5115     if (!isUndefOrEqual(Mask[i], i+NumElems))
5116       return false;
5117   return true;
5118 }
5119
5120 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5121 /// to an zero vector.
5122 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5123 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5124   SDValue V1 = N->getOperand(0);
5125   SDValue V2 = N->getOperand(1);
5126   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5127   for (unsigned i = 0; i != NumElems; ++i) {
5128     int Idx = N->getMaskElt(i);
5129     if (Idx >= (int)NumElems) {
5130       unsigned Opc = V2.getOpcode();
5131       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5132         continue;
5133       if (Opc != ISD::BUILD_VECTOR ||
5134           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5135         return false;
5136     } else if (Idx >= 0) {
5137       unsigned Opc = V1.getOpcode();
5138       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5139         continue;
5140       if (Opc != ISD::BUILD_VECTOR ||
5141           !X86::isZeroNode(V1.getOperand(Idx)))
5142         return false;
5143     }
5144   }
5145   return true;
5146 }
5147
5148 /// getZeroVector - Returns a vector of specified type with all zero elements.
5149 ///
5150 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5151                              SelectionDAG &DAG, SDLoc dl) {
5152   assert(VT.isVector() && "Expected a vector type");
5153
5154   // Always build SSE zero vectors as <4 x i32> bitcasted
5155   // to their dest type. This ensures they get CSE'd.
5156   SDValue Vec;
5157   if (VT.is128BitVector()) {  // SSE
5158     if (Subtarget->hasSSE2()) {  // SSE2
5159       SDValue Cst = DAG.getConstant(0, MVT::i32);
5160       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5161     } else { // SSE1
5162       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5163       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5164     }
5165   } else if (VT.is256BitVector()) { // AVX
5166     if (Subtarget->hasInt256()) { // AVX2
5167       SDValue Cst = DAG.getConstant(0, MVT::i32);
5168       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5169       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5170     } else {
5171       // 256-bit logic and arithmetic instructions in AVX are all
5172       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5173       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5174       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5175       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5176     }
5177   } else if (VT.is512BitVector()) { // AVX-512
5178       SDValue Cst = DAG.getConstant(0, MVT::i32);
5179       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5180                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5181       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5182   } else if (VT.getScalarType() == MVT::i1) {
5183     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5184     SDValue Cst = DAG.getConstant(0, MVT::i1);
5185     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5186     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5187   } else
5188     llvm_unreachable("Unexpected vector type");
5189
5190   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5191 }
5192
5193 /// getOnesVector - Returns a vector of specified type with all bits set.
5194 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5195 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5196 /// Then bitcast to their original type, ensuring they get CSE'd.
5197 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5198                              SDLoc dl) {
5199   assert(VT.isVector() && "Expected a vector type");
5200
5201   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5202   SDValue Vec;
5203   if (VT.is256BitVector()) {
5204     if (HasInt256) { // AVX2
5205       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5206       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5207     } else { // AVX
5208       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5209       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5210     }
5211   } else if (VT.is128BitVector()) {
5212     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5213   } else
5214     llvm_unreachable("Unexpected vector type");
5215
5216   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5217 }
5218
5219 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5220 /// that point to V2 points to its first element.
5221 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5222   for (unsigned i = 0; i != NumElems; ++i) {
5223     if (Mask[i] > (int)NumElems) {
5224       Mask[i] = NumElems;
5225     }
5226   }
5227 }
5228
5229 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5230 /// operation of specified width.
5231 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5232                        SDValue V2) {
5233   unsigned NumElems = VT.getVectorNumElements();
5234   SmallVector<int, 8> Mask;
5235   Mask.push_back(NumElems);
5236   for (unsigned i = 1; i != NumElems; ++i)
5237     Mask.push_back(i);
5238   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5239 }
5240
5241 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5242 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5243                           SDValue V2) {
5244   unsigned NumElems = VT.getVectorNumElements();
5245   SmallVector<int, 8> Mask;
5246   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5247     Mask.push_back(i);
5248     Mask.push_back(i + NumElems);
5249   }
5250   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5251 }
5252
5253 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5254 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5255                           SDValue V2) {
5256   unsigned NumElems = VT.getVectorNumElements();
5257   SmallVector<int, 8> Mask;
5258   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5259     Mask.push_back(i + Half);
5260     Mask.push_back(i + NumElems + Half);
5261   }
5262   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5263 }
5264
5265 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5266 // a generic shuffle instruction because the target has no such instructions.
5267 // Generate shuffles which repeat i16 and i8 several times until they can be
5268 // represented by v4f32 and then be manipulated by target suported shuffles.
5269 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5270   MVT VT = V.getSimpleValueType();
5271   int NumElems = VT.getVectorNumElements();
5272   SDLoc dl(V);
5273
5274   while (NumElems > 4) {
5275     if (EltNo < NumElems/2) {
5276       V = getUnpackl(DAG, dl, VT, V, V);
5277     } else {
5278       V = getUnpackh(DAG, dl, VT, V, V);
5279       EltNo -= NumElems/2;
5280     }
5281     NumElems >>= 1;
5282   }
5283   return V;
5284 }
5285
5286 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5287 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5288   MVT VT = V.getSimpleValueType();
5289   SDLoc dl(V);
5290
5291   if (VT.is128BitVector()) {
5292     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5293     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5294     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5295                              &SplatMask[0]);
5296   } else if (VT.is256BitVector()) {
5297     // To use VPERMILPS to splat scalars, the second half of indicies must
5298     // refer to the higher part, which is a duplication of the lower one,
5299     // because VPERMILPS can only handle in-lane permutations.
5300     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5301                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5302
5303     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5304     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5305                              &SplatMask[0]);
5306   } else
5307     llvm_unreachable("Vector size not supported");
5308
5309   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5310 }
5311
5312 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5313 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5314   MVT SrcVT = SV->getSimpleValueType(0);
5315   SDValue V1 = SV->getOperand(0);
5316   SDLoc dl(SV);
5317
5318   int EltNo = SV->getSplatIndex();
5319   int NumElems = SrcVT.getVectorNumElements();
5320   bool Is256BitVec = SrcVT.is256BitVector();
5321
5322   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5323          "Unknown how to promote splat for type");
5324
5325   // Extract the 128-bit part containing the splat element and update
5326   // the splat element index when it refers to the higher register.
5327   if (Is256BitVec) {
5328     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5329     if (EltNo >= NumElems/2)
5330       EltNo -= NumElems/2;
5331   }
5332
5333   // All i16 and i8 vector types can't be used directly by a generic shuffle
5334   // instruction because the target has no such instruction. Generate shuffles
5335   // which repeat i16 and i8 several times until they fit in i32, and then can
5336   // be manipulated by target suported shuffles.
5337   MVT EltVT = SrcVT.getVectorElementType();
5338   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5339     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5340
5341   // Recreate the 256-bit vector and place the same 128-bit vector
5342   // into the low and high part. This is necessary because we want
5343   // to use VPERM* to shuffle the vectors
5344   if (Is256BitVec) {
5345     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5346   }
5347
5348   return getLegalSplat(DAG, V1, EltNo);
5349 }
5350
5351 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5352 /// vector of zero or undef vector.  This produces a shuffle where the low
5353 /// element of V2 is swizzled into the zero/undef vector, landing at element
5354 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5355 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5356                                            bool IsZero,
5357                                            const X86Subtarget *Subtarget,
5358                                            SelectionDAG &DAG) {
5359   MVT VT = V2.getSimpleValueType();
5360   SDValue V1 = IsZero
5361     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5362   unsigned NumElems = VT.getVectorNumElements();
5363   SmallVector<int, 16> MaskVec;
5364   for (unsigned i = 0; i != NumElems; ++i)
5365     // If this is the insertion idx, put the low elt of V2 here.
5366     MaskVec.push_back(i == Idx ? NumElems : i);
5367   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5368 }
5369
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated. Sets
/// IsUnary to true if only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
///
/// \param N       the target shuffle node to decode.
/// \param VT      the vector type used to decode the mask.
/// \param Mask    receives the decoded shuffle mask.
/// \param IsUnary receives whether the shuffle reads a single input. Note
///                that for PSHUFB it is set to true even when decoding fails
///                and false is returned.
/// \returns false when the mask cannot be decoded (opcodes that are not yet
///          implemented, or a PSHUFB / VPERM2X128 mask that cannot be
///          resolved); true otherwise.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  IsUnary = false;
  // Set for two-input opcodes whose two inputs are the very same value; the
  // decoded mask is folded onto the first input at the end.
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  // For the immediate-controlled opcodes below, the immediate is the last
  // operand of the node.
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFB: {
    // The control mask is operand 1. It can be decoded when, after looking
    // through bitcasts, it is either a BUILD_VECTOR of integer constants or a
    // load from a (non-machine) constant pool entry.
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    while (MaskNode->getOpcode() == ISD::BITCAST)
      MaskNode = MaskNode->getOperand(0);

    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
      // If we have a build-vector, then things are easy.
      // Note: this VT is the type of the mask node and shadows the VT
      // parameter within this scope.
      EVT VT = MaskNode.getValueType();
      assert(VT.isVector() &&
             "Can't produce a non-vector with a build_vector!");
      if (!VT.isInteger())
        return false;

      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;

      // Flatten the build-vector into one raw mask entry per byte.
      SmallVector<uint64_t, 32> RawMask;
      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
        SDValue Op = MaskNode->getOperand(i);
        if (Op->getOpcode() == ISD::UNDEF) {
          // A single undef sentinel is pushed for an undef operand,
          // whatever the element width.
          RawMask.push_back((uint64_t)SM_SentinelUndef);
          continue;
        }
        auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
        if (!CN)
          return false;
        APInt MaskElement = CN->getAPIntValue();

        // We now have to decode the element which could be any integer size and
        // extract each byte of it.
        for (int j = 0; j < NumBytesPerElement; ++j) {
          // Note that this is x86 and so always little endian: the low byte is
          // the first byte of the mask.
          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
          MaskElement = MaskElement.lshr(8);
        }
      }
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }

    // Otherwise the mask has to be loaded from memory.
    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
    if (!MaskLoad)
      return false;

    // Look through an X86ISD::Wrapper around the address.
    SDValue Ptr = MaskLoad->getBasePtr();
    if (Ptr->getOpcode() == X86ISD::Wrapper)
      Ptr = Ptr->getOperand(0);

    // Only plain constant pool entries can be decoded.
    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
      return false;

    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
      DecodePSHUFBMask(C, Mask);
      break;
    }

    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    // An empty mask after decoding is treated as a failure to decode.
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
  }

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  return true;
}
5532
5533 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5534 /// element of the result of the vector shuffle.
5535 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5536                                    unsigned Depth) {
5537   if (Depth == 6)
5538     return SDValue();  // Limit search depth.
5539
5540   SDValue V = SDValue(N, 0);
5541   EVT VT = V.getValueType();
5542   unsigned Opcode = V.getOpcode();
5543
5544   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5545   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5546     int Elt = SV->getMaskElt(Index);
5547
5548     if (Elt < 0)
5549       return DAG.getUNDEF(VT.getVectorElementType());
5550
5551     unsigned NumElems = VT.getVectorNumElements();
5552     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5553                                          : SV->getOperand(1);
5554     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5555   }
5556
5557   // Recurse into target specific vector shuffles to find scalars.
5558   if (isTargetShuffle(Opcode)) {
5559     MVT ShufVT = V.getSimpleValueType();
5560     unsigned NumElems = ShufVT.getVectorNumElements();
5561     SmallVector<int, 16> ShuffleMask;
5562     bool IsUnary;
5563
5564     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5565       return SDValue();
5566
5567     int Elt = ShuffleMask[Index];
5568     if (Elt < 0)
5569       return DAG.getUNDEF(ShufVT.getVectorElementType());
5570
5571     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5572                                          : N->getOperand(1);
5573     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5574                                Depth+1);
5575   }
5576
5577   // Actual nodes that may contain scalar elements
5578   if (Opcode == ISD::BITCAST) {
5579     V = V.getOperand(0);
5580     EVT SrcVT = V.getValueType();
5581     unsigned NumElems = VT.getVectorNumElements();
5582
5583     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5584       return SDValue();
5585   }
5586
5587   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5588     return (Index == 0) ? V.getOperand(0)
5589                         : DAG.getUNDEF(VT.getVectorElementType());
5590
5591   if (V.getOpcode() == ISD::BUILD_VECTOR)
5592     return V.getOperand(Index);
5593
5594   return SDValue();
5595 }
5596
5597 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5598 /// shuffle operation which come from a consecutively from a zero. The
5599 /// search can start in two different directions, from left or right.
5600 /// We count undefs as zeros until PreferredNum is reached.
5601 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5602                                          unsigned NumElems, bool ZerosFromLeft,
5603                                          SelectionDAG &DAG,
5604                                          unsigned PreferredNum = -1U) {
5605   unsigned NumZeros = 0;
5606   for (unsigned i = 0; i != NumElems; ++i) {
5607     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5608     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5609     if (!Elt.getNode())
5610       break;
5611
5612     if (X86::isZeroNode(Elt))
5613       ++NumZeros;
5614     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5615       NumZeros = std::min(NumZeros + 1, PreferredNum);
5616     else
5617       break;
5618   }
5619
5620   return NumZeros;
5621 }
5622
5623 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5624 /// correspond consecutively to elements from one of the vector operands,
5625 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5626 static
5627 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5628                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5629                               unsigned NumElems, unsigned &OpNum) {
5630   bool SeenV1 = false;
5631   bool SeenV2 = false;
5632
5633   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5634     int Idx = SVOp->getMaskElt(i);
5635     // Ignore undef indicies
5636     if (Idx < 0)
5637       continue;
5638
5639     if (Idx < (int)NumElems)
5640       SeenV1 = true;
5641     else
5642       SeenV2 = true;
5643
5644     // Only accept consecutive elements from the same vector
5645     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5646       return false;
5647   }
5648
5649   OpNum = SeenV1 ? 0 : 1;
5650   return true;
5651 }
5652
5653 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5654 /// logical left shift of a vector.
5655 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5656                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5657   unsigned NumElems =
5658     SVOp->getSimpleValueType(0).getVectorNumElements();
5659   unsigned NumZeros = getNumOfConsecutiveZeros(
5660       SVOp, NumElems, false /* check zeros from right */, DAG,
5661       SVOp->getMaskElt(0));
5662   unsigned OpSrc;
5663
5664   if (!NumZeros)
5665     return false;
5666
5667   // Considering the elements in the mask that are not consecutive zeros,
5668   // check if they consecutively come from only one of the source vectors.
5669   //
5670   //               V1 = {X, A, B, C}     0
5671   //                         \  \  \    /
5672   //   vector_shuffle V1, V2 <1, 2, 3, X>
5673   //
5674   if (!isShuffleMaskConsecutive(SVOp,
5675             0,                   // Mask Start Index
5676             NumElems-NumZeros,   // Mask End Index(exclusive)
5677             NumZeros,            // Where to start looking in the src vector
5678             NumElems,            // Number of elements in vector
5679             OpSrc))              // Which source operand ?
5680     return false;
5681
5682   isLeft = false;
5683   ShAmt = NumZeros;
5684   ShVal = SVOp->getOperand(OpSrc);
5685   return true;
5686 }
5687
5688 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5689 /// logical left shift of a vector.
5690 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5691                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5692   unsigned NumElems =
5693     SVOp->getSimpleValueType(0).getVectorNumElements();
5694   unsigned NumZeros = getNumOfConsecutiveZeros(
5695       SVOp, NumElems, true /* check zeros from left */, DAG,
5696       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5697   unsigned OpSrc;
5698
5699   if (!NumZeros)
5700     return false;
5701
5702   // Considering the elements in the mask that are not consecutive zeros,
5703   // check if they consecutively come from only one of the source vectors.
5704   //
5705   //                           0    { A, B, X, X } = V2
5706   //                          / \    /  /
5707   //   vector_shuffle V1, V2 <X, X, 4, 5>
5708   //
5709   if (!isShuffleMaskConsecutive(SVOp,
5710             NumZeros,     // Mask Start Index
5711             NumElems,     // Mask End Index(exclusive)
5712             0,            // Where to start looking in the src vector
5713             NumElems,     // Number of elements in vector
5714             OpSrc))       // Which source operand ?
5715     return false;
5716
5717   isLeft = true;
5718   ShAmt = NumZeros;
5719   ShVal = SVOp->getOperand(OpSrc);
5720   return true;
5721 }
5722
5723 /// isVectorShift - Returns true if the shuffle can be implemented as a
5724 /// logical left or right shift of a vector.
5725 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5726                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5727   // Although the logic below support any bitwidth size, there are no
5728   // shift instructions which handle more than 128-bit vectors.
5729   if (!SVOp->getSimpleValueType(0).is128BitVector())
5730     return false;
5731
5732   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5733       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5734     return true;
5735
5736   return false;
5737 }
5738
5739 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5740 ///
5741 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5742                                        unsigned NumNonZero, unsigned NumZero,
5743                                        SelectionDAG &DAG,
5744                                        const X86Subtarget* Subtarget,
5745                                        const TargetLowering &TLI) {
5746   if (NumNonZero > 8)
5747     return SDValue();
5748
5749   SDLoc dl(Op);
5750   SDValue V;
5751   bool First = true;
5752   for (unsigned i = 0; i < 16; ++i) {
5753     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5754     if (ThisIsNonZero && First) {
5755       if (NumZero)
5756         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5757       else
5758         V = DAG.getUNDEF(MVT::v8i16);
5759       First = false;
5760     }
5761
5762     if ((i & 1) != 0) {
5763       SDValue ThisElt, LastElt;
5764       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5765       if (LastIsNonZero) {
5766         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5767                               MVT::i16, Op.getOperand(i-1));
5768       }
5769       if (ThisIsNonZero) {
5770         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5771         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5772                               ThisElt, DAG.getConstant(8, MVT::i8));
5773         if (LastIsNonZero)
5774           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5775       } else
5776         ThisElt = LastElt;
5777
5778       if (ThisElt.getNode())
5779         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5780                         DAG.getIntPtrConstant(i/2));
5781     }
5782   }
5783
5784   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5785 }
5786
5787 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5788 ///
5789 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5790                                      unsigned NumNonZero, unsigned NumZero,
5791                                      SelectionDAG &DAG,
5792                                      const X86Subtarget* Subtarget,
5793                                      const TargetLowering &TLI) {
5794   if (NumNonZero > 4)
5795     return SDValue();
5796
5797   SDLoc dl(Op);
5798   SDValue V;
5799   bool First = true;
5800   for (unsigned i = 0; i < 8; ++i) {
5801     bool isNonZero = (NonZeros & (1 << i)) != 0;
5802     if (isNonZero) {
5803       if (First) {
5804         if (NumZero)
5805           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5806         else
5807           V = DAG.getUNDEF(MVT::v8i16);
5808         First = false;
5809       }
5810       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5811                       MVT::v8i16, V, Op.getOperand(i),
5812                       DAG.getIntPtrConstant(i));
5813     }
5814   }
5815
5816   return V;
5817 }
5818
5819 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5820 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5821                                      const X86Subtarget *Subtarget,
5822                                      const TargetLowering &TLI) {
5823   // Find all zeroable elements.
5824   bool Zeroable[4];
5825   for (int i=0; i < 4; ++i) {
5826     SDValue Elt = Op->getOperand(i);
5827     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5828   }
5829   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5830                        [](bool M) { return !M; }) > 1 &&
5831          "We expect at least two non-zero elements!");
5832
5833   // We only know how to deal with build_vector nodes where elements are either
5834   // zeroable or extract_vector_elt with constant index.
5835   SDValue FirstNonZero;
5836   unsigned FirstNonZeroIdx;
5837   for (unsigned i=0; i < 4; ++i) {
5838     if (Zeroable[i])
5839       continue;
5840     SDValue Elt = Op->getOperand(i);
5841     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5842         !isa<ConstantSDNode>(Elt.getOperand(1)))
5843       return SDValue();
5844     // Make sure that this node is extracting from a 128-bit vector.
5845     MVT VT = Elt.getOperand(0).getSimpleValueType();
5846     if (!VT.is128BitVector())
5847       return SDValue();
5848     if (!FirstNonZero.getNode()) {
5849       FirstNonZero = Elt;
5850       FirstNonZeroIdx = i;
5851     }
5852   }
5853
5854   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5855   SDValue V1 = FirstNonZero.getOperand(0);
5856   MVT VT = V1.getSimpleValueType();
5857
5858   // See if this build_vector can be lowered as a blend with zero.
5859   SDValue Elt;
5860   unsigned EltMaskIdx, EltIdx;
5861   int Mask[4];
5862   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5863     if (Zeroable[EltIdx]) {
5864       // The zero vector will be on the right hand side.
5865       Mask[EltIdx] = EltIdx+4;
5866       continue;
5867     }
5868
5869     Elt = Op->getOperand(EltIdx);
5870     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5871     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5872     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5873       break;
5874     Mask[EltIdx] = EltIdx;
5875   }
5876
5877   if (EltIdx == 4) {
5878     // Let the shuffle legalizer deal with blend operations.
5879     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5880     if (V1.getSimpleValueType() != VT)
5881       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5882     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5883   }
5884
5885   // See if we can lower this build_vector to a INSERTPS.
5886   if (!Subtarget->hasSSE41())
5887     return SDValue();
5888
5889   SDValue V2 = Elt.getOperand(0);
5890   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5891     V1 = SDValue();
5892
5893   bool CanFold = true;
5894   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5895     if (Zeroable[i])
5896       continue;
5897
5898     SDValue Current = Op->getOperand(i);
5899     SDValue SrcVector = Current->getOperand(0);
5900     if (!V1.getNode())
5901       V1 = SrcVector;
5902     CanFold = SrcVector == V1 &&
5903       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5904   }
5905
5906   if (!CanFold)
5907     return SDValue();
5908
5909   assert(V1.getNode() && "Expected at least two non-zero elements!");
5910   if (V1.getSimpleValueType() != MVT::v4f32)
5911     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5912   if (V2.getSimpleValueType() != MVT::v4f32)
5913     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5914
5915   // Ok, we can emit an INSERTPS instruction.
5916   unsigned ZMask = 0;
5917   for (int i = 0; i < 4; ++i)
5918     if (Zeroable[i])
5919       ZMask |= 1 << i;
5920
5921   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5922   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5923   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5924                                DAG.getIntPtrConstant(InsertPSMask));
5925   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5926 }
5927
5928 /// Return a vector logical shift node.
5929 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5930                          unsigned NumBits, SelectionDAG &DAG,
5931                          const TargetLowering &TLI, SDLoc dl) {
5932   assert(VT.is128BitVector() && "Unknown type for VShift");
5933   MVT ShVT = MVT::v2i64;
5934   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5935   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5936   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
5937   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
5938   return DAG.getNode(ISD::BITCAST, dl, VT,
5939                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5940 }
5941
5942 static SDValue
5943 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5944
5945   // Check if the scalar load can be widened into a vector load. And if
5946   // the address is "base + cst" see if the cst can be "absorbed" into
5947   // the shuffle mask.
5948   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5949     SDValue Ptr = LD->getBasePtr();
5950     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5951       return SDValue();
5952     EVT PVT = LD->getValueType(0);
5953     if (PVT != MVT::i32 && PVT != MVT::f32)
5954       return SDValue();
5955
5956     int FI = -1;
5957     int64_t Offset = 0;
5958     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5959       FI = FINode->getIndex();
5960       Offset = 0;
5961     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5962                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5963       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5964       Offset = Ptr.getConstantOperandVal(1);
5965       Ptr = Ptr.getOperand(0);
5966     } else {
5967       return SDValue();
5968     }
5969
5970     // FIXME: 256-bit vector instructions don't require a strict alignment,
5971     // improve this code to support it better.
5972     unsigned RequiredAlign = VT.getSizeInBits()/8;
5973     SDValue Chain = LD->getChain();
5974     // Make sure the stack object alignment is at least 16 or 32.
5975     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5976     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5977       if (MFI->isFixedObjectIndex(FI)) {
5978         // Can't change the alignment. FIXME: It's possible to compute
5979         // the exact stack offset and reference FI + adjust offset instead.
5980         // If someone *really* cares about this. That's the way to implement it.
5981         return SDValue();
5982       } else {
5983         MFI->setObjectAlignment(FI, RequiredAlign);
5984       }
5985     }
5986
5987     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5988     // Ptr + (Offset & ~15).
5989     if (Offset < 0)
5990       return SDValue();
5991     if ((Offset % RequiredAlign) & 3)
5992       return SDValue();
5993     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5994     if (StartOffset)
5995       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5996                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5997
5998     int EltNo = (Offset - StartOffset) >> 2;
5999     unsigned NumElems = VT.getVectorNumElements();
6000
6001     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6002     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6003                              LD->getPointerInfo().getWithOffset(StartOffset),
6004                              false, false, false, 0);
6005
6006     SmallVector<int, 8> Mask;
6007     for (unsigned i = 0; i != NumElems; ++i)
6008       Mask.push_back(EltNo);
6009
6010     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6011   }
6012
6013   return SDValue();
6014 }
6015
6016 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6017 /// elements can be replaced by a single large load which has the same value as
6018 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6019 ///
6020 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6021 ///
6022 /// FIXME: we'd also like to handle the case where the last elements are zero
6023 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6024 /// There's even a handy isZeroNode for that purpose.
6025 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6026                                         SDLoc &DL, SelectionDAG &DAG,
6027                                         bool isAfterLegalize) {
6028   unsigned NumElems = Elts.size();
6029
6030   LoadSDNode *LDBase = nullptr;
6031   unsigned LastLoadedElt = -1U;
6032
6033   // For each element in the initializer, see if we've found a load or an undef.
6034   // If we don't find an initial load element, or later load elements are
6035   // non-consecutive, bail out.
6036   for (unsigned i = 0; i < NumElems; ++i) {
6037     SDValue Elt = Elts[i];
6038     // Look through a bitcast.
6039     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
6040       Elt = Elt.getOperand(0);
6041     if (!Elt.getNode() ||
6042         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6043       return SDValue();
6044     if (!LDBase) {
6045       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6046         return SDValue();
6047       LDBase = cast<LoadSDNode>(Elt.getNode());
6048       LastLoadedElt = i;
6049       continue;
6050     }
6051     if (Elt.getOpcode() == ISD::UNDEF)
6052       continue;
6053
6054     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6055     EVT LdVT = Elt.getValueType();
6056     // Each loaded element must be the correct fractional portion of the
6057     // requested vector load.
6058     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
6059       return SDValue();
6060     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
6061       return SDValue();
6062     LastLoadedElt = i;
6063   }
6064
6065   // If we have found an entire vector of loads and undefs, then return a large
6066   // load of the entire vector width starting at the base pointer.  If we found
6067   // consecutive loads for the low half, generate a vzext_load node.
6068   if (LastLoadedElt == NumElems - 1) {
6069     assert(LDBase && "Did not find base load for merging consecutive loads");
6070     EVT EltVT = LDBase->getValueType(0);
6071     // Ensure that the input vector size for the merged loads matches the
6072     // cumulative size of the input elements.
6073     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6074       return SDValue();
6075
6076     if (isAfterLegalize &&
6077         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6078       return SDValue();
6079
6080     SDValue NewLd = SDValue();
6081
6082     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6083                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6084                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6085                         LDBase->getAlignment());
6086
6087     if (LDBase->hasAnyUseOfValue(1)) {
6088       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6089                                      SDValue(LDBase, 1),
6090                                      SDValue(NewLd.getNode(), 1));
6091       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6092       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6093                              SDValue(NewLd.getNode(), 1));
6094     }
6095
6096     return NewLd;
6097   }
6098
6099   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6100   //of a v4i32 / v4f32. It's probably worth generalizing.
6101   EVT EltVT = VT.getVectorElementType();
6102   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6103       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6104     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6105     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6106     SDValue ResNode =
6107         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6108                                 LDBase->getPointerInfo(),
6109                                 LDBase->getAlignment(),
6110                                 false/*isVolatile*/, true/*ReadMem*/,
6111                                 false/*WriteMem*/);
6112
6113     // Make sure the newly-created LOAD is in the same position as LDBase in
6114     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6115     // update uses of LDBase's output chain to use the TokenFactor.
6116     if (LDBase->hasAnyUseOfValue(1)) {
6117       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6118                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6119       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6120       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6121                              SDValue(ResNode.getNode(), 1));
6122     }
6123
6124     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6125   }
6126   return SDValue();
6127 }
6128
6129 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6130 /// to generate a splat value for the following cases:
6131 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6132 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6133 /// a scalar load, or a constant.
6134 /// The VBROADCAST node is returned when a pattern is found,
6135 /// or SDValue() otherwise.
6136 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6137                                     SelectionDAG &DAG) {
6138   // VBROADCAST requires AVX.
6139   // TODO: Splats could be generated for non-AVX CPUs using SSE
6140   // instructions, but there's less potential gain for only 128-bit vectors.
6141   if (!Subtarget->hasAVX())
6142     return SDValue();
6143
6144   MVT VT = Op.getSimpleValueType();
6145   SDLoc dl(Op);
6146
6147   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6148          "Unsupported vector type for broadcast.");
6149
6150   SDValue Ld;
6151   bool ConstSplatVal;
6152
6153   switch (Op.getOpcode()) {
6154     default:
6155       // Unknown pattern found.
6156       return SDValue();
6157
6158     case ISD::BUILD_VECTOR: {
6159       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6160       BitVector UndefElements;
6161       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6162
6163       // We need a splat of a single value to use broadcast, and it doesn't
6164       // make any sense if the value is only in one element of the vector.
6165       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6166         return SDValue();
6167
6168       Ld = Splat;
6169       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6170                        Ld.getOpcode() == ISD::ConstantFP);
6171
6172       // Make sure that all of the users of a non-constant load are from the
6173       // BUILD_VECTOR node.
6174       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6175         return SDValue();
6176       break;
6177     }
6178
6179     case ISD::VECTOR_SHUFFLE: {
6180       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6181
6182       // Shuffles must have a splat mask where the first element is
6183       // broadcasted.
6184       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6185         return SDValue();
6186
6187       SDValue Sc = Op.getOperand(0);
6188       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6189           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6190
6191         if (!Subtarget->hasInt256())
6192           return SDValue();
6193
6194         // Use the register form of the broadcast instruction available on AVX2.
6195         if (VT.getSizeInBits() >= 256)
6196           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6197         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6198       }
6199
6200       Ld = Sc.getOperand(0);
6201       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6202                        Ld.getOpcode() == ISD::ConstantFP);
6203
6204       // The scalar_to_vector node and the suspected
6205       // load node must have exactly one user.
6206       // Constants may have multiple users.
6207
6208       // AVX-512 has register version of the broadcast
6209       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6210         Ld.getValueType().getSizeInBits() >= 32;
6211       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6212           !hasRegVer))
6213         return SDValue();
6214       break;
6215     }
6216   }
6217
6218   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6219   bool IsGE256 = (VT.getSizeInBits() >= 256);
6220
6221   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6222   // instruction to save 8 or more bytes of constant pool data.
6223   // TODO: If multiple splats are generated to load the same constant,
6224   // it may be detrimental to overall size. There needs to be a way to detect
6225   // that condition to know if this is truly a size win.
6226   const Function *F = DAG.getMachineFunction().getFunction();
6227   bool OptForSize = F->getAttributes().
6228     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6229
6230   // Handle broadcasting a single constant scalar from the constant pool
6231   // into a vector.
6232   // On Sandybridge (no AVX2), it is still better to load a constant vector
6233   // from the constant pool and not to broadcast it from a scalar.
6234   // But override that restriction when optimizing for size.
6235   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6236   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6237     EVT CVT = Ld.getValueType();
6238     assert(!CVT.isVector() && "Must not broadcast a vector type");
6239
6240     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6241     // For size optimization, also splat v2f64 and v2i64, and for size opt
6242     // with AVX2, also splat i8 and i16.
6243     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6244     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6245         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6246       const Constant *C = nullptr;
6247       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6248         C = CI->getConstantIntValue();
6249       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6250         C = CF->getConstantFPValue();
6251
6252       assert(C && "Invalid constant type");
6253
6254       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6255       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6256       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6257       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6258                        MachinePointerInfo::getConstantPool(),
6259                        false, false, false, Alignment);
6260
6261       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6262     }
6263   }
6264
6265   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6266
6267   // Handle AVX2 in-register broadcasts.
6268   if (!IsLoad && Subtarget->hasInt256() &&
6269       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6270     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6271
6272   // The scalar source must be a normal load.
6273   if (!IsLoad)
6274     return SDValue();
6275
6276   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6277       (Subtarget->hasVLX() && ScalarSize == 64))
6278     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6279
6280   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6281   // double since there is no vbroadcastsd xmm
6282   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6283     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6284       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6285   }
6286
6287   // Unsupported broadcast.
6288   return SDValue();
6289 }
6290
6291 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6292 /// underlying vector and index.
6293 ///
6294 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6295 /// index.
6296 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6297                                          SDValue ExtIdx) {
6298   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6299   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6300     return Idx;
6301
6302   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6303   // lowered this:
6304   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6305   // to:
6306   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6307   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6308   //                           undef)
6309   //                       Constant<0>)
6310   // In this case the vector is the extract_subvector expression and the index
6311   // is 2, as specified by the shuffle.
6312   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6313   SDValue ShuffleVec = SVOp->getOperand(0);
6314   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6315   assert(ShuffleVecVT.getVectorElementType() ==
6316          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6317
6318   int ShuffleIdx = SVOp->getMaskElt(Idx);
6319   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6320     ExtractedFromVec = ShuffleVec;
6321     return ShuffleIdx;
6322   }
6323   return Idx;
6324 }
6325
6326 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6327   MVT VT = Op.getSimpleValueType();
6328
6329   // Skip if insert_vec_elt is not supported.
6330   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6331   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6332     return SDValue();
6333
6334   SDLoc DL(Op);
6335   unsigned NumElems = Op.getNumOperands();
6336
6337   SDValue VecIn1;
6338   SDValue VecIn2;
6339   SmallVector<unsigned, 4> InsertIndices;
6340   SmallVector<int, 8> Mask(NumElems, -1);
6341
6342   for (unsigned i = 0; i != NumElems; ++i) {
6343     unsigned Opc = Op.getOperand(i).getOpcode();
6344
6345     if (Opc == ISD::UNDEF)
6346       continue;
6347
6348     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6349       // Quit if more than 1 elements need inserting.
6350       if (InsertIndices.size() > 1)
6351         return SDValue();
6352
6353       InsertIndices.push_back(i);
6354       continue;
6355     }
6356
6357     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6358     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6359     // Quit if non-constant index.
6360     if (!isa<ConstantSDNode>(ExtIdx))
6361       return SDValue();
6362     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6363
6364     // Quit if extracted from vector of different type.
6365     if (ExtractedFromVec.getValueType() != VT)
6366       return SDValue();
6367
6368     if (!VecIn1.getNode())
6369       VecIn1 = ExtractedFromVec;
6370     else if (VecIn1 != ExtractedFromVec) {
6371       if (!VecIn2.getNode())
6372         VecIn2 = ExtractedFromVec;
6373       else if (VecIn2 != ExtractedFromVec)
6374         // Quit if more than 2 vectors to shuffle
6375         return SDValue();
6376     }
6377
6378     if (ExtractedFromVec == VecIn1)
6379       Mask[i] = Idx;
6380     else if (ExtractedFromVec == VecIn2)
6381       Mask[i] = Idx + NumElems;
6382   }
6383
6384   if (!VecIn1.getNode())
6385     return SDValue();
6386
6387   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6388   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6389   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6390     unsigned Idx = InsertIndices[i];
6391     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6392                      DAG.getIntPtrConstant(Idx));
6393   }
6394
6395   return NV;
6396 }
6397
6398 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6399 SDValue
6400 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6401
6402   MVT VT = Op.getSimpleValueType();
6403   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6404          "Unexpected type in LowerBUILD_VECTORvXi1!");
6405
6406   SDLoc dl(Op);
6407   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6408     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6409     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6410     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6411   }
6412
6413   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6414     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6415     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6416     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6417   }
6418
6419   bool AllContants = true;
6420   uint64_t Immediate = 0;
6421   int NonConstIdx = -1;
6422   bool IsSplat = true;
6423   unsigned NumNonConsts = 0;
6424   unsigned NumConsts = 0;
6425   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6426     SDValue In = Op.getOperand(idx);
6427     if (In.getOpcode() == ISD::UNDEF)
6428       continue;
6429     if (!isa<ConstantSDNode>(In)) {
6430       AllContants = false;
6431       NonConstIdx = idx;
6432       NumNonConsts++;
6433     } else {
6434       NumConsts++;
6435       if (cast<ConstantSDNode>(In)->getZExtValue())
6436       Immediate |= (1ULL << idx);
6437     }
6438     if (In != Op.getOperand(0))
6439       IsSplat = false;
6440   }
6441
6442   if (AllContants) {
6443     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6444       DAG.getConstant(Immediate, MVT::i16));
6445     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6446                        DAG.getIntPtrConstant(0));
6447   }
6448
6449   if (NumNonConsts == 1 && NonConstIdx != 0) {
6450     SDValue DstVec;
6451     if (NumConsts) {
6452       SDValue VecAsImm = DAG.getConstant(Immediate,
6453                                          MVT::getIntegerVT(VT.getSizeInBits()));
6454       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6455     }
6456     else
6457       DstVec = DAG.getUNDEF(VT);
6458     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6459                        Op.getOperand(NonConstIdx),
6460                        DAG.getIntPtrConstant(NonConstIdx));
6461   }
6462   if (!IsSplat && (NonConstIdx != 0))
6463     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6464   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6465   SDValue Select;
6466   if (IsSplat)
6467     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6468                           DAG.getConstant(-1, SelectVT),
6469                           DAG.getConstant(0, SelectVT));
6470   else
6471     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6472                          DAG.getConstant((Immediate | 1), SelectVT),
6473                          DAG.getConstant(Immediate, SelectVT));
6474   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6475 }
6476
6477 /// \brief Return true if \p N implements a horizontal binop and return the
6478 /// operands for the horizontal binop into V0 and V1.
6479 ///
6480 /// This is a helper function of PerformBUILD_VECTORCombine.
6481 /// This function checks that the build_vector \p N in input implements a
6482 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6483 /// operation to match.
6484 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6485 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6486 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6487 /// arithmetic sub.
6488 ///
6489 /// This function only analyzes elements of \p N whose indices are
6490 /// in range [BaseIdx, LastIdx).
6491 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6492                               SelectionDAG &DAG,
6493                               unsigned BaseIdx, unsigned LastIdx,
6494                               SDValue &V0, SDValue &V1) {
6495   EVT VT = N->getValueType(0);
6496
6497   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6498   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6499          "Invalid Vector in input!");
6500
6501   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6502   bool CanFold = true;
6503   unsigned ExpectedVExtractIdx = BaseIdx;
6504   unsigned NumElts = LastIdx - BaseIdx;
6505   V0 = DAG.getUNDEF(VT);
6506   V1 = DAG.getUNDEF(VT);
6507
6508   // Check if N implements a horizontal binop.
6509   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6510     SDValue Op = N->getOperand(i + BaseIdx);
6511
6512     // Skip UNDEFs.
6513     if (Op->getOpcode() == ISD::UNDEF) {
6514       // Update the expected vector extract index.
6515       if (i * 2 == NumElts)
6516         ExpectedVExtractIdx = BaseIdx;
6517       ExpectedVExtractIdx += 2;
6518       continue;
6519     }
6520
6521     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6522
6523     if (!CanFold)
6524       break;
6525
6526     SDValue Op0 = Op.getOperand(0);
6527     SDValue Op1 = Op.getOperand(1);
6528
6529     // Try to match the following pattern:
6530     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6531     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6532         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6533         Op0.getOperand(0) == Op1.getOperand(0) &&
6534         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6535         isa<ConstantSDNode>(Op1.getOperand(1)));
6536     if (!CanFold)
6537       break;
6538
6539     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6540     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6541
6542     if (i * 2 < NumElts) {
6543       if (V0.getOpcode() == ISD::UNDEF)
6544         V0 = Op0.getOperand(0);
6545     } else {
6546       if (V1.getOpcode() == ISD::UNDEF)
6547         V1 = Op0.getOperand(0);
6548       if (i * 2 == NumElts)
6549         ExpectedVExtractIdx = BaseIdx;
6550     }
6551
6552     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6553     if (I0 == ExpectedVExtractIdx)
6554       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6555     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6556       // Try to match the following dag sequence:
6557       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6558       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6559     } else
6560       CanFold = false;
6561
6562     ExpectedVExtractIdx += 2;
6563   }
6564
6565   return CanFold;
6566 }
6567
6568 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6569 /// a concat_vector.
6570 ///
6571 /// This is a helper function of PerformBUILD_VECTORCombine.
6572 /// This function expects two 256-bit vectors called V0 and V1.
6573 /// At first, each vector is split into two separate 128-bit vectors.
6574 /// Then, the resulting 128-bit vectors are used to implement two
6575 /// horizontal binary operations.
6576 ///
6577 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6578 ///
6579 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6580 /// the two new horizontal binop.
6581 /// When Mode is set, the first horizontal binop dag node would take as input
6582 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6583 /// horizontal binop dag node would take as input the lower 128-bit of V1
6584 /// and the upper 128-bit of V1.
6585 ///   Example:
6586 ///     HADD V0_LO, V0_HI
6587 ///     HADD V1_LO, V1_HI
6588 ///
6589 /// Otherwise, the first horizontal binop dag node takes as input the lower
6590 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6591 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6592 ///   Example:
6593 ///     HADD V0_LO, V1_LO
6594 ///     HADD V0_HI, V1_HI
6595 ///
6596 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6597 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6598 /// the upper 128-bits of the result.
6599 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6600                                      SDLoc DL, SelectionDAG &DAG,
6601                                      unsigned X86Opcode, bool Mode,
6602                                      bool isUndefLO, bool isUndefHI) {
6603   EVT VT = V0.getValueType();
6604   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6605          "Invalid nodes in input!");
6606
6607   unsigned NumElts = VT.getVectorNumElements();
6608   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6609   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6610   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6611   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6612   EVT NewVT = V0_LO.getValueType();
6613
6614   SDValue LO = DAG.getUNDEF(NewVT);
6615   SDValue HI = DAG.getUNDEF(NewVT);
6616
6617   if (Mode) {
6618     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6619     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6620       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6621     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6622       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6623   } else {
6624     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6625     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6626                        V1_LO->getOpcode() != ISD::UNDEF))
6627       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6628
6629     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6630                        V1_HI->getOpcode() != ISD::UNDEF))
6631       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6632   }
6633
6634   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6635 }
6636
6637 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6638 /// sequence of 'vadd + vsub + blendi'.
6639 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6640                            const X86Subtarget *Subtarget) {
6641   SDLoc DL(BV);
6642   EVT VT = BV->getValueType(0);
6643   unsigned NumElts = VT.getVectorNumElements();
6644   SDValue InVec0 = DAG.getUNDEF(VT);
6645   SDValue InVec1 = DAG.getUNDEF(VT);
6646
6647   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6648           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6649
6650   // Odd-numbered elements in the input build vector are obtained from
6651   // adding two integer/float elements.
6652   // Even-numbered elements in the input build vector are obtained from
6653   // subtracting two integer/float elements.
6654   unsigned ExpectedOpcode = ISD::FSUB;
6655   unsigned NextExpectedOpcode = ISD::FADD;
6656   bool AddFound = false;
6657   bool SubFound = false;
6658
6659   for (unsigned i = 0, e = NumElts; i != e; i++) {
6660     SDValue Op = BV->getOperand(i);
6661
6662     // Skip 'undef' values.
6663     unsigned Opcode = Op.getOpcode();
6664     if (Opcode == ISD::UNDEF) {
6665       std::swap(ExpectedOpcode, NextExpectedOpcode);
6666       continue;
6667     }
6668
6669     // Early exit if we found an unexpected opcode.
6670     if (Opcode != ExpectedOpcode)
6671       return SDValue();
6672
6673     SDValue Op0 = Op.getOperand(0);
6674     SDValue Op1 = Op.getOperand(1);
6675
6676     // Try to match the following pattern:
6677     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6678     // Early exit if we cannot match that sequence.
6679     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6680         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6681         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6682         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6683         Op0.getOperand(1) != Op1.getOperand(1))
6684       return SDValue();
6685
6686     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6687     if (I0 != i)
6688       return SDValue();
6689
6690     // We found a valid add/sub node. Update the information accordingly.
6691     if (i & 1)
6692       AddFound = true;
6693     else
6694       SubFound = true;
6695
6696     // Update InVec0 and InVec1.
6697     if (InVec0.getOpcode() == ISD::UNDEF)
6698       InVec0 = Op0.getOperand(0);
6699     if (InVec1.getOpcode() == ISD::UNDEF)
6700       InVec1 = Op1.getOperand(0);
6701
6702     // Make sure that operands in input to each add/sub node always
6703     // come from a same pair of vectors.
6704     if (InVec0 != Op0.getOperand(0)) {
6705       if (ExpectedOpcode == ISD::FSUB)
6706         return SDValue();
6707
6708       // FADD is commutable. Try to commute the operands
6709       // and then test again.
6710       std::swap(Op0, Op1);
6711       if (InVec0 != Op0.getOperand(0))
6712         return SDValue();
6713     }
6714
6715     if (InVec1 != Op1.getOperand(0))
6716       return SDValue();
6717
6718     // Update the pair of expected opcodes.
6719     std::swap(ExpectedOpcode, NextExpectedOpcode);
6720   }
6721
6722   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6723   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6724       InVec1.getOpcode() != ISD::UNDEF)
6725     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6726
6727   return SDValue();
6728 }
6729
6730 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6731                                           const X86Subtarget *Subtarget) {
6732   SDLoc DL(N);
6733   EVT VT = N->getValueType(0);
6734   unsigned NumElts = VT.getVectorNumElements();
6735   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6736   SDValue InVec0, InVec1;
6737
6738   // Try to match an ADDSUB.
6739   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6740       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6741     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6742     if (Value.getNode())
6743       return Value;
6744   }
6745
6746   // Try to match horizontal ADD/SUB.
6747   unsigned NumUndefsLO = 0;
6748   unsigned NumUndefsHI = 0;
6749   unsigned Half = NumElts/2;
6750
6751   // Count the number of UNDEF operands in the build_vector in input.
6752   for (unsigned i = 0, e = Half; i != e; ++i)
6753     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6754       NumUndefsLO++;
6755
6756   for (unsigned i = Half, e = NumElts; i != e; ++i)
6757     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6758       NumUndefsHI++;
6759
6760   // Early exit if this is either a build_vector of all UNDEFs or all the
6761   // operands but one are UNDEF.
6762   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6763     return SDValue();
6764
6765   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6766     // Try to match an SSE3 float HADD/HSUB.
6767     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6768       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6769
6770     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6771       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6772   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6773     // Try to match an SSSE3 integer HADD/HSUB.
6774     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6775       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6776
6777     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6778       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6779   }
6780
6781   if (!Subtarget->hasAVX())
6782     return SDValue();
6783
6784   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6785     // Try to match an AVX horizontal add/sub of packed single/double
6786     // precision floating point values from 256-bit vectors.
6787     SDValue InVec2, InVec3;
6788     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6789         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6790         ((InVec0.getOpcode() == ISD::UNDEF ||
6791           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6792         ((InVec1.getOpcode() == ISD::UNDEF ||
6793           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6794       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6795
6796     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6797         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6798         ((InVec0.getOpcode() == ISD::UNDEF ||
6799           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6800         ((InVec1.getOpcode() == ISD::UNDEF ||
6801           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6802       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6803   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6804     // Try to match an AVX2 horizontal add/sub of signed integers.
6805     SDValue InVec2, InVec3;
6806     unsigned X86Opcode;
6807     bool CanFold = true;
6808
6809     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6810         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6811         ((InVec0.getOpcode() == ISD::UNDEF ||
6812           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6813         ((InVec1.getOpcode() == ISD::UNDEF ||
6814           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6815       X86Opcode = X86ISD::HADD;
6816     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6817         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6818         ((InVec0.getOpcode() == ISD::UNDEF ||
6819           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6820         ((InVec1.getOpcode() == ISD::UNDEF ||
6821           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6822       X86Opcode = X86ISD::HSUB;
6823     else
6824       CanFold = false;
6825
6826     if (CanFold) {
6827       // Fold this build_vector into a single horizontal add/sub.
6828       // Do this only if the target has AVX2.
6829       if (Subtarget->hasAVX2())
6830         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6831
6832       // Do not try to expand this build_vector into a pair of horizontal
6833       // add/sub if we can emit a pair of scalar add/sub.
6834       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6835         return SDValue();
6836
6837       // Convert this build_vector into a pair of horizontal binop followed by
6838       // a concat vector.
6839       bool isUndefLO = NumUndefsLO == Half;
6840       bool isUndefHI = NumUndefsHI == Half;
6841       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6842                                    isUndefLO, isUndefHI);
6843     }
6844   }
6845
6846   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6847        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6848     unsigned X86Opcode;
6849     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6850       X86Opcode = X86ISD::HADD;
6851     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6852       X86Opcode = X86ISD::HSUB;
6853     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6854       X86Opcode = X86ISD::FHADD;
6855     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6856       X86Opcode = X86ISD::FHSUB;
6857     else
6858       return SDValue();
6859
6860     // Don't try to expand this build_vector into a pair of horizontal add/sub
6861     // if we can simply emit a pair of scalar add/sub.
6862     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6863       return SDValue();
6864
6865     // Convert this build_vector into two horizontal add/sub followed by
6866     // a concat vector.
6867     bool isUndefLO = NumUndefsLO == Half;
6868     bool isUndefHI = NumUndefsHI == Half;
6869     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6870                                  isUndefLO, isUndefHI);
6871   }
6872
6873   return SDValue();
6874 }
6875
6876 SDValue
6877 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6878   SDLoc dl(Op);
6879
6880   MVT VT = Op.getSimpleValueType();
6881   MVT ExtVT = VT.getVectorElementType();
6882   unsigned NumElems = Op.getNumOperands();
6883
6884   // Generate vectors for predicate vectors.
6885   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6886     return LowerBUILD_VECTORvXi1(Op, DAG);
6887
6888   // Vectors containing all zeros can be matched by pxor and xorps later
6889   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6890     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6891     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6892     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6893       return Op;
6894
6895     return getZeroVector(VT, Subtarget, DAG, dl);
6896   }
6897
6898   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6899   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6900   // vpcmpeqd on 256-bit vectors.
6901   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6902     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6903       return Op;
6904
6905     if (!VT.is512BitVector())
6906       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6907   }
6908
6909   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6910   if (Broadcast.getNode())
6911     return Broadcast;
6912
6913   unsigned EVTBits = ExtVT.getSizeInBits();
6914
6915   unsigned NumZero  = 0;
6916   unsigned NumNonZero = 0;
6917   unsigned NonZeros = 0;
6918   bool IsAllConstants = true;
6919   SmallSet<SDValue, 8> Values;
6920   for (unsigned i = 0; i < NumElems; ++i) {
6921     SDValue Elt = Op.getOperand(i);
6922     if (Elt.getOpcode() == ISD::UNDEF)
6923       continue;
6924     Values.insert(Elt);
6925     if (Elt.getOpcode() != ISD::Constant &&
6926         Elt.getOpcode() != ISD::ConstantFP)
6927       IsAllConstants = false;
6928     if (X86::isZeroNode(Elt))
6929       NumZero++;
6930     else {
6931       NonZeros |= (1 << i);
6932       NumNonZero++;
6933     }
6934   }
6935
6936   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6937   if (NumNonZero == 0)
6938     return DAG.getUNDEF(VT);
6939
6940   // Special case for single non-zero, non-undef, element.
6941   if (NumNonZero == 1) {
6942     unsigned Idx = countTrailingZeros(NonZeros);
6943     SDValue Item = Op.getOperand(Idx);
6944
6945     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6946     // the value are obviously zero, truncate the value to i32 and do the
6947     // insertion that way.  Only do this if the value is non-constant or if the
6948     // value is a constant being inserted into element 0.  It is cheaper to do
6949     // a constant pool load than it is to do a movd + shuffle.
6950     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6951         (!IsAllConstants || Idx == 0)) {
6952       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6953         // Handle SSE only.
6954         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6955         EVT VecVT = MVT::v4i32;
6956         unsigned VecElts = 4;
6957
6958         // Truncate the value (which may itself be a constant) to i32, and
6959         // convert it to a vector with movd (S2V+shuffle to zero extend).
6960         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6961         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6962
6963         // If using the new shuffle lowering, just directly insert this.
6964         if (ExperimentalVectorShuffleLowering)
6965           return DAG.getNode(
6966               ISD::BITCAST, dl, VT,
6967               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6968
6969         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6970
6971         // Now we have our 32-bit value zero extended in the low element of
6972         // a vector.  If Idx != 0, swizzle it into place.
6973         if (Idx != 0) {
6974           SmallVector<int, 4> Mask;
6975           Mask.push_back(Idx);
6976           for (unsigned i = 1; i != VecElts; ++i)
6977             Mask.push_back(i);
6978           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6979                                       &Mask[0]);
6980         }
6981         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6982       }
6983     }
6984
6985     // If we have a constant or non-constant insertion into the low element of
6986     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6987     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6988     // depending on what the source datatype is.
6989     if (Idx == 0) {
6990       if (NumZero == 0)
6991         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6992
6993       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6994           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6995         if (VT.is256BitVector() || VT.is512BitVector()) {
6996           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6997           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6998                              Item, DAG.getIntPtrConstant(0));
6999         }
7000         assert(VT.is128BitVector() && "Expected an SSE value type!");
7001         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7002         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7003         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7004       }
7005
7006       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7007         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7008         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7009         if (VT.is256BitVector()) {
7010           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
7011           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7012         } else {
7013           assert(VT.is128BitVector() && "Expected an SSE value type!");
7014           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7015         }
7016         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7017       }
7018     }
7019
7020     // Is it a vector logical left shift?
7021     if (NumElems == 2 && Idx == 1 &&
7022         X86::isZeroNode(Op.getOperand(0)) &&
7023         !X86::isZeroNode(Op.getOperand(1))) {
7024       unsigned NumBits = VT.getSizeInBits();
7025       return getVShift(true, VT,
7026                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7027                                    VT, Op.getOperand(1)),
7028                        NumBits/2, DAG, *this, dl);
7029     }
7030
7031     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7032       return SDValue();
7033
7034     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7035     // is a non-constant being inserted into an element other than the low one,
7036     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7037     // movd/movss) to move this into the low element, then shuffle it into
7038     // place.
7039     if (EVTBits == 32) {
7040       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7041
7042       // If using the new shuffle lowering, just directly insert this.
7043       if (ExperimentalVectorShuffleLowering)
7044         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7045
7046       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7047       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7048       SmallVector<int, 8> MaskVec;
7049       for (unsigned i = 0; i != NumElems; ++i)
7050         MaskVec.push_back(i == Idx ? 0 : 1);
7051       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7052     }
7053   }
7054
7055   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7056   if (Values.size() == 1) {
7057     if (EVTBits == 32) {
7058       // Instead of a shuffle like this:
7059       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7060       // Check if it's possible to issue this instead.
7061       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7062       unsigned Idx = countTrailingZeros(NonZeros);
7063       SDValue Item = Op.getOperand(Idx);
7064       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7065         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7066     }
7067     return SDValue();
7068   }
7069
7070   // A vector full of immediates; various special cases are already
7071   // handled, so this is best done with a single constant-pool load.
7072   if (IsAllConstants)
7073     return SDValue();
7074
7075   // For AVX-length vectors, see if we can use a vector load to get all of the
7076   // elements, otherwise build the individual 128-bit pieces and use
7077   // shuffles to put them in place.
7078   if (VT.is256BitVector() || VT.is512BitVector()) {
7079     SmallVector<SDValue, 64> V;
7080     for (unsigned i = 0; i != NumElems; ++i)
7081       V.push_back(Op.getOperand(i));
7082
7083     // Check for a build vector of consecutive loads.
7084     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7085       return LD;
7086
7087     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7088
7089     // Build both the lower and upper subvector.
7090     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7091                                 makeArrayRef(&V[0], NumElems/2));
7092     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7093                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7094
7095     // Recreate the wider vector with the lower and upper part.
7096     if (VT.is256BitVector())
7097       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7098     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7099   }
7100
7101   // Let legalizer expand 2-wide build_vectors.
7102   if (EVTBits == 64) {
7103     if (NumNonZero == 1) {
7104       // One half is zero or undef.
7105       unsigned Idx = countTrailingZeros(NonZeros);
7106       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7107                                  Op.getOperand(Idx));
7108       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7109     }
7110     return SDValue();
7111   }
7112
7113   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7114   if (EVTBits == 8 && NumElems == 16) {
7115     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7116                                         Subtarget, *this);
7117     if (V.getNode()) return V;
7118   }
7119
7120   if (EVTBits == 16 && NumElems == 8) {
7121     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7122                                       Subtarget, *this);
7123     if (V.getNode()) return V;
7124   }
7125
7126   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7127   if (EVTBits == 32 && NumElems == 4) {
7128     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7129     if (V.getNode())
7130       return V;
7131   }
7132
7133   // If element VT is == 32 bits, turn it into a number of shuffles.
7134   SmallVector<SDValue, 8> V(NumElems);
7135   if (NumElems == 4 && NumZero > 0) {
7136     for (unsigned i = 0; i < 4; ++i) {
7137       bool isZero = !(NonZeros & (1 << i));
7138       if (isZero)
7139         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7140       else
7141         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7142     }
7143
7144     for (unsigned i = 0; i < 2; ++i) {
7145       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7146         default: break;
7147         case 0:
7148           V[i] = V[i*2];  // Must be a zero vector.
7149           break;
7150         case 1:
7151           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7152           break;
7153         case 2:
7154           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7155           break;
7156         case 3:
7157           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7158           break;
7159       }
7160     }
7161
7162     bool Reverse1 = (NonZeros & 0x3) == 2;
7163     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7164     int MaskVec[] = {
7165       Reverse1 ? 1 : 0,
7166       Reverse1 ? 0 : 1,
7167       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7168       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7169     };
7170     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7171   }
7172
7173   if (Values.size() > 1 && VT.is128BitVector()) {
7174     // Check for a build vector of consecutive loads.
7175     for (unsigned i = 0; i < NumElems; ++i)
7176       V[i] = Op.getOperand(i);
7177
7178     // Check for elements which are consecutive loads.
7179     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7180     if (LD.getNode())
7181       return LD;
7182
7183     // Check for a build vector from mostly shuffle plus few inserting.
7184     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7185     if (Sh.getNode())
7186       return Sh;
7187
7188     // For SSE 4.1, use insertps to put the high elements into the low element.
7189     if (Subtarget->hasSSE41()) {
7190       SDValue Result;
7191       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7192         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7193       else
7194         Result = DAG.getUNDEF(VT);
7195
7196       for (unsigned i = 1; i < NumElems; ++i) {
7197         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7198         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7199                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7200       }
7201       return Result;
7202     }
7203
7204     // Otherwise, expand into a number of unpckl*, start by extending each of
7205     // our (non-undef) elements to the full vector width with the element in the
7206     // bottom slot of the vector (which generates no code for SSE).
7207     for (unsigned i = 0; i < NumElems; ++i) {
7208       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7209         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7210       else
7211         V[i] = DAG.getUNDEF(VT);
7212     }
7213
7214     // Next, we iteratively mix elements, e.g. for v4f32:
7215     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7216     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7217     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7218     unsigned EltStride = NumElems >> 1;
7219     while (EltStride != 0) {
7220       for (unsigned i = 0; i < EltStride; ++i) {
7221         // If V[i+EltStride] is undef and this is the first round of mixing,
7222         // then it is safe to just drop this shuffle: V[i] is already in the
7223         // right place, the one element (since it's the first round) being
7224         // inserted as undef can be dropped.  This isn't safe for successive
7225         // rounds because they will permute elements within both vectors.
7226         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7227             EltStride == NumElems/2)
7228           continue;
7229
7230         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7231       }
7232       EltStride >>= 1;
7233     }
7234     return V[0];
7235   }
7236   return SDValue();
7237 }
7238
7239 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7240 // to create 256-bit vectors from two other 128-bit ones.
7241 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7242   SDLoc dl(Op);
7243   MVT ResVT = Op.getSimpleValueType();
7244
7245   assert((ResVT.is256BitVector() ||
7246           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7247
7248   SDValue V1 = Op.getOperand(0);
7249   SDValue V2 = Op.getOperand(1);
7250   unsigned NumElems = ResVT.getVectorNumElements();
7251   if(ResVT.is256BitVector())
7252     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7253
7254   if (Op.getNumOperands() == 4) {
7255     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7256                                 ResVT.getVectorNumElements()/2);
7257     SDValue V3 = Op.getOperand(2);
7258     SDValue V4 = Op.getOperand(3);
7259     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7260       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7261   }
7262   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7263 }
7264
7265 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7266   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7267   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7268          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7269           Op.getNumOperands() == 4)));
7270
7271   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7272   // from two other 128-bit ones.
7273
7274   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7275   return LowerAVXCONCAT_VECTORS(Op, DAG);
7276 }
7277
7278
7279 //===----------------------------------------------------------------------===//
7280 // Vector shuffle lowering
7281 //
7282 // This is an experimental code path for lowering vector shuffles on x86. It is
7283 // designed to handle arbitrary vector shuffles and blends, gracefully
7284 // degrading performance as necessary. It works hard to recognize idiomatic
7285 // shuffles and lower them to optimal instruction patterns without leaving
7286 // a framework that allows reasonably efficient handling of all vector shuffle
7287 // patterns.
7288 //===----------------------------------------------------------------------===//
7289
7290 /// \brief Tiny helper function to identify a no-op mask.
7291 ///
7292 /// This is a somewhat boring predicate function. It checks whether the mask
7293 /// array input, which is assumed to be a single-input shuffle mask of the kind
7294 /// used by the X86 shuffle instructions (not a fully general
7295 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7296 /// in-place shuffle are 'no-op's.
7297 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7298   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7299     if (Mask[i] != -1 && Mask[i] != i)
7300       return false;
7301   return true;
7302 }
7303
7304 /// \brief Helper function to classify a mask as a single-input mask.
7305 ///
7306 /// This isn't a generic single-input test because in the vector shuffle
7307 /// lowering we canonicalize single inputs to be the first input operand. This
7308 /// means we can more quickly test for a single input by only checking whether
7309 /// an input from the second operand exists. We also assume that the size of
7310 /// mask corresponds to the size of the input vectors which isn't true in the
7311 /// fully general case.
7312 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7313   for (int M : Mask)
7314     if (M >= (int)Mask.size())
7315       return false;
7316   return true;
7317 }
7318
7319 /// \brief Test whether there are elements crossing 128-bit lanes in this
7320 /// shuffle mask.
7321 ///
7322 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7323 /// and we routinely test for these.
7324 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7325   int LaneSize = 128 / VT.getScalarSizeInBits();
7326   int Size = Mask.size();
7327   for (int i = 0; i < Size; ++i)
7328     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7329       return true;
7330   return false;
7331 }
7332
7333 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7334 ///
7335 /// This checks a shuffle mask to see if it is performing the same
7336 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7337 /// that it is also not lane-crossing. It may however involve a blend from the
7338 /// same lane of a second vector.
7339 ///
7340 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7341 /// non-trivial to compute in the face of undef lanes. The representation is
7342 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7343 /// entries from both V1 and V2 inputs to the wider mask.
7344 static bool
7345 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7346                                 SmallVectorImpl<int> &RepeatedMask) {
7347   int LaneSize = 128 / VT.getScalarSizeInBits();
7348   RepeatedMask.resize(LaneSize, -1);
7349   int Size = Mask.size();
7350   for (int i = 0; i < Size; ++i) {
7351     if (Mask[i] < 0)
7352       continue;
7353     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7354       // This entry crosses lanes, so there is no way to model this shuffle.
7355       return false;
7356
7357     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7358     if (RepeatedMask[i % LaneSize] == -1)
7359       // This is the first non-undef entry in this slot of a 128-bit lane.
7360       RepeatedMask[i % LaneSize] =
7361           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7362     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7363       // Found a mismatch with the repeated mask.
7364       return false;
7365   }
7366   return true;
7367 }
7368
7369 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7370 // 2013 will allow us to use it as a non-type template parameter.
7371 namespace {
7372
7373 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7374 ///
7375 /// See its documentation for details.
7376 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7377   if (Mask.size() != Args.size())
7378     return false;
7379   for (int i = 0, e = Mask.size(); i < e; ++i) {
7380     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7381     if (Mask[i] != -1 && Mask[i] != *Args[i])
7382       return false;
7383   }
7384   return true;
7385 }
7386
7387 } // namespace
7388
7389 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7390 /// arguments.
7391 ///
7392 /// This is a fast way to test a shuffle mask against a fixed pattern:
7393 ///
7394 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7395 ///
7396 /// It returns true if the mask is exactly as wide as the argument list, and
7397 /// each element of the mask is either -1 (signifying undef) or the value given
7398 /// in the argument.
7399 static const VariadicFunction1<
7400     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7401
7402 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7403 ///
7404 /// This helper function produces an 8-bit shuffle immediate corresponding to
7405 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7406 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7407 /// example.
7408 ///
7409 /// NB: We rely heavily on "undef" masks preserving the input lane.
7410 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7411                                           SelectionDAG &DAG) {
7412   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7413   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7414   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7415   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7416   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7417
7418   unsigned Imm = 0;
7419   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7420   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7421   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7422   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7423   return DAG.getConstant(Imm, MVT::i8);
7424 }
7425
7426 /// \brief Try to emit a blend instruction for a shuffle.
7427 ///
7428 /// This doesn't do any checks for the availability of instructions for blending
7429 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7430 /// be matched in the backend with the type given. What it does check for is
7431 /// that the shuffle mask is in fact a blend.
7432 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7433                                          SDValue V2, ArrayRef<int> Mask,
7434                                          const X86Subtarget *Subtarget,
7435                                          SelectionDAG &DAG) {
7436
7437   unsigned BlendMask = 0;
7438   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7439     if (Mask[i] >= Size) {
7440       if (Mask[i] != i + Size)
7441         return SDValue(); // Shuffled V2 input!
7442       BlendMask |= 1u << i;
7443       continue;
7444     }
7445     if (Mask[i] >= 0 && Mask[i] != i)
7446       return SDValue(); // Shuffled V1 input!
7447   }
7448   switch (VT.SimpleTy) {
7449   case MVT::v2f64:
7450   case MVT::v4f32:
7451   case MVT::v4f64:
7452   case MVT::v8f32:
7453     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7454                        DAG.getConstant(BlendMask, MVT::i8));
7455
7456   case MVT::v4i64:
7457   case MVT::v8i32:
7458     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7459     // FALLTHROUGH
7460   case MVT::v2i64:
7461   case MVT::v4i32:
7462     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7463     // that instruction.
7464     if (Subtarget->hasAVX2()) {
7465       // Scale the blend by the number of 32-bit dwords per element.
7466       int Scale =  VT.getScalarSizeInBits() / 32;
7467       BlendMask = 0;
7468       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7469         if (Mask[i] >= Size)
7470           for (int j = 0; j < Scale; ++j)
7471             BlendMask |= 1u << (i * Scale + j);
7472
7473       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7474       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7475       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7476       return DAG.getNode(ISD::BITCAST, DL, VT,
7477                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7478                                      DAG.getConstant(BlendMask, MVT::i8)));
7479     }
7480     // FALLTHROUGH
7481   case MVT::v8i16: {
7482     // For integer shuffles we need to expand the mask and cast the inputs to
7483     // v8i16s prior to blending.
7484     int Scale = 8 / VT.getVectorNumElements();
7485     BlendMask = 0;
7486     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7487       if (Mask[i] >= Size)
7488         for (int j = 0; j < Scale; ++j)
7489           BlendMask |= 1u << (i * Scale + j);
7490
7491     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7492     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7493     return DAG.getNode(ISD::BITCAST, DL, VT,
7494                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7495                                    DAG.getConstant(BlendMask, MVT::i8)));
7496   }
7497
7498   case MVT::v16i16: {
7499     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7500     SmallVector<int, 8> RepeatedMask;
7501     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7502       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7503       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7504       BlendMask = 0;
7505       for (int i = 0; i < 8; ++i)
7506         if (RepeatedMask[i] >= 16)
7507           BlendMask |= 1u << i;
7508       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7509                          DAG.getConstant(BlendMask, MVT::i8));
7510     }
7511   }
7512     // FALLTHROUGH
7513   case MVT::v32i8: {
7514     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7515     // Scale the blend by the number of bytes per element.
7516     int Scale =  VT.getScalarSizeInBits() / 8;
7517     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7518
7519     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7520     // mix of LLVM's code generator and the x86 backend. We tell the code
7521     // generator that boolean values in the elements of an x86 vector register
7522     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7523     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7524     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7525     // of the element (the remaining are ignored) and 0 in that high bit would
7526     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7527     // the LLVM model for boolean values in vector elements gets the relevant
7528     // bit set, it is set backwards and over constrained relative to x86's
7529     // actual model.
7530     SDValue VSELECTMask[32];
7531     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7532       for (int j = 0; j < Scale; ++j)
7533         VSELECTMask[Scale * i + j] =
7534             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7535                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7536
7537     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7538     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7539     return DAG.getNode(
7540         ISD::BITCAST, DL, VT,
7541         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7542                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7543                     V1, V2));
7544   }
7545
7546   default:
7547     llvm_unreachable("Not a supported integer vector type!");
7548   }
7549 }
7550
7551 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7552 /// unblended shuffles followed by an unshuffled blend.
7553 ///
7554 /// This matches the extremely common pattern for handling combined
7555 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7556 /// operations.
7557 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7558                                                           SDValue V1,
7559                                                           SDValue V2,
7560                                                           ArrayRef<int> Mask,
7561                                                           SelectionDAG &DAG) {
7562   // Shuffle the input elements into the desired positions in V1 and V2 and
7563   // blend them together.
7564   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7565   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7566   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7567   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7568     if (Mask[i] >= 0 && Mask[i] < Size) {
7569       V1Mask[i] = Mask[i];
7570       BlendMask[i] = i;
7571     } else if (Mask[i] >= Size) {
7572       V2Mask[i] = Mask[i] - Size;
7573       BlendMask[i] = i + Size;
7574     }
7575
7576   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7577   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7578   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7579 }
7580
7581 /// \brief Try to lower a vector shuffle as a byte rotation.
7582 ///
7583 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7584 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7585 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7586 /// try to generically lower a vector shuffle through such an pattern. It
7587 /// does not check for the profitability of lowering either as PALIGNR or
7588 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7589 /// This matches shuffle vectors that look like:
7590 ///
7591 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7592 ///
7593 /// Essentially it concatenates V1 and V2, shifts right by some number of
7594 /// elements, and takes the low elements as the result. Note that while this is
7595 /// specified as a *right shift* because x86 is little-endian, it is a *left
7596 /// rotate* of the vector lanes.
7597 ///
7598 /// Note that this only handles 128-bit vector widths currently.
7599 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7600                                               SDValue V2,
7601                                               ArrayRef<int> Mask,
7602                                               const X86Subtarget *Subtarget,
7603                                               SelectionDAG &DAG) {
7604   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7605
7606   // We need to detect various ways of spelling a rotation:
7607   //   [11, 12, 13, 14, 15,  0,  1,  2]
7608   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7609   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7610   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7611   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7612   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7613   int Rotation = 0;
7614   SDValue Lo, Hi;
7615   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7616     if (Mask[i] == -1)
7617       continue;
7618     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7619
7620     // Based on the mod-Size value of this mask element determine where
7621     // a rotated vector would have started.
7622     int StartIdx = i - (Mask[i] % Size);
7623     if (StartIdx == 0)
7624       // The identity rotation isn't interesting, stop.
7625       return SDValue();
7626
7627     // If we found the tail of a vector the rotation must be the missing
7628     // front. If we found the head of a vector, it must be how much of the head.
7629     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7630
7631     if (Rotation == 0)
7632       Rotation = CandidateRotation;
7633     else if (Rotation != CandidateRotation)
7634       // The rotations don't match, so we can't match this mask.
7635       return SDValue();
7636
7637     // Compute which value this mask is pointing at.
7638     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7639
7640     // Compute which of the two target values this index should be assigned to.
7641     // This reflects whether the high elements are remaining or the low elements
7642     // are remaining.
7643     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7644
7645     // Either set up this value if we've not encountered it before, or check
7646     // that it remains consistent.
7647     if (!TargetV)
7648       TargetV = MaskV;
7649     else if (TargetV != MaskV)
7650       // This may be a rotation, but it pulls from the inputs in some
7651       // unsupported interleaving.
7652       return SDValue();
7653   }
7654
7655   // Check that we successfully analyzed the mask, and normalize the results.
7656   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7657   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7658   if (!Lo)
7659     Lo = Hi;
7660   else if (!Hi)
7661     Hi = Lo;
7662
7663   assert(VT.getSizeInBits() == 128 &&
7664          "Rotate-based lowering only supports 128-bit lowering!");
7665   assert(Mask.size() <= 16 &&
7666          "Can shuffle at most 16 bytes in a 128-bit vector!");
7667
7668   // The actual rotate instruction rotates bytes, so we need to scale the
7669   // rotation based on how many bytes are in the vector.
7670   int Scale = 16 / Mask.size();
7671
7672   // SSSE3 targets can use the palignr instruction
7673   if (Subtarget->hasSSSE3()) {
7674     // Cast the inputs to v16i8 to match PALIGNR.
7675     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7676     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7677
7678     return DAG.getNode(ISD::BITCAST, DL, VT,
7679                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7680                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7681   }
7682
7683   // Default SSE2 implementation
7684   int LoByteShift = 16 - Rotation * Scale;
7685   int HiByteShift = Rotation * Scale;
7686
7687   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7688   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7689   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7690
7691   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7692                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7693   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7694                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7695   return DAG.getNode(ISD::BITCAST, DL, VT,
7696                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7697 }
7698
7699 /// \brief Compute whether each element of a shuffle is zeroable.
7700 ///
7701 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7702 /// Either it is an undef element in the shuffle mask, the element of the input
7703 /// referenced is undef, or the element of the input referenced is known to be
7704 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7705 /// as many lanes with this technique as possible to simplify the remaining
7706 /// shuffle.
7707 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7708                                                      SDValue V1, SDValue V2) {
7709   SmallBitVector Zeroable(Mask.size(), false);
7710
7711   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7712   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7713
7714   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7715     int M = Mask[i];
7716     // Handle the easy cases.
7717     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7718       Zeroable[i] = true;
7719       continue;
7720     }
7721
7722     // If this is an index into a build_vector node, dig out the input value and
7723     // use it.
7724     SDValue V = M < Size ? V1 : V2;
7725     if (V.getOpcode() != ISD::BUILD_VECTOR)
7726       continue;
7727
7728     SDValue Input = V.getOperand(M % Size);
7729     // The UNDEF opcode check really should be dead code here, but not quite
7730     // worth asserting on (it isn't invalid, just unexpected).
7731     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7732       Zeroable[i] = true;
7733   }
7734
7735   return Zeroable;
7736 }
7737
7738 /// \brief Try to emit a bitmask instruction for a shuffle.
7739 ///
7740 /// This handles cases where we can model a blend exactly as a bitmask due to
7741 /// one of the inputs being zeroable.
7742 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
7743                                            SDValue V2, ArrayRef<int> Mask,
7744                                            SelectionDAG &DAG) {
7745   MVT EltVT = VT.getScalarType();
7746   int NumEltBits = EltVT.getSizeInBits();
7747   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7748   SDValue Zero = DAG.getConstant(0, IntEltVT);
7749   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
7750   if (EltVT.isFloatingPoint()) {
7751     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
7752     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
7753   }
7754   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7755   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7756   SDValue V;
7757   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7758     if (Zeroable[i])
7759       continue;
7760     if (Mask[i] % Size != i)
7761       return SDValue(); // Not a blend.
7762     if (!V)
7763       V = Mask[i] < Size ? V1 : V2;
7764     else if (V != (Mask[i] < Size ? V1 : V2))
7765       return SDValue(); // Can only let one input through the mask.
7766
7767     VMaskOps[i] = AllOnes;
7768   }
7769   if (!V)
7770     return SDValue(); // No non-zeroable elements!
7771
7772   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
7773   V = DAG.getNode(VT.isFloatingPoint()
7774                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7775                   DL, VT, V, VMask);
7776   return V;
7777 }
7778
7779 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7780 ///
7781 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7782 /// byte-shift instructions. The mask must consist of a shifted sequential
7783 /// shuffle from one of the input vectors and zeroable elements for the
7784 /// remaining 'shifted in' elements.
7785 ///
7786 /// Note that this only handles 128-bit vector widths currently.
7787 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7788                                              SDValue V2, ArrayRef<int> Mask,
7789                                              SelectionDAG &DAG) {
7790   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7791
7792   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7793
7794   int Size = Mask.size();
7795   int Scale = 16 / Size;
7796
7797   for (int Shift = 1; Shift < Size; Shift++) {
7798     int ByteShift = Shift * Scale;
7799
7800     // PSRLDQ : (little-endian) right byte shift
7801     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7802     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7803     // [  1, 2, -1, -1, -1, -1, zz, zz]
7804     bool ZeroableRight = true;
7805     for (int i = Size - Shift; i < Size; i++) {
7806       ZeroableRight &= Zeroable[i];
7807     }
7808
7809     if (ZeroableRight) {
7810       bool ValidShiftRight1 =
7811           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7812       bool ValidShiftRight2 =
7813           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7814
7815       if (ValidShiftRight1 || ValidShiftRight2) {
7816         // Cast the inputs to v2i64 to match PSRLDQ.
7817         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7818         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7819         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7820                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7821         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7822       }
7823     }
7824
7825     // PSLLDQ : (little-endian) left byte shift
7826     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7827     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7828     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7829     bool ZeroableLeft = true;
7830     for (int i = 0; i < Shift; i++) {
7831       ZeroableLeft &= Zeroable[i];
7832     }
7833
7834     if (ZeroableLeft) {
7835       bool ValidShiftLeft1 =
7836           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7837       bool ValidShiftLeft2 =
7838           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7839
7840       if (ValidShiftLeft1 || ValidShiftLeft2) {
7841         // Cast the inputs to v2i64 to match PSLLDQ.
7842         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7843         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7844         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7845                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7846         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7847       }
7848     }
7849   }
7850
7851   return SDValue();
7852 }
7853
7854 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7855 ///
7856 /// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
7857 /// SSE2 and AVX2 logical bit-shift instructions. The function matches
7858 /// elements from one of the input vectors shuffled to the left or right
7859 /// with zeroable elements 'shifted in'.
7860 static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
7861                                             SDValue V2, ArrayRef<int> Mask,
7862                                             SelectionDAG &DAG) {
7863   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7864
7865   int Size = Mask.size();
7866   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7867
7868   // PSRL : (little-endian) right bit shift.
7869   // [  1, zz,  3, zz]
7870   // [ -1, -1,  7, zz]
7871   // PSHL : (little-endian) left bit shift.
7872   // [ zz, 0, zz,  2 ]
7873   // [ -1, 4, zz, -1 ]
7874   auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
7875     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7876     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
7877     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7878            "Illegal integer vector type");
7879
7880     bool MatchLeft = true, MatchRight = true;
7881     for (int i = 0; i != Size; i += Scale) {
7882       for (int j = 0; j != Shift; j++) {
7883         MatchLeft &= Zeroable[i + j];
7884       }
7885       for (int j = Scale - Shift; j != Scale; j++) {
7886         MatchRight &= Zeroable[i + j];
7887       }
7888     }
7889     if (!(MatchLeft || MatchRight))
7890       return SDValue();
7891
7892     bool MatchV1 = true, MatchV2 = true;
7893     for (int i = 0; i != Size; i += Scale) {
7894       unsigned Pos = MatchLeft ? i + Shift : i;
7895       unsigned Low = MatchLeft ? i : i + Shift;
7896       unsigned Len = Scale - Shift;
7897       MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
7898       MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
7899     }
7900     if (!(MatchV1 || MatchV2))
7901       return SDValue();
7902
7903     // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
7904     unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
7905     int ShiftAmt = Shift * VT.getScalarSizeInBits();
7906     SDValue V = MatchV1 ? V1 : V2;
7907     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
7908     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
7909     return DAG.getNode(ISD::BITCAST, DL, VT, V);
7910   };
7911
7912   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7913   // keep doubling the size of the integer elements up to that. We can
7914   // then shift the elements of the integer vector by whole multiples of
7915   // their width within the elements of the larger integer vector. Test each
7916   // multiple to see if we can find a match with the moved element indices
7917   // and that the shifted in elements are all zeroable.
7918   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
7919     for (int Shift = 1; Shift != Scale; Shift++)
7920       if (SDValue BitShift = MatchBitShift(Shift, Scale))
7921         return BitShift;
7922
7923   // no match
7924   return SDValue();
7925 }
7926
7927 /// \brief Lower a vector shuffle as a zero or any extension.
7928 ///
7929 /// Given a specific number of elements, element bit width, and extension
7930 /// stride, produce either a zero or any extension based on the available
7931 /// features of the subtarget.
7932 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7933     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
7934     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7935   assert(Scale > 1 && "Need a scale to extend.");
7936   int NumElements = VT.getVectorNumElements();
7937   int EltBits = VT.getScalarSizeInBits();
7938   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7939          "Only 8, 16, and 32 bit elements can be extended.");
7940   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7941
7942   // Found a valid zext mask! Try various lowering strategies based on the
7943   // input type and available ISA extensions.
7944   if (Subtarget->hasSSE41()) {
7945     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7946                                  NumElements / Scale);
7947     return DAG.getNode(ISD::BITCAST, DL, VT,
7948                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7949   }
7950
7951   // For any extends we can cheat for larger element sizes and use shuffle
7952   // instructions that can fold with a load and/or copy.
7953   if (AnyExt && EltBits == 32) {
7954     int PSHUFDMask[4] = {0, -1, 1, -1};
7955     return DAG.getNode(
7956         ISD::BITCAST, DL, VT,
7957         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7958                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7959                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7960   }
7961   if (AnyExt && EltBits == 16 && Scale > 2) {
7962     int PSHUFDMask[4] = {0, -1, 0, -1};
7963     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7964                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7965                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7966     int PSHUFHWMask[4] = {1, -1, -1, -1};
7967     return DAG.getNode(
7968         ISD::BITCAST, DL, VT,
7969         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7970                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7971                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7972   }
7973
7974   // If this would require more than 2 unpack instructions to expand, use
7975   // pshufb when available. We can only use more than 2 unpack instructions
7976   // when zero extending i8 elements which also makes it easier to use pshufb.
7977   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7978     assert(NumElements == 16 && "Unexpected byte vector width!");
7979     SDValue PSHUFBMask[16];
7980     for (int i = 0; i < 16; ++i)
7981       PSHUFBMask[i] =
7982           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7983     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7984     return DAG.getNode(ISD::BITCAST, DL, VT,
7985                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7986                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7987                                                MVT::v16i8, PSHUFBMask)));
7988   }
7989
7990   // Otherwise emit a sequence of unpacks.
7991   do {
7992     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7993     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7994                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7995     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7996     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7997     Scale /= 2;
7998     EltBits *= 2;
7999     NumElements /= 2;
8000   } while (Scale > 1);
8001   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
8002 }
8003
8004 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8005 ///
8006 /// This routine will try to do everything in its power to cleverly lower
8007 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8008 /// check for the profitability of this lowering,  it tries to aggressively
8009 /// match this pattern. It will use all of the micro-architectural details it
8010 /// can to emit an efficient lowering. It handles both blends with all-zero
8011 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8012 /// masking out later).
8013 ///
8014 /// The reason we have dedicated lowering for zext-style shuffles is that they
8015 /// are both incredibly common and often quite performance sensitive.
8016 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8017     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8018     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8019   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8020
8021   int Bits = VT.getSizeInBits();
8022   int NumElements = VT.getVectorNumElements();
8023   assert(VT.getScalarSizeInBits() <= 32 &&
8024          "Exceeds 32-bit integer zero extension limit");
8025   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8026
8027   // Define a helper function to check a particular ext-scale and lower to it if
8028   // valid.
8029   auto Lower = [&](int Scale) -> SDValue {
8030     SDValue InputV;
8031     bool AnyExt = true;
8032     for (int i = 0; i < NumElements; ++i) {
8033       if (Mask[i] == -1)
8034         continue; // Valid anywhere but doesn't tell us anything.
8035       if (i % Scale != 0) {
8036         // Each of the extended elements need to be zeroable.
8037         if (!Zeroable[i])
8038           return SDValue();
8039
8040         // We no longer are in the anyext case.
8041         AnyExt = false;
8042         continue;
8043       }
8044
8045       // Each of the base elements needs to be consecutive indices into the
8046       // same input vector.
8047       SDValue V = Mask[i] < NumElements ? V1 : V2;
8048       if (!InputV)
8049         InputV = V;
8050       else if (InputV != V)
8051         return SDValue(); // Flip-flopping inputs.
8052
8053       if (Mask[i] % NumElements != i / Scale)
8054         return SDValue(); // Non-consecutive strided elements.
8055     }
8056
8057     // If we fail to find an input, we have a zero-shuffle which should always
8058     // have already been handled.
8059     // FIXME: Maybe handle this here in case during blending we end up with one?
8060     if (!InputV)
8061       return SDValue();
8062
8063     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8064         DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
8065   };
8066
8067   // The widest scale possible for extending is to a 64-bit integer.
8068   assert(Bits % 64 == 0 &&
8069          "The number of bits in a vector must be divisible by 64 on x86!");
8070   int NumExtElements = Bits / 64;
8071
8072   // Each iteration, try extending the elements half as much, but into twice as
8073   // many elements.
8074   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8075     assert(NumElements % NumExtElements == 0 &&
8076            "The input vector size must be divisible by the extended size.");
8077     if (SDValue V = Lower(NumElements / NumExtElements))
8078       return V;
8079   }
8080
8081   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8082   if (Bits != 128)
8083     return SDValue();
8084
8085   // Returns one of the source operands if the shuffle can be reduced to a
8086   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8087   auto CanZExtLowHalf = [&]() {
8088     for (int i = NumElements / 2; i != NumElements; i++)
8089       if (!Zeroable[i])
8090         return SDValue();
8091     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8092       return V1;
8093     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8094       return V2;
8095     return SDValue();
8096   };
8097
8098   if (SDValue V = CanZExtLowHalf()) {
8099     V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
8100     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8101     return DAG.getNode(ISD::BITCAST, DL, VT, V);
8102   }
8103
8104   // No viable ext lowering found.
8105   return SDValue();
8106 }
8107
8108 /// \brief Try to get a scalar value for a specific element of a vector.
8109 ///
8110 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8111 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8112                                               SelectionDAG &DAG) {
8113   MVT VT = V.getSimpleValueType();
8114   MVT EltVT = VT.getVectorElementType();
8115   while (V.getOpcode() == ISD::BITCAST)
8116     V = V.getOperand(0);
8117   // If the bitcasts shift the element size, we can't extract an equivalent
8118   // element from it.
8119   MVT NewVT = V.getSimpleValueType();
8120   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8121     return SDValue();
8122
8123   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8124       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
8125     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
8126
8127   return SDValue();
8128 }
8129
8130 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8131 ///
8132 /// This is particularly important because the set of instructions varies
8133 /// significantly based on whether the operand is a load or not.
8134 static bool isShuffleFoldableLoad(SDValue V) {
8135   while (V.getOpcode() == ISD::BITCAST)
8136     V = V.getOperand(0);
8137
8138   return ISD::isNON_EXTLoad(V.getNode());
8139 }
8140
8141 /// \brief Try to lower insertion of a single element into a zero vector.
8142 ///
8143 /// This is a common pattern that we have especially efficient patterns to lower
8144 /// across all subtarget feature sets.
8145 static SDValue lowerVectorShuffleAsElementInsertion(
8146     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8147     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8148   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8149   MVT ExtVT = VT;
8150   MVT EltVT = VT.getVectorElementType();
8151
8152   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8153                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8154                 Mask.begin();
8155   bool IsV1Zeroable = true;
8156   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8157     if (i != V2Index && !Zeroable[i]) {
8158       IsV1Zeroable = false;
8159       break;
8160     }
8161
8162   // Check for a single input from a SCALAR_TO_VECTOR node.
8163   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8164   // all the smarts here sunk into that routine. However, the current
8165   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8166   // vector shuffle lowering is dead.
8167   if (SDValue V2S = getScalarValueForVectorElement(
8168           V2, Mask[V2Index] - Mask.size(), DAG)) {
8169     // We need to zext the scalar if it is smaller than an i32.
8170     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8171     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8172       // Using zext to expand a narrow element won't work for non-zero
8173       // insertions.
8174       if (!IsV1Zeroable)
8175         return SDValue();
8176
8177       // Zero-extend directly to i32.
8178       ExtVT = MVT::v4i32;
8179       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8180     }
8181     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8182   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8183              EltVT == MVT::i16) {
8184     // Either not inserting from the low element of the input or the input
8185     // element size is too small to use VZEXT_MOVL to clear the high bits.
8186     return SDValue();
8187   }
8188
8189   if (!IsV1Zeroable) {
8190     // If V1 can't be treated as a zero vector we have fewer options to lower
8191     // this. We can't support integer vectors or non-zero targets cheaply, and
8192     // the V1 elements can't be permuted in any way.
8193     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8194     if (!VT.isFloatingPoint() || V2Index != 0)
8195       return SDValue();
8196     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8197     V1Mask[V2Index] = -1;
8198     if (!isNoopShuffleMask(V1Mask))
8199       return SDValue();
8200     // This is essentially a special case blend operation, but if we have
8201     // general purpose blend operations, they are always faster. Bail and let
8202     // the rest of the lowering handle these as blends.
8203     if (Subtarget->hasSSE41())
8204       return SDValue();
8205
8206     // Otherwise, use MOVSD or MOVSS.
8207     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8208            "Only two types of floating point element types to handle!");
8209     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8210                        ExtVT, V1, V2);
8211   }
8212
8213   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8214   if (ExtVT != VT)
8215     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8216
8217   if (V2Index != 0) {
8218     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8219     // the desired position. Otherwise it is more efficient to do a vector
8220     // shift left. We know that we can do a vector shift left because all
8221     // the inputs are zero.
8222     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8223       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8224       V2Shuffle[V2Index] = 0;
8225       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8226     } else {
8227       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8228       V2 = DAG.getNode(
8229           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8230           DAG.getConstant(
8231               V2Index * EltVT.getSizeInBits(),
8232               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8233       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8234     }
8235   }
8236   return V2;
8237 }
8238
8239 /// \brief Try to lower broadcast of a single element.
8240 ///
8241 /// For convenience, this code also bundles all of the subtarget feature set
8242 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8243 /// a convenient way to factor it out.
8244 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8245                                              ArrayRef<int> Mask,
8246                                              const X86Subtarget *Subtarget,
8247                                              SelectionDAG &DAG) {
8248   if (!Subtarget->hasAVX())
8249     return SDValue();
8250   if (VT.isInteger() && !Subtarget->hasAVX2())
8251     return SDValue();
8252
8253   // Check that the mask is a broadcast.
8254   int BroadcastIdx = -1;
8255   for (int M : Mask)
8256     if (M >= 0 && BroadcastIdx == -1)
8257       BroadcastIdx = M;
8258     else if (M >= 0 && M != BroadcastIdx)
8259       return SDValue();
8260
8261   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8262                                             "a sorted mask where the broadcast "
8263                                             "comes from V1.");
8264
8265   // Go up the chain of (vector) values to try and find a scalar load that
8266   // we can combine with the broadcast.
8267   for (;;) {
8268     switch (V.getOpcode()) {
8269     case ISD::CONCAT_VECTORS: {
8270       int OperandSize = Mask.size() / V.getNumOperands();
8271       V = V.getOperand(BroadcastIdx / OperandSize);
8272       BroadcastIdx %= OperandSize;
8273       continue;
8274     }
8275
8276     case ISD::INSERT_SUBVECTOR: {
8277       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8278       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8279       if (!ConstantIdx)
8280         break;
8281
8282       int BeginIdx = (int)ConstantIdx->getZExtValue();
8283       int EndIdx =
8284           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8285       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8286         BroadcastIdx -= BeginIdx;
8287         V = VInner;
8288       } else {
8289         V = VOuter;
8290       }
8291       continue;
8292     }
8293     }
8294     break;
8295   }
8296
8297   // Check if this is a broadcast of a scalar. We special case lowering
8298   // for scalars so that we can more effectively fold with loads.
8299   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8300       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8301     V = V.getOperand(BroadcastIdx);
8302
8303     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8304     // AVX2.
8305     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8306       return SDValue();
8307   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8308     // We can't broadcast from a vector register w/o AVX2, and we can only
8309     // broadcast from the zero-element of a vector register.
8310     return SDValue();
8311   }
8312
8313   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8314 }
8315
8316 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8317 // INSERTPS when the V1 elements are already in the correct locations
8318 // because otherwise we can just always use two SHUFPS instructions which
8319 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8320 // perform INSERTPS if a single V1 element is out of place and all V2
8321 // elements are zeroable.
8322 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8323                                             ArrayRef<int> Mask,
8324                                             SelectionDAG &DAG) {
8325   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8326   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8327   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8328   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8329
8330   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8331
8332   unsigned ZMask = 0;
8333   int V1DstIndex = -1;
8334   int V2DstIndex = -1;
8335   bool V1UsedInPlace = false;
8336
8337   for (int i = 0; i < 4; i++) {
8338     // Synthesize a zero mask from the zeroable elements (includes undefs).
8339     if (Zeroable[i]) {
8340       ZMask |= 1 << i;
8341       continue;
8342     }
8343
8344     // Flag if we use any V1 inputs in place.
8345     if (i == Mask[i]) {
8346       V1UsedInPlace = true;
8347       continue;
8348     }
8349
8350     // We can only insert a single non-zeroable element.
8351     if (V1DstIndex != -1 || V2DstIndex != -1)
8352       return SDValue();
8353
8354     if (Mask[i] < 4) {
8355       // V1 input out of place for insertion.
8356       V1DstIndex = i;
8357     } else {
8358       // V2 input for insertion.
8359       V2DstIndex = i;
8360     }
8361   }
8362
8363   // Don't bother if we have no (non-zeroable) element for insertion.
8364   if (V1DstIndex == -1 && V2DstIndex == -1)
8365     return SDValue();
8366
8367   // Determine element insertion src/dst indices. The src index is from the
8368   // start of the inserted vector, not the start of the concatenated vector.
8369   unsigned V2SrcIndex = 0;
8370   if (V1DstIndex != -1) {
8371     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8372     // and don't use the original V2 at all.
8373     V2SrcIndex = Mask[V1DstIndex];
8374     V2DstIndex = V1DstIndex;
8375     V2 = V1;
8376   } else {
8377     V2SrcIndex = Mask[V2DstIndex] - 4;
8378   }
8379
8380   // If no V1 inputs are used in place, then the result is created only from
8381   // the zero mask and the V2 insertion - so remove V1 dependency.
8382   if (!V1UsedInPlace)
8383     V1 = DAG.getUNDEF(MVT::v4f32);
8384
8385   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8386   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8387
8388   // Insert the V2 element into the desired position.
8389   SDLoc DL(Op);
8390   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8391                      DAG.getConstant(InsertPSMask, MVT::i8));
8392 }
8393
8394 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8395 ///
8396 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8397 /// support for floating point shuffles but not integer shuffles. These
8398 /// instructions will incur a domain crossing penalty on some chips though so
8399 /// it is better to avoid lowering through this for integer vectors where
8400 /// possible.
8401 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8402                                        const X86Subtarget *Subtarget,
8403                                        SelectionDAG &DAG) {
8404   SDLoc DL(Op);
8405   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8406   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8407   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8408   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8409   ArrayRef<int> Mask = SVOp->getMask();
8410   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8411
8412   if (isSingleInputShuffleMask(Mask)) {
8413     // Use low duplicate instructions for masks that match their pattern.
8414     if (Subtarget->hasSSE3())
8415       if (isShuffleEquivalent(Mask, 0, 0))
8416         return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
8417
8418     // Straight shuffle of a single input vector. Simulate this by using the
8419     // single input as both of the "inputs" to this instruction..
8420     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8421
8422     if (Subtarget->hasAVX()) {
8423       // If we have AVX, we can use VPERMILPS which will allow folding a load
8424       // into the shuffle.
8425       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8426                          DAG.getConstant(SHUFPDMask, MVT::i8));
8427     }
8428
8429     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8430                        DAG.getConstant(SHUFPDMask, MVT::i8));
8431   }
8432   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8433   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8434
8435   // Use dedicated unpack instructions for masks that match their pattern.
8436   if (isShuffleEquivalent(Mask, 0, 2))
8437     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8438   if (isShuffleEquivalent(Mask, 1, 3))
8439     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8440
8441   // If we have a single input, insert that into V1 if we can do so cheaply.
8442   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8443     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8444             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8445       return Insertion;
8446     // Try inverting the insertion since for v2 masks it is easy to do and we
8447     // can't reliably sort the mask one way or the other.
8448     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8449                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8450     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8451             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8452       return Insertion;
8453   }
8454
8455   // Try to use one of the special instruction patterns to handle two common
8456   // blend patterns if a zero-blend above didn't work.
8457   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8458     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8459       // We can either use a special instruction to load over the low double or
8460       // to move just the low double.
8461       return DAG.getNode(
8462           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8463           DL, MVT::v2f64, V2,
8464           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8465
8466   if (Subtarget->hasSSE41())
8467     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8468                                                   Subtarget, DAG))
8469       return Blend;
8470
8471   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8472   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8473                      DAG.getConstant(SHUFPDMask, MVT::i8));
8474 }
8475
8476 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8477 ///
8478 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8479 /// the integer unit to minimize domain crossing penalties. However, for blends
8480 /// it falls back to the floating point shuffle operation with appropriate bit
8481 /// casting.
8482 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8483                                        const X86Subtarget *Subtarget,
8484                                        SelectionDAG &DAG) {
8485   SDLoc DL(Op);
8486   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8487   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8488   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8489   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8490   ArrayRef<int> Mask = SVOp->getMask();
8491   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8492
8493   if (isSingleInputShuffleMask(Mask)) {
8494     // Check for being able to broadcast a single element.
8495     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8496                                                           Mask, Subtarget, DAG))
8497       return Broadcast;
8498
8499     // Straight shuffle of a single input vector. For everything from SSE2
8500     // onward this has a single fast instruction with no scary immediates.
8501     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8502     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8503     int WidenedMask[4] = {
8504         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8505         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8506     return DAG.getNode(
8507         ISD::BITCAST, DL, MVT::v2i64,
8508         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8509                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8510   }
8511
8512   // Try to use byte shift instructions.
8513   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8514           DL, MVT::v2i64, V1, V2, Mask, DAG))
8515     return Shift;
8516
8517   // If we have a single input from V2 insert that into V1 if we can do so
8518   // cheaply.
8519   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8520     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8521             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8522       return Insertion;
8523     // Try inverting the insertion since for v2 masks it is easy to do and we
8524     // can't reliably sort the mask one way or the other.
8525     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8526                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8527     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8528             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8529       return Insertion;
8530   }
8531
8532   // Use dedicated unpack instructions for masks that match their pattern.
8533   if (isShuffleEquivalent(Mask, 0, 2))
8534     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8535   if (isShuffleEquivalent(Mask, 1, 3))
8536     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8537
8538   if (Subtarget->hasSSE41())
8539     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8540                                                   Subtarget, DAG))
8541       return Blend;
8542
8543   // Try to use byte rotation instructions.
8544   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8545   if (Subtarget->hasSSSE3())
8546     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8547             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8548       return Rotate;
8549
8550   // We implement this with SHUFPD which is pretty lame because it will likely
8551   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8552   // However, all the alternatives are still more cycles and newer chips don't
8553   // have this problem. It would be really nice if x86 had better shuffles here.
8554   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8555   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8556   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8557                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8558 }
8559
8560 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8561 ///
8562 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8563 /// It makes no assumptions about whether this is the *best* lowering, it simply
8564 /// uses it.
8565 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8566                                             ArrayRef<int> Mask, SDValue V1,
8567                                             SDValue V2, SelectionDAG &DAG) {
8568   SDValue LowV = V1, HighV = V2;
8569   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8570
8571   int NumV2Elements =
8572       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8573
8574   if (NumV2Elements == 1) {
8575     int V2Index =
8576         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8577         Mask.begin();
8578
8579     // Compute the index adjacent to V2Index and in the same half by toggling
8580     // the low bit.
8581     int V2AdjIndex = V2Index ^ 1;
8582
8583     if (Mask[V2AdjIndex] == -1) {
8584       // Handles all the cases where we have a single V2 element and an undef.
8585       // This will only ever happen in the high lanes because we commute the
8586       // vector otherwise.
8587       if (V2Index < 2)
8588         std::swap(LowV, HighV);
8589       NewMask[V2Index] -= 4;
8590     } else {
8591       // Handle the case where the V2 element ends up adjacent to a V1 element.
8592       // To make this work, blend them together as the first step.
8593       int V1Index = V2AdjIndex;
8594       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8595       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8596                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8597
8598       // Now proceed to reconstruct the final blend as we have the necessary
8599       // high or low half formed.
8600       if (V2Index < 2) {
8601         LowV = V2;
8602         HighV = V1;
8603       } else {
8604         HighV = V2;
8605       }
8606       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8607       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8608     }
8609   } else if (NumV2Elements == 2) {
8610     if (Mask[0] < 4 && Mask[1] < 4) {
8611       // Handle the easy case where we have V1 in the low lanes and V2 in the
8612       // high lanes.
8613       NewMask[2] -= 4;
8614       NewMask[3] -= 4;
8615     } else if (Mask[2] < 4 && Mask[3] < 4) {
8616       // We also handle the reversed case because this utility may get called
8617       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8618       // arrange things in the right direction.
8619       NewMask[0] -= 4;
8620       NewMask[1] -= 4;
8621       HighV = V1;
8622       LowV = V2;
8623     } else {
8624       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8625       // trying to place elements directly, just blend them and set up the final
8626       // shuffle to place them.
8627
8628       // The first two blend mask elements are for V1, the second two are for
8629       // V2.
8630       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8631                           Mask[2] < 4 ? Mask[2] : Mask[3],
8632                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8633                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8634       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8635                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8636
8637       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8638       // a blend.
8639       LowV = HighV = V1;
8640       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8641       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8642       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8643       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8644     }
8645   }
8646   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8647                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8648 }
8649
8650 /// \brief Lower 4-lane 32-bit floating point shuffles.
8651 ///
8652 /// Uses instructions exclusively from the floating point unit to minimize
8653 /// domain crossing penalties, as these are sufficient to implement all v4f32
8654 /// shuffles.
8655 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8656                                        const X86Subtarget *Subtarget,
8657                                        SelectionDAG &DAG) {
8658   SDLoc DL(Op);
8659   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8660   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8661   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8662   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8663   ArrayRef<int> Mask = SVOp->getMask();
8664   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8665
8666   int NumV2Elements =
8667       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8668
8669   if (NumV2Elements == 0) {
8670     // Check for being able to broadcast a single element.
8671     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8672                                                           Mask, Subtarget, DAG))
8673       return Broadcast;
8674
8675     // Use even/odd duplicate instructions for masks that match their pattern.
8676     if (Subtarget->hasSSE3()) {
8677       if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
8678         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
8679       if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
8680         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
8681     }
8682
8683     if (Subtarget->hasAVX()) {
8684       // If we have AVX, we can use VPERMILPS which will allow folding a load
8685       // into the shuffle.
8686       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8687                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8688     }
8689
8690     // Otherwise, use a straight shuffle of a single input vector. We pass the
8691     // input vector to both operands to simulate this with a SHUFPS.
8692     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8693                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8694   }
8695
8696   // Use dedicated unpack instructions for masks that match their pattern.
8697   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8698     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8699   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8700     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8701
8702   // There are special ways we can lower some single-element blends. However, we
8703   // have custom ways we can lower more complex single-element blends below that
8704   // we defer to if both this and BLENDPS fail to match, so restrict this to
8705   // when the V2 input is targeting element 0 of the mask -- that is the fast
8706   // case here.
8707   if (NumV2Elements == 1 && Mask[0] >= 4)
8708     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8709                                                          Mask, Subtarget, DAG))
8710       return V;
8711
8712   if (Subtarget->hasSSE41()) {
8713     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8714                                                   Subtarget, DAG))
8715       return Blend;
8716
8717     // Use INSERTPS if we can complete the shuffle efficiently.
8718     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8719       return V;
8720   }
8721
8722   // Otherwise fall back to a SHUFPS lowering strategy.
8723   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8724 }
8725
8726 /// \brief Lower 4-lane i32 vector shuffles.
8727 ///
8728 /// We try to handle these with integer-domain shuffles where we can, but for
8729 /// blends we use the floating point domain blend instructions.
8730 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8731                                        const X86Subtarget *Subtarget,
8732                                        SelectionDAG &DAG) {
8733   SDLoc DL(Op);
8734   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8735   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8736   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8737   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8738   ArrayRef<int> Mask = SVOp->getMask();
8739   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8740
8741   // Whenever we can lower this as a zext, that instruction is strictly faster
8742   // than any alternative. It also allows us to fold memory operands into the
8743   // shuffle in many cases.
8744   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8745                                                          Mask, Subtarget, DAG))
8746     return ZExt;
8747
8748   int NumV2Elements =
8749       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8750
8751   if (NumV2Elements == 0) {
8752     // Check for being able to broadcast a single element.
8753     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8754                                                           Mask, Subtarget, DAG))
8755       return Broadcast;
8756
8757     // Straight shuffle of a single input vector. For everything from SSE2
8758     // onward this has a single fast instruction with no scary immediates.
8759     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8760     // but we aren't actually going to use the UNPCK instruction because doing
8761     // so prevents folding a load into this instruction or making a copy.
8762     const int UnpackLoMask[] = {0, 0, 1, 1};
8763     const int UnpackHiMask[] = {2, 2, 3, 3};
8764     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8765       Mask = UnpackLoMask;
8766     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8767       Mask = UnpackHiMask;
8768
8769     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8770                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8771   }
8772
8773   // Try to use bit shift instructions.
8774   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8775           DL, MVT::v4i32, V1, V2, Mask, DAG))
8776     return Shift;
8777
8778   // Try to use byte shift instructions.
8779   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8780           DL, MVT::v4i32, V1, V2, Mask, DAG))
8781     return Shift;
8782
8783   // There are special ways we can lower some single-element blends.
8784   if (NumV2Elements == 1)
8785     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8786                                                          Mask, Subtarget, DAG))
8787       return V;
8788
8789   if (Subtarget->hasSSE41())
8790     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8791                                                   Subtarget, DAG))
8792       return Blend;
8793
8794   if (SDValue Masked =
8795           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
8796     return Masked;
8797
8798   // Use dedicated unpack instructions for masks that match their pattern.
8799   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8800     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8801   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8802     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8803
8804   // Try to use byte rotation instructions.
8805   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8806   if (Subtarget->hasSSSE3())
8807     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8808             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8809       return Rotate;
8810
8811   // We implement this with SHUFPS because it can blend from two vectors.
8812   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8813   // up the inputs, bypassing domain shift penalties that we would encur if we
8814   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8815   // relevant.
8816   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8817                      DAG.getVectorShuffle(
8818                          MVT::v4f32, DL,
8819                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8820                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8821 }
8822
8823 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8824 /// shuffle lowering, and the most complex part.
8825 ///
8826 /// The lowering strategy is to try to form pairs of input lanes which are
8827 /// targeted at the same half of the final vector, and then use a dword shuffle
8828 /// to place them onto the right half, and finally unpack the paired lanes into
8829 /// their final position.
8830 ///
8831 /// The exact breakdown of how to form these dword pairs and align them on the
8832 /// correct sides is really tricky. See the comments within the function for
8833 /// more of the details.
8834 static SDValue lowerV8I16SingleInputVectorShuffle(
8835     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8836     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8837   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8838   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8839   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8840
8841   SmallVector<int, 4> LoInputs;
8842   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8843                [](int M) { return M >= 0; });
8844   std::sort(LoInputs.begin(), LoInputs.end());
8845   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8846   SmallVector<int, 4> HiInputs;
8847   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8848                [](int M) { return M >= 0; });
8849   std::sort(HiInputs.begin(), HiInputs.end());
8850   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8851   int NumLToL =
8852       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8853   int NumHToL = LoInputs.size() - NumLToL;
8854   int NumLToH =
8855       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8856   int NumHToH = HiInputs.size() - NumLToH;
8857   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8858   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8859   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8860   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8861
8862   // Check for being able to broadcast a single element.
8863   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8864                                                         Mask, Subtarget, DAG))
8865     return Broadcast;
8866
8867   // Try to use bit shift instructions.
8868   if (SDValue Shift = lowerVectorShuffleAsBitShift(
8869           DL, MVT::v8i16, V, V, Mask, DAG))
8870     return Shift;
8871
8872   // Try to use byte shift instructions.
8873   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8874           DL, MVT::v8i16, V, V, Mask, DAG))
8875     return Shift;
8876
8877   // Use dedicated unpack instructions for masks that match their pattern.
8878   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8879     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8880   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8881     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8882
8883   // Try to use byte rotation instructions.
8884   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8885           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8886     return Rotate;
8887
8888   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8889   // such inputs we can swap two of the dwords across the half mark and end up
8890   // with <=2 inputs to each half in each half. Once there, we can fall through
8891   // to the generic code below. For example:
8892   //
8893   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8894   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8895   //
8896   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8897   // and an existing 2-into-2 on the other half. In this case we may have to
8898   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8899   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8900   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8901   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8902   // half than the one we target for fixing) will be fixed when we re-enter this
8903   // path. We will also combine away any sequence of PSHUFD instructions that
8904   // result into a single instruction. Here is an example of the tricky case:
8905   //
8906   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8907   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8908   //
8909   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8910   //
8911   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8912   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8913   //
8914   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8915   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8916   //
8917   // The result is fine to be handled by the generic logic.
8918   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8919                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8920                           int AOffset, int BOffset) {
8921     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8922            "Must call this with A having 3 or 1 inputs from the A half.");
8923     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8924            "Must call this with B having 1 or 3 inputs from the B half.");
8925     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8926            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8927
8928     // Compute the index of dword with only one word among the three inputs in
8929     // a half by taking the sum of the half with three inputs and subtracting
8930     // the sum of the actual three inputs. The difference is the remaining
8931     // slot.
8932     int ADWord, BDWord;
8933     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8934     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8935     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8936     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8937     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8938     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8939     int TripleNonInputIdx =
8940         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8941     TripleDWord = TripleNonInputIdx / 2;
8942
8943     // We use xor with one to compute the adjacent DWord to whichever one the
8944     // OneInput is in.
8945     OneInputDWord = (OneInput / 2) ^ 1;
8946
8947     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8948     // and BToA inputs. If there is also such a problem with the BToB and AToB
8949     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8950     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8951     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8952     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8953       // Compute how many inputs will be flipped by swapping these DWords. We
8954       // need
8955       // to balance this to ensure we don't form a 3-1 shuffle in the other
8956       // half.
8957       int NumFlippedAToBInputs =
8958           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8959           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8960       int NumFlippedBToBInputs =
8961           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8962           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8963       if ((NumFlippedAToBInputs == 1 &&
8964            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8965           (NumFlippedBToBInputs == 1 &&
8966            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8967         // We choose whether to fix the A half or B half based on whether that
8968         // half has zero flipped inputs. At zero, we may not be able to fix it
8969         // with that half. We also bias towards fixing the B half because that
8970         // will more commonly be the high half, and we have to bias one way.
8971         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8972                                                        ArrayRef<int> Inputs) {
8973           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8974           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8975                                          PinnedIdx ^ 1) != Inputs.end();
8976           // Determine whether the free index is in the flipped dword or the
8977           // unflipped dword based on where the pinned index is. We use this bit
8978           // in an xor to conditionally select the adjacent dword.
8979           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8980           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8981                                              FixFreeIdx) != Inputs.end();
8982           if (IsFixIdxInput == IsFixFreeIdxInput)
8983             FixFreeIdx += 1;
8984           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8985                                         FixFreeIdx) != Inputs.end();
8986           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8987                  "We need to be changing the number of flipped inputs!");
8988           int PSHUFHalfMask[] = {0, 1, 2, 3};
8989           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8990           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8991                           MVT::v8i16, V,
8992                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8993
8994           for (int &M : Mask)
8995             if (M != -1 && M == FixIdx)
8996               M = FixFreeIdx;
8997             else if (M != -1 && M == FixFreeIdx)
8998               M = FixIdx;
8999         };
9000         if (NumFlippedBToBInputs != 0) {
9001           int BPinnedIdx =
9002               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9003           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9004         } else {
9005           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9006           int APinnedIdx =
9007               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9008           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9009         }
9010       }
9011     }
9012
9013     int PSHUFDMask[] = {0, 1, 2, 3};
9014     PSHUFDMask[ADWord] = BDWord;
9015     PSHUFDMask[BDWord] = ADWord;
9016     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9017                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9018                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9019                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9020
9021     // Adjust the mask to match the new locations of A and B.
9022     for (int &M : Mask)
9023       if (M != -1 && M/2 == ADWord)
9024         M = 2 * BDWord + M % 2;
9025       else if (M != -1 && M/2 == BDWord)
9026         M = 2 * ADWord + M % 2;
9027
9028     // Recurse back into this routine to re-compute state now that this isn't
9029     // a 3 and 1 problem.
9030     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9031                                 Mask);
9032   };
9033   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9034     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9035   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9036     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9037
9038   // At this point there are at most two inputs to the low and high halves from
9039   // each half. That means the inputs can always be grouped into dwords and
9040   // those dwords can then be moved to the correct half with a dword shuffle.
9041   // We use at most one low and one high word shuffle to collect these paired
9042   // inputs into dwords, and finally a dword shuffle to place them.
9043   int PSHUFLMask[4] = {-1, -1, -1, -1};
9044   int PSHUFHMask[4] = {-1, -1, -1, -1};
9045   int PSHUFDMask[4] = {-1, -1, -1, -1};
9046
9047   // First fix the masks for all the inputs that are staying in their
9048   // original halves. This will then dictate the targets of the cross-half
9049   // shuffles.
9050   auto fixInPlaceInputs =
9051       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9052                     MutableArrayRef<int> SourceHalfMask,
9053                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9054     if (InPlaceInputs.empty())
9055       return;
9056     if (InPlaceInputs.size() == 1) {
9057       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9058           InPlaceInputs[0] - HalfOffset;
9059       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9060       return;
9061     }
9062     if (IncomingInputs.empty()) {
9063       // Just fix all of the in place inputs.
9064       for (int Input : InPlaceInputs) {
9065         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9066         PSHUFDMask[Input / 2] = Input / 2;
9067       }
9068       return;
9069     }
9070
9071     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9072     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9073         InPlaceInputs[0] - HalfOffset;
9074     // Put the second input next to the first so that they are packed into
9075     // a dword. We find the adjacent index by toggling the low bit.
9076     int AdjIndex = InPlaceInputs[0] ^ 1;
9077     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9078     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9079     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9080   };
9081   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9082   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9083
9084   // Now gather the cross-half inputs and place them into a free dword of
9085   // their target half.
9086   // FIXME: This operation could almost certainly be simplified dramatically to
9087   // look more like the 3-1 fixing operation.
9088   auto moveInputsToRightHalf = [&PSHUFDMask](
9089       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9090       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9091       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9092       int DestOffset) {
9093     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9094       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
9095     };
9096     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9097                                                int Word) {
9098       int LowWord = Word & ~1;
9099       int HighWord = Word | 1;
9100       return isWordClobbered(SourceHalfMask, LowWord) ||
9101              isWordClobbered(SourceHalfMask, HighWord);
9102     };
9103
9104     if (IncomingInputs.empty())
9105       return;
9106
9107     if (ExistingInputs.empty()) {
9108       // Map any dwords with inputs from them into the right half.
9109       for (int Input : IncomingInputs) {
9110         // If the source half mask maps over the inputs, turn those into
9111         // swaps and use the swapped lane.
9112         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9113           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
9114             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9115                 Input - SourceOffset;
9116             // We have to swap the uses in our half mask in one sweep.
9117             for (int &M : HalfMask)
9118               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9119                 M = Input;
9120               else if (M == Input)
9121                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9122           } else {
9123             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9124                        Input - SourceOffset &&
9125                    "Previous placement doesn't match!");
9126           }
9127           // Note that this correctly re-maps both when we do a swap and when
9128           // we observe the other side of the swap above. We rely on that to
9129           // avoid swapping the members of the input list directly.
9130           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9131         }
9132
9133         // Map the input's dword into the correct half.
9134         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
9135           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9136         else
9137           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9138                      Input / 2 &&
9139                  "Previous placement doesn't match!");
9140       }
9141
9142       // And just directly shift any other-half mask elements to be same-half
9143       // as we will have mirrored the dword containing the element into the
9144       // same position within that half.
9145       for (int &M : HalfMask)
9146         if (M >= SourceOffset && M < SourceOffset + 4) {
9147           M = M - SourceOffset + DestOffset;
9148           assert(M >= 0 && "This should never wrap below zero!");
9149         }
9150       return;
9151     }
9152
9153     // Ensure we have the input in a viable dword of its current half. This
9154     // is particularly tricky because the original position may be clobbered
9155     // by inputs being moved and *staying* in that half.
9156     if (IncomingInputs.size() == 1) {
9157       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9158         int InputFixed = std::find(std::begin(SourceHalfMask),
9159                                    std::end(SourceHalfMask), -1) -
9160                          std::begin(SourceHalfMask) + SourceOffset;
9161         SourceHalfMask[InputFixed - SourceOffset] =
9162             IncomingInputs[0] - SourceOffset;
9163         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9164                      InputFixed);
9165         IncomingInputs[0] = InputFixed;
9166       }
9167     } else if (IncomingInputs.size() == 2) {
9168       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9169           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9170         // We have two non-adjacent or clobbered inputs we need to extract from
9171         // the source half. To do this, we need to map them into some adjacent
9172         // dword slot in the source mask.
9173         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9174                               IncomingInputs[1] - SourceOffset};
9175
9176         // If there is a free slot in the source half mask adjacent to one of
9177         // the inputs, place the other input in it. We use (Index XOR 1) to
9178         // compute an adjacent index.
9179         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9180             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9181           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9182           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9183           InputsFixed[1] = InputsFixed[0] ^ 1;
9184         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9185                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9186           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9187           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9188           InputsFixed[0] = InputsFixed[1] ^ 1;
9189         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9190                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9191           // The two inputs are in the same DWord but it is clobbered and the
9192           // adjacent DWord isn't used at all. Move both inputs to the free
9193           // slot.
9194           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9195           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9196           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9197           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9198         } else {
9199           // The only way we hit this point is if there is no clobbering
9200           // (because there are no off-half inputs to this half) and there is no
9201           // free slot adjacent to one of the inputs. In this case, we have to
9202           // swap an input with a non-input.
9203           for (int i = 0; i < 4; ++i)
9204             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9205                    "We can't handle any clobbers here!");
9206           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9207                  "Cannot have adjacent inputs here!");
9208
9209           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9210           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9211
9212           // We also have to update the final source mask in this case because
9213           // it may need to undo the above swap.
9214           for (int &M : FinalSourceHalfMask)
9215             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9216               M = InputsFixed[1] + SourceOffset;
9217             else if (M == InputsFixed[1] + SourceOffset)
9218               M = (InputsFixed[0] ^ 1) + SourceOffset;
9219
9220           InputsFixed[1] = InputsFixed[0] ^ 1;
9221         }
9222
9223         // Point everything at the fixed inputs.
9224         for (int &M : HalfMask)
9225           if (M == IncomingInputs[0])
9226             M = InputsFixed[0] + SourceOffset;
9227           else if (M == IncomingInputs[1])
9228             M = InputsFixed[1] + SourceOffset;
9229
9230         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9231         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9232       }
9233     } else {
9234       llvm_unreachable("Unhandled input size!");
9235     }
9236
9237     // Now hoist the DWord down to the right half.
9238     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9239     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9240     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9241     for (int &M : HalfMask)
9242       for (int Input : IncomingInputs)
9243         if (M == Input)
9244           M = FreeDWord * 2 + Input % 2;
9245   };
9246   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9247                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9248   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9249                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9250
9251   // Now enact all the shuffles we've computed to move the inputs into their
9252   // target half.
9253   if (!isNoopShuffleMask(PSHUFLMask))
9254     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9255                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9256   if (!isNoopShuffleMask(PSHUFHMask))
9257     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9258                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9259   if (!isNoopShuffleMask(PSHUFDMask))
9260     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9261                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9262                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9263                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9264
9265   // At this point, each half should contain all its inputs, and we can then
9266   // just shuffle them into their final position.
9267   assert(std::count_if(LoMask.begin(), LoMask.end(),
9268                        [](int M) { return M >= 4; }) == 0 &&
9269          "Failed to lift all the high half inputs to the low mask!");
9270   assert(std::count_if(HiMask.begin(), HiMask.end(),
9271                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9272          "Failed to lift all the low half inputs to the high mask!");
9273
9274   // Do a half shuffle for the low mask.
9275   if (!isNoopShuffleMask(LoMask))
9276     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9277                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9278
9279   // Do a half shuffle with the high mask after shifting its values down.
9280   for (int &M : HiMask)
9281     if (M >= 0)
9282       M -= 4;
9283   if (!isNoopShuffleMask(HiMask))
9284     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9285                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9286
9287   return V;
9288 }
9289
9290 /// \brief Detect whether the mask pattern should be lowered through
9291 /// interleaving.
9292 ///
9293 /// This essentially tests whether viewing the mask as an interleaving of two
9294 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9295 /// lowering it through interleaving is a significantly better strategy.
9296 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9297   int NumEvenInputs[2] = {0, 0};
9298   int NumOddInputs[2] = {0, 0};
9299   int NumLoInputs[2] = {0, 0};
9300   int NumHiInputs[2] = {0, 0};
9301   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9302     if (Mask[i] < 0)
9303       continue;
9304
9305     int InputIdx = Mask[i] >= Size;
9306
9307     if (i < Size / 2)
9308       ++NumLoInputs[InputIdx];
9309     else
9310       ++NumHiInputs[InputIdx];
9311
9312     if ((i % 2) == 0)
9313       ++NumEvenInputs[InputIdx];
9314     else
9315       ++NumOddInputs[InputIdx];
9316   }
9317
9318   // The minimum number of cross-input results for both the interleaved and
9319   // split cases. If interleaving results in fewer cross-input results, return
9320   // true.
9321   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9322                                     NumEvenInputs[0] + NumOddInputs[1]);
9323   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9324                               NumLoInputs[0] + NumHiInputs[1]);
9325   return InterleavedCrosses < SplitCrosses;
9326 }
9327
9328 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9329 ///
9330 /// This strategy only works when the inputs from each vector fit into a single
9331 /// half of that vector, and generally there are not so many inputs as to leave
9332 /// the in-place shuffles required highly constrained (and thus expensive). It
9333 /// shifts all the inputs into a single side of both input vectors and then
9334 /// uses an unpack to interleave these inputs in a single vector. At that
9335 /// point, we will fall back on the generic single input shuffle lowering.
9336 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9337                                                  SDValue V2,
9338                                                  MutableArrayRef<int> Mask,
9339                                                  const X86Subtarget *Subtarget,
9340                                                  SelectionDAG &DAG) {
9341   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9342   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9343   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9344   for (int i = 0; i < 8; ++i)
9345     if (Mask[i] >= 0 && Mask[i] < 4)
9346       LoV1Inputs.push_back(i);
9347     else if (Mask[i] >= 4 && Mask[i] < 8)
9348       HiV1Inputs.push_back(i);
9349     else if (Mask[i] >= 8 && Mask[i] < 12)
9350       LoV2Inputs.push_back(i);
9351     else if (Mask[i] >= 12)
9352       HiV2Inputs.push_back(i);
9353
9354   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9355   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9356   (void)NumV1Inputs;
9357   (void)NumV2Inputs;
9358   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9359   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9360   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9361
9362   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9363                      HiV1Inputs.size() + HiV2Inputs.size();
9364
9365   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9366                               ArrayRef<int> HiInputs, bool MoveToLo,
9367                               int MaskOffset) {
9368     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9369     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9370     if (BadInputs.empty())
9371       return V;
9372
9373     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9374     int MoveOffset = MoveToLo ? 0 : 4;
9375
9376     if (GoodInputs.empty()) {
9377       for (int BadInput : BadInputs) {
9378         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9379         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9380       }
9381     } else {
9382       if (GoodInputs.size() == 2) {
9383         // If the low inputs are spread across two dwords, pack them into
9384         // a single dword.
9385         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9386         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9387         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9388         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9389       } else {
9390         // Otherwise pin the good inputs.
9391         for (int GoodInput : GoodInputs)
9392           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9393       }
9394
9395       if (BadInputs.size() == 2) {
9396         // If we have two bad inputs then there may be either one or two good
9397         // inputs fixed in place. Find a fixed input, and then find the *other*
9398         // two adjacent indices by using modular arithmetic.
9399         int GoodMaskIdx =
9400             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9401                          [](int M) { return M >= 0; }) -
9402             std::begin(MoveMask);
9403         int MoveMaskIdx =
9404             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9405         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9406         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9407         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9408         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9409         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9410         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9411       } else {
9412         assert(BadInputs.size() == 1 && "All sizes handled");
9413         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9414                                     std::end(MoveMask), -1) -
9415                           std::begin(MoveMask);
9416         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9417         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9418       }
9419     }
9420
9421     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9422                                 MoveMask);
9423   };
9424   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9425                         /*MaskOffset*/ 0);
9426   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9427                         /*MaskOffset*/ 8);
9428
9429   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9430   // cross-half traffic in the final shuffle.
9431
9432   // Munge the mask to be a single-input mask after the unpack merges the
9433   // results.
9434   for (int &M : Mask)
9435     if (M != -1)
9436       M = 2 * (M % 4) + (M / 8);
9437
9438   return DAG.getVectorShuffle(
9439       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9440                                   DL, MVT::v8i16, V1, V2),
9441       DAG.getUNDEF(MVT::v8i16), Mask);
9442 }
9443
9444 /// \brief Generic lowering of 8-lane i16 shuffles.
9445 ///
9446 /// This handles both single-input shuffles and combined shuffle/blends with
9447 /// two inputs. The single input shuffles are immediately delegated to
9448 /// a dedicated lowering routine.
9449 ///
9450 /// The blends are lowered in one of three fundamental ways. If there are few
9451 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9452 /// of the input is significantly cheaper when lowered as an interleaving of
9453 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9454 /// halves of the inputs separately (making them have relatively few inputs)
9455 /// and then concatenate them.
9456 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9457                                        const X86Subtarget *Subtarget,
9458                                        SelectionDAG &DAG) {
9459   SDLoc DL(Op);
9460   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9461   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9462   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9463   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9464   ArrayRef<int> OrigMask = SVOp->getMask();
9465   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9466                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9467   MutableArrayRef<int> Mask(MaskStorage);
9468
9469   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9470
9471   // Whenever we can lower this as a zext, that instruction is strictly faster
9472   // than any alternative.
9473   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9474           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9475     return ZExt;
9476
9477   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9478   auto isV2 = [](int M) { return M >= 8; };
9479
9480   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9481   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9482
9483   if (NumV2Inputs == 0)
9484     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9485
9486   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9487                             "to be V1-input shuffles.");
9488
9489   // Try to use bit shift instructions.
9490   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9491           DL, MVT::v8i16, V1, V2, Mask, DAG))
9492     return Shift;
9493
9494   // Try to use byte shift instructions.
9495   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9496           DL, MVT::v8i16, V1, V2, Mask, DAG))
9497     return Shift;
9498
9499   // There are special ways we can lower some single-element blends.
9500   if (NumV2Inputs == 1)
9501     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9502                                                          Mask, Subtarget, DAG))
9503       return V;
9504
9505   if (Subtarget->hasSSE41())
9506     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9507                                                   Subtarget, DAG))
9508       return Blend;
9509
9510   if (SDValue Masked =
9511           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9512     return Masked;
9513
9514   // Use dedicated unpack instructions for masks that match their pattern.
9515   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9516     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9517   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9518     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9519
9520   // Try to use byte rotation instructions.
9521   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9522           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9523     return Rotate;
9524
9525   if (NumV1Inputs + NumV2Inputs <= 4)
9526     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9527
9528   // Check whether an interleaving lowering is likely to be more efficient.
9529   // This isn't perfect but it is a strong heuristic that tends to work well on
9530   // the kinds of shuffles that show up in practice.
9531   //
9532   // FIXME: Handle 1x, 2x, and 4x interleaving.
9533   if (shouldLowerAsInterleaving(Mask)) {
9534     // FIXME: Figure out whether we should pack these into the low or high
9535     // halves.
9536
9537     int EMask[8], OMask[8];
9538     for (int i = 0; i < 4; ++i) {
9539       EMask[i] = Mask[2*i];
9540       OMask[i] = Mask[2*i + 1];
9541       EMask[i + 4] = -1;
9542       OMask[i + 4] = -1;
9543     }
9544
9545     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9546     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9547
9548     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9549   }
9550
9551   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9552   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9553
9554   for (int i = 0; i < 4; ++i) {
9555     LoBlendMask[i] = Mask[i];
9556     HiBlendMask[i] = Mask[i + 4];
9557   }
9558
9559   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9560   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9561   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9562   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9563
9564   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9565                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9566 }
9567
9568 /// \brief Check whether a compaction lowering can be done by dropping even
9569 /// elements and compute how many times even elements must be dropped.
9570 ///
9571 /// This handles shuffles which take every Nth element where N is a power of
9572 /// two. Example shuffle masks:
9573 ///
9574 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9575 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9576 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9577 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9578 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9579 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9580 ///
9581 /// Any of these lanes can of course be undef.
9582 ///
9583 /// This routine only supports N <= 3.
9584 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9585 /// for larger N.
9586 ///
9587 /// \returns N above, or the number of times even elements must be dropped if
9588 /// there is such a number. Otherwise returns zero.
9589 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9590   // Figure out whether we're looping over two inputs or just one.
9591   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9592
9593   // The modulus for the shuffle vector entries is based on whether this is
9594   // a single input or not.
9595   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9596   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9597          "We should only be called with masks with a power-of-2 size!");
9598
9599   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9600
9601   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9602   // and 2^3 simultaneously. This is because we may have ambiguity with
9603   // partially undef inputs.
9604   bool ViableForN[3] = {true, true, true};
9605
9606   for (int i = 0, e = Mask.size(); i < e; ++i) {
9607     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9608     // want.
9609     if (Mask[i] == -1)
9610       continue;
9611
9612     bool IsAnyViable = false;
9613     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9614       if (ViableForN[j]) {
9615         uint64_t N = j + 1;
9616
9617         // The shuffle mask must be equal to (i * 2^N) % M.
9618         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9619           IsAnyViable = true;
9620         else
9621           ViableForN[j] = false;
9622       }
9623     // Early exit if we exhaust the possible powers of two.
9624     if (!IsAnyViable)
9625       break;
9626   }
9627
9628   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9629     if (ViableForN[j])
9630       return j + 1;
9631
9632   // Return 0 as there is no viable power of two.
9633   return 0;
9634 }
9635
9636 /// \brief Generic lowering of v16i8 shuffles.
9637 ///
9638 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9639 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9640 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9641 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9642 /// back together.
9643 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9644                                        const X86Subtarget *Subtarget,
9645                                        SelectionDAG &DAG) {
9646   SDLoc DL(Op);
9647   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9648   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9649   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9650   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9651   ArrayRef<int> OrigMask = SVOp->getMask();
9652   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9653
9654   // Try to use bit shift instructions.
9655   if (SDValue Shift = lowerVectorShuffleAsBitShift(
9656           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9657     return Shift;
9658
9659   // Try to use byte shift instructions.
9660   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9661           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9662     return Shift;
9663
9664   // Try to use byte rotation instructions.
9665   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9666           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9667     return Rotate;
9668
9669   // Try to use a zext lowering.
9670   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9671           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9672     return ZExt;
9673
9674   int MaskStorage[16] = {
9675       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9676       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9677       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9678       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9679   MutableArrayRef<int> Mask(MaskStorage);
9680   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9681   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9682
9683   int NumV2Elements =
9684       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9685
9686   // For single-input shuffles, there are some nicer lowering tricks we can use.
9687   if (NumV2Elements == 0) {
9688     // Check for being able to broadcast a single element.
9689     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9690                                                           Mask, Subtarget, DAG))
9691       return Broadcast;
9692
9693     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9694     // Notably, this handles splat and partial-splat shuffles more efficiently.
9695     // However, it only makes sense if the pre-duplication shuffle simplifies
9696     // things significantly. Currently, this means we need to be able to
9697     // express the pre-duplication shuffle as an i16 shuffle.
9698     //
9699     // FIXME: We should check for other patterns which can be widened into an
9700     // i16 shuffle as well.
9701     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9702       for (int i = 0; i < 16; i += 2)
9703         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9704           return false;
9705
9706       return true;
9707     };
9708     auto tryToWidenViaDuplication = [&]() -> SDValue {
9709       if (!canWidenViaDuplication(Mask))
9710         return SDValue();
9711       SmallVector<int, 4> LoInputs;
9712       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9713                    [](int M) { return M >= 0 && M < 8; });
9714       std::sort(LoInputs.begin(), LoInputs.end());
9715       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9716                      LoInputs.end());
9717       SmallVector<int, 4> HiInputs;
9718       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9719                    [](int M) { return M >= 8; });
9720       std::sort(HiInputs.begin(), HiInputs.end());
9721       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9722                      HiInputs.end());
9723
9724       bool TargetLo = LoInputs.size() >= HiInputs.size();
9725       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9726       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9727
9728       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9729       SmallDenseMap<int, int, 8> LaneMap;
9730       for (int I : InPlaceInputs) {
9731         PreDupI16Shuffle[I/2] = I/2;
9732         LaneMap[I] = I;
9733       }
9734       int j = TargetLo ? 0 : 4, je = j + 4;
9735       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9736         // Check if j is already a shuffle of this input. This happens when
9737         // there are two adjacent bytes after we move the low one.
9738         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9739           // If we haven't yet mapped the input, search for a slot into which
9740           // we can map it.
9741           while (j < je && PreDupI16Shuffle[j] != -1)
9742             ++j;
9743
9744           if (j == je)
9745             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9746             return SDValue();
9747
9748           // Map this input with the i16 shuffle.
9749           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9750         }
9751
9752         // Update the lane map based on the mapping we ended up with.
9753         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9754       }
9755       V1 = DAG.getNode(
9756           ISD::BITCAST, DL, MVT::v16i8,
9757           DAG.getVectorShuffle(MVT::v8i16, DL,
9758                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9759                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9760
9761       // Unpack the bytes to form the i16s that will be shuffled into place.
9762       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9763                        MVT::v16i8, V1, V1);
9764
9765       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9766       for (int i = 0; i < 16; ++i)
9767         if (Mask[i] != -1) {
9768           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9769           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9770           if (PostDupI16Shuffle[i / 2] == -1)
9771             PostDupI16Shuffle[i / 2] = MappedMask;
9772           else
9773             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9774                    "Conflicting entrties in the original shuffle!");
9775         }
9776       return DAG.getNode(
9777           ISD::BITCAST, DL, MVT::v16i8,
9778           DAG.getVectorShuffle(MVT::v8i16, DL,
9779                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9780                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9781     };
9782     if (SDValue V = tryToWidenViaDuplication())
9783       return V;
9784   }
9785
9786   // Check whether an interleaving lowering is likely to be more efficient.
9787   // This isn't perfect but it is a strong heuristic that tends to work well on
9788   // the kinds of shuffles that show up in practice.
9789   //
9790   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9791   if (shouldLowerAsInterleaving(Mask)) {
9792     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9793       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9794     });
9795     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9796       return (M >= 8 && M < 16) || M >= 24;
9797     });
9798     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9799                      -1, -1, -1, -1, -1, -1, -1, -1};
9800     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9801                      -1, -1, -1, -1, -1, -1, -1, -1};
9802     bool UnpackLo = NumLoHalf >= NumHiHalf;
9803     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9804     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9805     for (int i = 0; i < 8; ++i) {
9806       TargetEMask[i] = Mask[2 * i];
9807       TargetOMask[i] = Mask[2 * i + 1];
9808     }
9809
9810     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9811     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9812
9813     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9814                        MVT::v16i8, Evens, Odds);
9815   }
9816
9817   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9818   // with PSHUFB. It is important to do this before we attempt to generate any
9819   // blends but after all of the single-input lowerings. If the single input
9820   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9821   // want to preserve that and we can DAG combine any longer sequences into
9822   // a PSHUFB in the end. But once we start blending from multiple inputs,
9823   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9824   // and there are *very* few patterns that would actually be faster than the
9825   // PSHUFB approach because of its ability to zero lanes.
9826   //
9827   // FIXME: The only exceptions to the above are blends which are exact
9828   // interleavings with direct instructions supporting them. We currently don't
9829   // handle those well here.
9830   if (Subtarget->hasSSSE3()) {
9831     SDValue V1Mask[16];
9832     SDValue V2Mask[16];
9833     bool V1InUse = false;
9834     bool V2InUse = false;
9835     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9836
9837     for (int i = 0; i < 16; ++i) {
9838       if (Mask[i] == -1) {
9839         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9840       } else {
9841         const int ZeroMask = 0x80;
9842         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9843         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9844         if (Zeroable[i])
9845           V1Idx = V2Idx = ZeroMask;
9846         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9847         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9848         V1InUse |= (ZeroMask != V1Idx);
9849         V2InUse |= (ZeroMask != V2Idx);
9850       }
9851     }
9852
9853     if (V1InUse)
9854       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9855                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9856     if (V2InUse)
9857       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9858                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9859
9860     // If we need shuffled inputs from both, blend the two.
9861     if (V1InUse && V2InUse)
9862       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9863     if (V1InUse)
9864       return V1; // Single inputs are easy.
9865     if (V2InUse)
9866       return V2; // Single inputs are easy.
9867     // Shuffling to a zeroable vector.
9868     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9869   }
9870
9871   // There are special ways we can lower some single-element blends.
9872   if (NumV2Elements == 1)
9873     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9874                                                          Mask, Subtarget, DAG))
9875       return V;
9876
9877   // Check whether a compaction lowering can be done. This handles shuffles
9878   // which take every Nth element for some even N. See the helper function for
9879   // details.
9880   //
9881   // We special case these as they can be particularly efficiently handled with
9882   // the PACKUSB instruction on x86 and they show up in common patterns of
9883   // rearranging bytes to truncate wide elements.
9884   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9885     // NumEvenDrops is the power of two stride of the elements. Another way of
9886     // thinking about it is that we need to drop the even elements this many
9887     // times to get the original input.
9888     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9889
9890     // First we need to zero all the dropped bytes.
9891     assert(NumEvenDrops <= 3 &&
9892            "No support for dropping even elements more than 3 times.");
9893     // We use the mask type to pick which bytes are preserved based on how many
9894     // elements are dropped.
9895     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9896     SDValue ByteClearMask =
9897         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9898                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9899     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9900     if (!IsSingleInput)
9901       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9902
9903     // Now pack things back together.
9904     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9905     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9906     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9907     for (int i = 1; i < NumEvenDrops; ++i) {
9908       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9909       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9910     }
9911
9912     return Result;
9913   }
9914
9915   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9916   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9917   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9918   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9919
9920   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9921                             MutableArrayRef<int> V1HalfBlendMask,
9922                             MutableArrayRef<int> V2HalfBlendMask) {
9923     for (int i = 0; i < 8; ++i)
9924       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9925         V1HalfBlendMask[i] = HalfMask[i];
9926         HalfMask[i] = i;
9927       } else if (HalfMask[i] >= 16) {
9928         V2HalfBlendMask[i] = HalfMask[i] - 16;
9929         HalfMask[i] = i + 8;
9930       }
9931   };
9932   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9933   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9934
9935   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9936
9937   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9938                              MutableArrayRef<int> HiBlendMask) {
9939     SDValue V1, V2;
9940     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9941     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9942     // i16s.
9943     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9944                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9945         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9946                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9947       // Use a mask to drop the high bytes.
9948       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9949       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9950                        DAG.getConstant(0x00FF, MVT::v8i16));
9951
9952       // This will be a single vector shuffle instead of a blend so nuke V2.
9953       V2 = DAG.getUNDEF(MVT::v8i16);
9954
9955       // Squash the masks to point directly into V1.
9956       for (int &M : LoBlendMask)
9957         if (M >= 0)
9958           M /= 2;
9959       for (int &M : HiBlendMask)
9960         if (M >= 0)
9961           M /= 2;
9962     } else {
9963       // Otherwise just unpack the low half of V into V1 and the high half into
9964       // V2 so that we can blend them as i16s.
9965       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9966                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9967       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9968                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9969     }
9970
9971     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9972     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9973     return std::make_pair(BlendedLo, BlendedHi);
9974   };
9975   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9976   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9977   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9978
9979   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9980   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9981
9982   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9983 }
9984
9985 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9986 ///
9987 /// This routine breaks down the specific type of 128-bit shuffle and
9988 /// dispatches to the lowering routines accordingly.
9989 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9990                                         MVT VT, const X86Subtarget *Subtarget,
9991                                         SelectionDAG &DAG) {
9992   switch (VT.SimpleTy) {
9993   case MVT::v2i64:
9994     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9995   case MVT::v2f64:
9996     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9997   case MVT::v4i32:
9998     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9999   case MVT::v4f32:
10000     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10001   case MVT::v8i16:
10002     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10003   case MVT::v16i8:
10004     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10005
10006   default:
10007     llvm_unreachable("Unimplemented!");
10008   }
10009 }
10010
10011 /// \brief Helper function to test whether a shuffle mask could be
10012 /// simplified by widening the elements being shuffled.
10013 ///
10014 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
10015 /// leaves it in an unspecified state.
10016 ///
10017 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10018 /// shuffle masks. The latter have the special property of a '-2' representing
10019 /// a zero-ed lane of a vector.
10020 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10021                                     SmallVectorImpl<int> &WidenedMask) {
10022   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10023     // If both elements are undef, its trivial.
10024     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10025       WidenedMask.push_back(SM_SentinelUndef);
10026       continue;
10027     }
10028
10029     // Check for an undef mask and a mask value properly aligned to fit with
10030     // a pair of values. If we find such a case, use the non-undef mask's value.
10031     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10032       WidenedMask.push_back(Mask[i + 1] / 2);
10033       continue;
10034     }
10035     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10036       WidenedMask.push_back(Mask[i] / 2);
10037       continue;
10038     }
10039
10040     // When zeroing, we need to spread the zeroing across both lanes to widen.
10041     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10042       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10043           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10044         WidenedMask.push_back(SM_SentinelZero);
10045         continue;
10046       }
10047       return false;
10048     }
10049
10050     // Finally check if the two mask values are adjacent and aligned with
10051     // a pair.
10052     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10053       WidenedMask.push_back(Mask[i] / 2);
10054       continue;
10055     }
10056
10057     // Otherwise we can't safely widen the elements used in this shuffle.
10058     return false;
10059   }
10060   assert(WidenedMask.size() == Mask.size() / 2 &&
10061          "Incorrect size of mask after widening the elements!");
10062
10063   return true;
10064 }
10065
10066 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
10067 ///
10068 /// This routine just extracts two subvectors, shuffles them independently, and
10069 /// then concatenates them back together. This should work effectively with all
10070 /// AVX vector shuffle types.
10071 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10072                                           SDValue V2, ArrayRef<int> Mask,
10073                                           SelectionDAG &DAG) {
10074   assert(VT.getSizeInBits() >= 256 &&
10075          "Only for 256-bit or wider vector shuffles!");
10076   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10077   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10078
10079   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10080   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10081
10082   int NumElements = VT.getVectorNumElements();
10083   int SplitNumElements = NumElements / 2;
10084   MVT ScalarVT = VT.getScalarType();
10085   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10086
10087   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10088                              DAG.getIntPtrConstant(0));
10089   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
10090                              DAG.getIntPtrConstant(SplitNumElements));
10091   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10092                              DAG.getIntPtrConstant(0));
10093   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
10094                              DAG.getIntPtrConstant(SplitNumElements));
10095
10096   // Now create two 4-way blends of these half-width vectors.
10097   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10098     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10099     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
10100     for (int i = 0; i < SplitNumElements; ++i) {
10101       int M = HalfMask[i];
10102       if (M >= NumElements) {
10103         if (M >= NumElements + SplitNumElements)
10104           UseHiV2 = true;
10105         else
10106           UseLoV2 = true;
10107         V2BlendMask.push_back(M - NumElements);
10108         V1BlendMask.push_back(-1);
10109         BlendMask.push_back(SplitNumElements + i);
10110       } else if (M >= 0) {
10111         if (M >= SplitNumElements)
10112           UseHiV1 = true;
10113         else
10114           UseLoV1 = true;
10115         V2BlendMask.push_back(-1);
10116         V1BlendMask.push_back(M);
10117         BlendMask.push_back(i);
10118       } else {
10119         V2BlendMask.push_back(-1);
10120         V1BlendMask.push_back(-1);
10121         BlendMask.push_back(-1);
10122       }
10123     }
10124
10125     // Because the lowering happens after all combining takes place, we need to
10126     // manually combine these blend masks as much as possible so that we create
10127     // a minimal number of high-level vector shuffle nodes.
10128
10129     // First try just blending the halves of V1 or V2.
10130     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10131       return DAG.getUNDEF(SplitVT);
10132     if (!UseLoV2 && !UseHiV2)
10133       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10134     if (!UseLoV1 && !UseHiV1)
10135       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10136
10137     SDValue V1Blend, V2Blend;
10138     if (UseLoV1 && UseHiV1) {
10139       V1Blend =
10140         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10141     } else {
10142       // We only use half of V1 so map the usage down into the final blend mask.
10143       V1Blend = UseLoV1 ? LoV1 : HiV1;
10144       for (int i = 0; i < SplitNumElements; ++i)
10145         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10146           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10147     }
10148     if (UseLoV2 && UseHiV2) {
10149       V2Blend =
10150         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10151     } else {
10152       // We only use half of V2 so map the usage down into the final blend mask.
10153       V2Blend = UseLoV2 ? LoV2 : HiV2;
10154       for (int i = 0; i < SplitNumElements; ++i)
10155         if (BlendMask[i] >= SplitNumElements)
10156           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10157     }
10158     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10159   };
10160   SDValue Lo = HalfBlend(LoMask);
10161   SDValue Hi = HalfBlend(HiMask);
10162   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10163 }
10164
10165 /// \brief Either split a vector in halves or decompose the shuffles and the
10166 /// blend.
10167 ///
10168 /// This is provided as a good fallback for many lowerings of non-single-input
10169 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10170 /// between splitting the shuffle into 128-bit components and stitching those
10171 /// back together vs. extracting the single-input shuffles and blending those
10172 /// results.
10173 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
10174                                                 SDValue V2, ArrayRef<int> Mask,
10175                                                 SelectionDAG &DAG) {
10176   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
10177                                             "lower single-input shuffles as it "
10178                                             "could then recurse on itself.");
10179   int Size = Mask.size();
10180
10181   // If this can be modeled as a broadcast of two elements followed by a blend,
10182   // prefer that lowering. This is especially important because broadcasts can
10183   // often fold with memory operands.
10184   auto DoBothBroadcast = [&] {
10185     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10186     for (int M : Mask)
10187       if (M >= Size) {
10188         if (V2BroadcastIdx == -1)
10189           V2BroadcastIdx = M - Size;
10190         else if (M - Size != V2BroadcastIdx)
10191           return false;
10192       } else if (M >= 0) {
10193         if (V1BroadcastIdx == -1)
10194           V1BroadcastIdx = M;
10195         else if (M != V1BroadcastIdx)
10196           return false;
10197       }
10198     return true;
10199   };
10200   if (DoBothBroadcast())
10201     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10202                                                       DAG);
10203
10204   // If the inputs all stem from a single 128-bit lane of each input, then we
10205   // split them rather than blending because the split will decompose to
10206   // unusually few instructions.
10207   int LaneCount = VT.getSizeInBits() / 128;
10208   int LaneSize = Size / LaneCount;
10209   SmallBitVector LaneInputs[2];
10210   LaneInputs[0].resize(LaneCount, false);
10211   LaneInputs[1].resize(LaneCount, false);
10212   for (int i = 0; i < Size; ++i)
10213     if (Mask[i] >= 0)
10214       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10215   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10216     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10217
10218   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10219   // that the decomposed single-input shuffles don't end up here.
10220   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10221 }
10222
10223 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10224 /// a permutation and blend of those lanes.
10225 ///
10226 /// This essentially blends the out-of-lane inputs to each lane into the lane
10227 /// from a permuted copy of the vector. This lowering strategy results in four
10228 /// instructions in the worst case for a single-input cross lane shuffle which
10229 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10230 /// of. Special cases for each particular shuffle pattern should be handled
10231 /// prior to trying this lowering.
10232 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10233                                                        SDValue V1, SDValue V2,
10234                                                        ArrayRef<int> Mask,
10235                                                        SelectionDAG &DAG) {
10236   // FIXME: This should probably be generalized for 512-bit vectors as well.
10237   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10238   int LaneSize = Mask.size() / 2;
10239
10240   // If there are only inputs from one 128-bit lane, splitting will in fact be
10241   // less expensive. The flags track wether the given lane contains an element
10242   // that crosses to another lane.
10243   bool LaneCrossing[2] = {false, false};
10244   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10245     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10246       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10247   if (!LaneCrossing[0] || !LaneCrossing[1])
10248     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10249
10250   if (isSingleInputShuffleMask(Mask)) {
10251     SmallVector<int, 32> FlippedBlendMask;
10252     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10253       FlippedBlendMask.push_back(
10254           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10255                                   ? Mask[i]
10256                                   : Mask[i] % LaneSize +
10257                                         (i / LaneSize) * LaneSize + Size));
10258
10259     // Flip the vector, and blend the results which should now be in-lane. The
10260     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10261     // 5 for the high source. The value 3 selects the high half of source 2 and
10262     // the value 2 selects the low half of source 2. We only use source 2 to
10263     // allow folding it into a memory operand.
10264     unsigned PERMMask = 3 | 2 << 4;
10265     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10266                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10267     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10268   }
10269
10270   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10271   // will be handled by the above logic and a blend of the results, much like
10272   // other patterns in AVX.
10273   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10274 }
10275
10276 /// \brief Handle lowering 2-lane 128-bit shuffles.
10277 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10278                                         SDValue V2, ArrayRef<int> Mask,
10279                                         const X86Subtarget *Subtarget,
10280                                         SelectionDAG &DAG) {
10281   // Blends are faster and handle all the non-lane-crossing cases.
10282   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10283                                                 Subtarget, DAG))
10284     return Blend;
10285
10286   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10287                                VT.getVectorNumElements() / 2);
10288   // Check for patterns which can be matched with a single insert of a 128-bit
10289   // subvector.
10290   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10291       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10292     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10293                               DAG.getIntPtrConstant(0));
10294     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10295                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10296     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10297   }
10298   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10299     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10300                               DAG.getIntPtrConstant(0));
10301     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10302                               DAG.getIntPtrConstant(2));
10303     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10304   }
10305
10306   // Otherwise form a 128-bit permutation.
10307   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10308   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10309   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10310                      DAG.getConstant(PermMask, MVT::i8));
10311 }
10312
10313 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10314 /// shuffling each lane.
10315 ///
10316 /// This will only succeed when the result of fixing the 128-bit lanes results
10317 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10318 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10319 /// the lane crosses early and then use simpler shuffles within each lane.
10320 ///
10321 /// FIXME: It might be worthwhile at some point to support this without
10322 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10323 /// in x86 only floating point has interesting non-repeating shuffles, and even
10324 /// those are still *marginally* more expensive.
10325 static SDValue lowerVectorShuffleByMerging128BitLanes(
10326     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10327     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10328   assert(!isSingleInputShuffleMask(Mask) &&
10329          "This is only useful with multiple inputs.");
10330
10331   int Size = Mask.size();
10332   int LaneSize = 128 / VT.getScalarSizeInBits();
10333   int NumLanes = Size / LaneSize;
10334   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10335
10336   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10337   // check whether the in-128-bit lane shuffles share a repeating pattern.
10338   SmallVector<int, 4> Lanes;
10339   Lanes.resize(NumLanes, -1);
10340   SmallVector<int, 4> InLaneMask;
10341   InLaneMask.resize(LaneSize, -1);
10342   for (int i = 0; i < Size; ++i) {
10343     if (Mask[i] < 0)
10344       continue;
10345
10346     int j = i / LaneSize;
10347
10348     if (Lanes[j] < 0) {
10349       // First entry we've seen for this lane.
10350       Lanes[j] = Mask[i] / LaneSize;
10351     } else if (Lanes[j] != Mask[i] / LaneSize) {
10352       // This doesn't match the lane selected previously!
10353       return SDValue();
10354     }
10355
10356     // Check that within each lane we have a consistent shuffle mask.
10357     int k = i % LaneSize;
10358     if (InLaneMask[k] < 0) {
10359       InLaneMask[k] = Mask[i] % LaneSize;
10360     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10361       // This doesn't fit a repeating in-lane mask.
10362       return SDValue();
10363     }
10364   }
10365
10366   // First shuffle the lanes into place.
10367   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10368                                 VT.getSizeInBits() / 64);
10369   SmallVector<int, 8> LaneMask;
10370   LaneMask.resize(NumLanes * 2, -1);
10371   for (int i = 0; i < NumLanes; ++i)
10372     if (Lanes[i] >= 0) {
10373       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10374       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10375     }
10376
10377   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10378   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10379   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10380
10381   // Cast it back to the type we actually want.
10382   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10383
10384   // Now do a simple shuffle that isn't lane crossing.
10385   SmallVector<int, 8> NewMask;
10386   NewMask.resize(Size, -1);
10387   for (int i = 0; i < Size; ++i)
10388     if (Mask[i] >= 0)
10389       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10390   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10391          "Must not introduce lane crosses at this point!");
10392
10393   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10394 }
10395
10396 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10397 /// given mask.
10398 ///
10399 /// This returns true if the elements from a particular input are already in the
10400 /// slot required by the given mask and require no permutation.
10401 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10402   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10403   int Size = Mask.size();
10404   for (int i = 0; i < Size; ++i)
10405     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10406       return false;
10407
10408   return true;
10409 }
10410
10411 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10412 ///
10413 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10414 /// isn't available.
10415 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10416                                        const X86Subtarget *Subtarget,
10417                                        SelectionDAG &DAG) {
10418   SDLoc DL(Op);
10419   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10420   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10421   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10422   ArrayRef<int> Mask = SVOp->getMask();
10423   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10424
10425   SmallVector<int, 4> WidenedMask;
10426   if (canWidenShuffleElements(Mask, WidenedMask))
10427     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10428                                     DAG);
10429
10430   if (isSingleInputShuffleMask(Mask)) {
10431     // Check for being able to broadcast a single element.
10432     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10433                                                           Mask, Subtarget, DAG))
10434       return Broadcast;
10435
10436     // Use low duplicate instructions for masks that match their pattern.
10437     if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
10438       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
10439
10440     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10441       // Non-half-crossing single input shuffles can be lowerid with an
10442       // interleaved permutation.
10443       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10444                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10445       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10446                          DAG.getConstant(VPERMILPMask, MVT::i8));
10447     }
10448
10449     // With AVX2 we have direct support for this permutation.
10450     if (Subtarget->hasAVX2())
10451       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10452                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10453
10454     // Otherwise, fall back.
10455     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10456                                                    DAG);
10457   }
10458
10459   // X86 has dedicated unpack instructions that can handle specific blend
10460   // operations: UNPCKH and UNPCKL.
10461   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10462     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10463   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10464     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10465
10466   // If we have a single input to the zero element, insert that into V1 if we
10467   // can do so cheaply.
10468   int NumV2Elements =
10469       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10470   if (NumV2Elements == 1 && Mask[0] >= 4)
10471     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10472             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10473       return Insertion;
10474
10475   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10476                                                 Subtarget, DAG))
10477     return Blend;
10478
10479   // Check if the blend happens to exactly fit that of SHUFPD.
10480   if ((Mask[0] == -1 || Mask[0] < 2) &&
10481       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10482       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10483       (Mask[3] == -1 || Mask[3] >= 6)) {
10484     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10485                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10486     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10487                        DAG.getConstant(SHUFPDMask, MVT::i8));
10488   }
10489   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10490       (Mask[1] == -1 || Mask[1] < 2) &&
10491       (Mask[2] == -1 || Mask[2] >= 6) &&
10492       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10493     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10494                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10495     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10496                        DAG.getConstant(SHUFPDMask, MVT::i8));
10497   }
10498
10499   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10500   // shuffle. However, if we have AVX2 and either inputs are already in place,
10501   // we will be able to shuffle even across lanes the other input in a single
10502   // instruction so skip this pattern.
10503   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10504                                  isShuffleMaskInputInPlace(1, Mask))))
10505     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10506             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10507       return Result;
10508
10509   // If we have AVX2 then we always want to lower with a blend because at v4 we
10510   // can fully permute the elements.
10511   if (Subtarget->hasAVX2())
10512     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10513                                                       Mask, DAG);
10514
10515   // Otherwise fall back on generic lowering.
10516   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10517 }
10518
10519 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10520 ///
10521 /// This routine is only called when we have AVX2 and thus a reasonable
10522 /// instruction set for v4i64 shuffling..
10523 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10524                                        const X86Subtarget *Subtarget,
10525                                        SelectionDAG &DAG) {
10526   SDLoc DL(Op);
10527   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10528   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10529   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10530   ArrayRef<int> Mask = SVOp->getMask();
10531   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10532   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10533
10534   SmallVector<int, 4> WidenedMask;
10535   if (canWidenShuffleElements(Mask, WidenedMask))
10536     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10537                                     DAG);
10538
10539   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10540                                                 Subtarget, DAG))
10541     return Blend;
10542
10543   // Check for being able to broadcast a single element.
10544   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10545                                                         Mask, Subtarget, DAG))
10546     return Broadcast;
10547
10548   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10549   // use lower latency instructions that will operate on both 128-bit lanes.
10550   SmallVector<int, 2> RepeatedMask;
10551   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10552     if (isSingleInputShuffleMask(Mask)) {
10553       int PSHUFDMask[] = {-1, -1, -1, -1};
10554       for (int i = 0; i < 2; ++i)
10555         if (RepeatedMask[i] >= 0) {
10556           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10557           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10558         }
10559       return DAG.getNode(
10560           ISD::BITCAST, DL, MVT::v4i64,
10561           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10562                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10563                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10564     }
10565
10566     // Use dedicated unpack instructions for masks that match their pattern.
10567     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10568       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10569     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10570       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10571   }
10572
10573   // AVX2 provides a direct instruction for permuting a single input across
10574   // lanes.
10575   if (isSingleInputShuffleMask(Mask))
10576     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10577                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10578
10579   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10580   // shuffle. However, if we have AVX2 and either inputs are already in place,
10581   // we will be able to shuffle even across lanes the other input in a single
10582   // instruction so skip this pattern.
10583   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10584                                  isShuffleMaskInputInPlace(1, Mask))))
10585     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10586             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10587       return Result;
10588
10589   // Otherwise fall back on generic blend lowering.
10590   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10591                                                     Mask, DAG);
10592 }
10593
10594 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10595 ///
10596 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10597 /// isn't available.
10598 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10599                                        const X86Subtarget *Subtarget,
10600                                        SelectionDAG &DAG) {
10601   SDLoc DL(Op);
10602   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10603   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10604   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10605   ArrayRef<int> Mask = SVOp->getMask();
10606   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10607
10608   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10609                                                 Subtarget, DAG))
10610     return Blend;
10611
10612   // Check for being able to broadcast a single element.
10613   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10614                                                         Mask, Subtarget, DAG))
10615     return Broadcast;
10616
10617   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10618   // options to efficiently lower the shuffle.
10619   SmallVector<int, 4> RepeatedMask;
10620   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10621     assert(RepeatedMask.size() == 4 &&
10622            "Repeated masks must be half the mask width!");
10623
10624     // Use even/odd duplicate instructions for masks that match their pattern.
10625     if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
10626       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
10627     if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
10628       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
10629
10630     if (isSingleInputShuffleMask(Mask))
10631       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10632                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10633
10634     // Use dedicated unpack instructions for masks that match their pattern.
10635     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10636       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10637     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10638       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10639
10640     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10641     // have already handled any direct blends. We also need to squash the
10642     // repeated mask into a simulated v4f32 mask.
10643     for (int i = 0; i < 4; ++i)
10644       if (RepeatedMask[i] >= 8)
10645         RepeatedMask[i] -= 4;
10646     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10647   }
10648
10649   // If we have a single input shuffle with different shuffle patterns in the
10650   // two 128-bit lanes use the variable mask to VPERMILPS.
10651   if (isSingleInputShuffleMask(Mask)) {
10652     SDValue VPermMask[8];
10653     for (int i = 0; i < 8; ++i)
10654       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10655                                  : DAG.getConstant(Mask[i], MVT::i32);
10656     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10657       return DAG.getNode(
10658           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10659           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10660
10661     if (Subtarget->hasAVX2())
10662       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10663                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10664                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10665                                                  MVT::v8i32, VPermMask)),
10666                          V1);
10667
10668     // Otherwise, fall back.
10669     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10670                                                    DAG);
10671   }
10672
10673   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10674   // shuffle.
10675   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10676           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10677     return Result;
10678
10679   // If we have AVX2 then we always want to lower with a blend because at v8 we
10680   // can fully permute the elements.
10681   if (Subtarget->hasAVX2())
10682     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10683                                                       Mask, DAG);
10684
10685   // Otherwise fall back on generic lowering.
10686   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10687 }
10688
10689 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10690 ///
10691 /// This routine is only called when we have AVX2 and thus a reasonable
10692 /// instruction set for v8i32 shuffling..
10693 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10694                                        const X86Subtarget *Subtarget,
10695                                        SelectionDAG &DAG) {
10696   SDLoc DL(Op);
10697   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10698   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10699   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10700   ArrayRef<int> Mask = SVOp->getMask();
10701   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10702   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10703
10704   // Whenever we can lower this as a zext, that instruction is strictly faster
10705   // than any alternative. It also allows us to fold memory operands into the
10706   // shuffle in many cases.
10707   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
10708                                                          Mask, Subtarget, DAG))
10709     return ZExt;
10710
10711   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10712                                                 Subtarget, DAG))
10713     return Blend;
10714
10715   // Check for being able to broadcast a single element.
10716   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10717                                                         Mask, Subtarget, DAG))
10718     return Broadcast;
10719
10720   // If the shuffle mask is repeated in each 128-bit lane we can use more
10721   // efficient instructions that mirror the shuffles across the two 128-bit
10722   // lanes.
10723   SmallVector<int, 4> RepeatedMask;
10724   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10725     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10726     if (isSingleInputShuffleMask(Mask))
10727       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10728                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10729
10730     // Use dedicated unpack instructions for masks that match their pattern.
10731     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10732       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10733     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10734       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10735   }
10736
10737   // If the shuffle patterns aren't repeated but it is a single input, directly
10738   // generate a cross-lane VPERMD instruction.
10739   if (isSingleInputShuffleMask(Mask)) {
10740     SDValue VPermMask[8];
10741     for (int i = 0; i < 8; ++i)
10742       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10743                                  : DAG.getConstant(Mask[i], MVT::i32);
10744     return DAG.getNode(
10745         X86ISD::VPERMV, DL, MVT::v8i32,
10746         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10747   }
10748
10749   // Try to use bit shift instructions.
10750   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10751           DL, MVT::v8i32, V1, V2, Mask, DAG))
10752     return Shift;
10753
10754   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10755   // shuffle.
10756   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10757           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10758     return Result;
10759
10760   // Otherwise fall back on generic blend lowering.
10761   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10762                                                     Mask, DAG);
10763 }
10764
10765 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10766 ///
10767 /// This routine is only called when we have AVX2 and thus a reasonable
10768 /// instruction set for v16i16 shuffling..
10769 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10770                                         const X86Subtarget *Subtarget,
10771                                         SelectionDAG &DAG) {
10772   SDLoc DL(Op);
10773   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10774   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10775   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10776   ArrayRef<int> Mask = SVOp->getMask();
10777   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10778   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10779
10780   // Whenever we can lower this as a zext, that instruction is strictly faster
10781   // than any alternative. It also allows us to fold memory operands into the
10782   // shuffle in many cases.
10783   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
10784                                                          Mask, Subtarget, DAG))
10785     return ZExt;
10786
10787   // Check for being able to broadcast a single element.
10788   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10789                                                         Mask, Subtarget, DAG))
10790     return Broadcast;
10791
10792   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10793                                                 Subtarget, DAG))
10794     return Blend;
10795
10796   // Use dedicated unpack instructions for masks that match their pattern.
10797   if (isShuffleEquivalent(Mask,
10798                           // First 128-bit lane:
10799                           0, 16, 1, 17, 2, 18, 3, 19,
10800                           // Second 128-bit lane:
10801                           8, 24, 9, 25, 10, 26, 11, 27))
10802     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10803   if (isShuffleEquivalent(Mask,
10804                           // First 128-bit lane:
10805                           4, 20, 5, 21, 6, 22, 7, 23,
10806                           // Second 128-bit lane:
10807                           12, 28, 13, 29, 14, 30, 15, 31))
10808     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10809
10810   if (isSingleInputShuffleMask(Mask)) {
10811     // There are no generalized cross-lane shuffle operations available on i16
10812     // element types.
10813     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10814       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10815                                                      Mask, DAG);
10816
10817     SDValue PSHUFBMask[32];
10818     for (int i = 0; i < 16; ++i) {
10819       if (Mask[i] == -1) {
10820         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10821         continue;
10822       }
10823
10824       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10825       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10826       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10827       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10828     }
10829     return DAG.getNode(
10830         ISD::BITCAST, DL, MVT::v16i16,
10831         DAG.getNode(
10832             X86ISD::PSHUFB, DL, MVT::v32i8,
10833             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10834             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10835   }
10836
10837   // Try to use bit shift instructions.
10838   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10839           DL, MVT::v16i16, V1, V2, Mask, DAG))
10840     return Shift;
10841
10842   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10843   // shuffle.
10844   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10845           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10846     return Result;
10847
10848   // Otherwise fall back on generic lowering.
10849   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10850 }
10851
10852 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10853 ///
10854 /// This routine is only called when we have AVX2 and thus a reasonable
10855 /// instruction set for v32i8 shuffling..
10856 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10857                                        const X86Subtarget *Subtarget,
10858                                        SelectionDAG &DAG) {
10859   SDLoc DL(Op);
10860   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10861   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10862   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10863   ArrayRef<int> Mask = SVOp->getMask();
10864   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10865   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10866
10867   // Whenever we can lower this as a zext, that instruction is strictly faster
10868   // than any alternative. It also allows us to fold memory operands into the
10869   // shuffle in many cases.
10870   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
10871                                                          Mask, Subtarget, DAG))
10872     return ZExt;
10873
10874   // Check for being able to broadcast a single element.
10875   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10876                                                         Mask, Subtarget, DAG))
10877     return Broadcast;
10878
10879   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10880                                                 Subtarget, DAG))
10881     return Blend;
10882
10883   // Use dedicated unpack instructions for masks that match their pattern.
10884   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10885   // 256-bit lanes.
10886   if (isShuffleEquivalent(
10887           Mask,
10888           // First 128-bit lane:
10889           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10890           // Second 128-bit lane:
10891           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10892     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10893   if (isShuffleEquivalent(
10894           Mask,
10895           // First 128-bit lane:
10896           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10897           // Second 128-bit lane:
10898           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10899     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10900
10901   if (isSingleInputShuffleMask(Mask)) {
10902     // There are no generalized cross-lane shuffle operations available on i8
10903     // element types.
10904     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10905       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10906                                                      Mask, DAG);
10907
10908     SDValue PSHUFBMask[32];
10909     for (int i = 0; i < 32; ++i)
10910       PSHUFBMask[i] =
10911           Mask[i] < 0
10912               ? DAG.getUNDEF(MVT::i8)
10913               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10914
10915     return DAG.getNode(
10916         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10917         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10918   }
10919
10920   // Try to use bit shift instructions.
10921   if (SDValue Shift = lowerVectorShuffleAsBitShift(
10922           DL, MVT::v32i8, V1, V2, Mask, DAG))
10923     return Shift;
10924
10925   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10926   // shuffle.
10927   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10928           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10929     return Result;
10930
10931   // Otherwise fall back on generic lowering.
10932   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10933 }
10934
10935 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10936 ///
10937 /// This routine either breaks down the specific type of a 256-bit x86 vector
10938 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10939 /// together based on the available instructions.
10940 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10941                                         MVT VT, const X86Subtarget *Subtarget,
10942                                         SelectionDAG &DAG) {
10943   SDLoc DL(Op);
10944   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10945   ArrayRef<int> Mask = SVOp->getMask();
10946
10947   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10948   // check for those subtargets here and avoid much of the subtarget querying in
10949   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10950   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10951   // floating point types there eventually, just immediately cast everything to
10952   // a float and operate entirely in that domain.
10953   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10954     int ElementBits = VT.getScalarSizeInBits();
10955     if (ElementBits < 32)
10956       // No floating point type available, decompose into 128-bit vectors.
10957       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10958
10959     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10960                                 VT.getVectorNumElements());
10961     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10962     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10963     return DAG.getNode(ISD::BITCAST, DL, VT,
10964                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10965   }
10966
10967   switch (VT.SimpleTy) {
10968   case MVT::v4f64:
10969     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10970   case MVT::v4i64:
10971     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10972   case MVT::v8f32:
10973     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10974   case MVT::v8i32:
10975     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10976   case MVT::v16i16:
10977     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10978   case MVT::v32i8:
10979     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10980
10981   default:
10982     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10983   }
10984 }
10985
10986 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10987 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10988                                        const X86Subtarget *Subtarget,
10989                                        SelectionDAG &DAG) {
10990   SDLoc DL(Op);
10991   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10992   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10993   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10994   ArrayRef<int> Mask = SVOp->getMask();
10995   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10996
10997   // X86 has dedicated unpack instructions that can handle specific blend
10998   // operations: UNPCKH and UNPCKL.
10999   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11000     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
11001   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11002     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
11003
11004   // FIXME: Implement direct support for this type!
11005   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
11006 }
11007
11008 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11009 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11010                                        const X86Subtarget *Subtarget,
11011                                        SelectionDAG &DAG) {
11012   SDLoc DL(Op);
11013   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11014   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11015   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11016   ArrayRef<int> Mask = SVOp->getMask();
11017   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11018
11019   // Use dedicated unpack instructions for masks that match their pattern.
11020   if (isShuffleEquivalent(Mask,
11021                           0, 16, 1, 17, 4, 20, 5, 21,
11022                           8, 24, 9, 25, 12, 28, 13, 29))
11023     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
11024   if (isShuffleEquivalent(Mask,
11025                           2, 18, 3, 19, 6, 22, 7, 23,
11026                           10, 26, 11, 27, 14, 30, 15, 31))
11027     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
11028
11029   // FIXME: Implement direct support for this type!
11030   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
11031 }
11032
11033 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11034 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11035                                        const X86Subtarget *Subtarget,
11036                                        SelectionDAG &DAG) {
11037   SDLoc DL(Op);
11038   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11039   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11040   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11041   ArrayRef<int> Mask = SVOp->getMask();
11042   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11043
11044   // X86 has dedicated unpack instructions that can handle specific blend
11045   // operations: UNPCKH and UNPCKL.
11046   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
11047     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
11048   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
11049     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
11050
11051   // FIXME: Implement direct support for this type!
11052   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
11053 }
11054
11055 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11056 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11057                                        const X86Subtarget *Subtarget,
11058                                        SelectionDAG &DAG) {
11059   SDLoc DL(Op);
11060   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11061   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11062   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11063   ArrayRef<int> Mask = SVOp->getMask();
11064   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11065
11066   // Use dedicated unpack instructions for masks that match their pattern.
11067   if (isShuffleEquivalent(Mask,
11068                           0, 16, 1, 17, 4, 20, 5, 21,
11069                           8, 24, 9, 25, 12, 28, 13, 29))
11070     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
11071   if (isShuffleEquivalent(Mask,
11072                           2, 18, 3, 19, 6, 22, 7, 23,
11073                           10, 26, 11, 27, 14, 30, 15, 31))
11074     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
11075
11076   // FIXME: Implement direct support for this type!
11077   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
11078 }
11079
11080 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11081 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11082                                         const X86Subtarget *Subtarget,
11083                                         SelectionDAG &DAG) {
11084   SDLoc DL(Op);
11085   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11086   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11087   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11088   ArrayRef<int> Mask = SVOp->getMask();
11089   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11090   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11091
11092   // FIXME: Implement direct support for this type!
11093   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
11094 }
11095
11096 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11097 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11098                                        const X86Subtarget *Subtarget,
11099                                        SelectionDAG &DAG) {
11100   SDLoc DL(Op);
11101   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11102   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11103   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11104   ArrayRef<int> Mask = SVOp->getMask();
11105   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11106   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11107
11108   // FIXME: Implement direct support for this type!
11109   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
11110 }
11111
11112 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
11113 ///
11114 /// This routine either breaks down the specific type of a 512-bit x86 vector
11115 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
11116 /// together based on the available instructions.
11117 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
11118                                         MVT VT, const X86Subtarget *Subtarget,
11119                                         SelectionDAG &DAG) {
11120   SDLoc DL(Op);
11121   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11122   ArrayRef<int> Mask = SVOp->getMask();
11123   assert(Subtarget->hasAVX512() &&
11124          "Cannot lower 512-bit vectors w/ basic ISA!");
11125
11126   // Check for being able to broadcast a single element.
11127   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
11128                                                         Mask, Subtarget, DAG))
11129     return Broadcast;
11130
11131   // Dispatch to each element type for lowering. If we don't have supprot for
11132   // specific element type shuffles at 512 bits, immediately split them and
11133   // lower them. Each lowering routine of a given type is allowed to assume that
11134   // the requisite ISA extensions for that element type are available.
11135   switch (VT.SimpleTy) {
11136   case MVT::v8f64:
11137     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11138   case MVT::v16f32:
11139     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11140   case MVT::v8i64:
11141     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
11142   case MVT::v16i32:
11143     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
11144   case MVT::v32i16:
11145     if (Subtarget->hasBWI())
11146       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
11147     break;
11148   case MVT::v64i8:
11149     if (Subtarget->hasBWI())
11150       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
11151     break;
11152
11153   default:
11154     llvm_unreachable("Not a valid 512-bit x86 vector type!");
11155   }
11156
11157   // Otherwise fall back on splitting.
11158   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11159 }
11160
11161 /// \brief Top-level lowering for x86 vector shuffles.
11162 ///
11163 /// This handles decomposition, canonicalization, and lowering of all x86
11164 /// vector shuffles. Most of the specific lowering strategies are encapsulated
11165 /// above in helper routines. The canonicalization attempts to widen shuffles
11166 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
11167 /// s.t. only one of the two inputs needs to be tested, etc.
11168 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
11169                                   SelectionDAG &DAG) {
11170   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11171   ArrayRef<int> Mask = SVOp->getMask();
11172   SDValue V1 = Op.getOperand(0);
11173   SDValue V2 = Op.getOperand(1);
11174   MVT VT = Op.getSimpleValueType();
11175   int NumElements = VT.getVectorNumElements();
11176   SDLoc dl(Op);
11177
11178   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
11179
11180   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
11181   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11182   if (V1IsUndef && V2IsUndef)
11183     return DAG.getUNDEF(VT);
11184
11185   // When we create a shuffle node we put the UNDEF node to second operand,
11186   // but in some cases the first operand may be transformed to UNDEF.
11187   // In this case we should just commute the node.
11188   if (V1IsUndef)
11189     return DAG.getCommutedVectorShuffle(*SVOp);
11190
11191   // Check for non-undef masks pointing at an undef vector and make the masks
11192   // undef as well. This makes it easier to match the shuffle based solely on
11193   // the mask.
11194   if (V2IsUndef)
11195     for (int M : Mask)
11196       if (M >= NumElements) {
11197         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
11198         for (int &M : NewMask)
11199           if (M >= NumElements)
11200             M = -1;
11201         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
11202       }
11203
11204   // Try to collapse shuffles into using a vector type with fewer elements but
11205   // wider element types. We cap this to not form integers or floating point
11206   // elements wider than 64 bits, but it might be interesting to form i128
11207   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
11208   SmallVector<int, 16> WidenedMask;
11209   if (VT.getScalarSizeInBits() < 64 &&
11210       canWidenShuffleElements(Mask, WidenedMask)) {
11211     MVT NewEltVT = VT.isFloatingPoint()
11212                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
11213                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
11214     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11215     // Make sure that the new vector type is legal. For example, v2f64 isn't
11216     // legal on SSE1.
11217     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11218       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
11219       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
11220       return DAG.getNode(ISD::BITCAST, dl, VT,
11221                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
11222     }
11223   }
11224
11225   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
11226   for (int M : SVOp->getMask())
11227     if (M < 0)
11228       ++NumUndefElements;
11229     else if (M < NumElements)
11230       ++NumV1Elements;
11231     else
11232       ++NumV2Elements;
11233
11234   // Commute the shuffle as needed such that more elements come from V1 than
11235   // V2. This allows us to match the shuffle pattern strictly on how many
11236   // elements come from V1 without handling the symmetric cases.
11237   if (NumV2Elements > NumV1Elements)
11238     return DAG.getCommutedVectorShuffle(*SVOp);
11239
11240   // When the number of V1 and V2 elements are the same, try to minimize the
11241   // number of uses of V2 in the low half of the vector. When that is tied,
11242   // ensure that the sum of indices for V1 is equal to or lower than the sum
11243   // indices for V2. When those are equal, try to ensure that the number of odd
11244   // indices for V1 is lower than the number of odd indices for V2.
11245   if (NumV1Elements == NumV2Elements) {
11246     int LowV1Elements = 0, LowV2Elements = 0;
11247     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11248       if (M >= NumElements)
11249         ++LowV2Elements;
11250       else if (M >= 0)
11251         ++LowV1Elements;
11252     if (LowV2Elements > LowV1Elements) {
11253       return DAG.getCommutedVectorShuffle(*SVOp);
11254     } else if (LowV2Elements == LowV1Elements) {
11255       int SumV1Indices = 0, SumV2Indices = 0;
11256       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11257         if (SVOp->getMask()[i] >= NumElements)
11258           SumV2Indices += i;
11259         else if (SVOp->getMask()[i] >= 0)
11260           SumV1Indices += i;
11261       if (SumV2Indices < SumV1Indices) {
11262         return DAG.getCommutedVectorShuffle(*SVOp);
11263       } else if (SumV2Indices == SumV1Indices) {
11264         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11265         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11266           if (SVOp->getMask()[i] >= NumElements)
11267             NumV2OddIndices += i % 2;
11268           else if (SVOp->getMask()[i] >= 0)
11269             NumV1OddIndices += i % 2;
11270         if (NumV2OddIndices < NumV1OddIndices)
11271           return DAG.getCommutedVectorShuffle(*SVOp);
11272       }
11273     }
11274   }
11275
11276   // For each vector width, delegate to a specialized lowering routine.
11277   if (VT.getSizeInBits() == 128)
11278     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11279
11280   if (VT.getSizeInBits() == 256)
11281     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11282
11283   // Force AVX-512 vectors to be scalarized for now.
11284   // FIXME: Implement AVX-512 support!
11285   if (VT.getSizeInBits() == 512)
11286     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11287
11288   llvm_unreachable("Unimplemented!");
11289 }
11290
11291
11292 //===----------------------------------------------------------------------===//
11293 // Legacy vector shuffle lowering
11294 //
11295 // This code is the legacy code handling vector shuffles until the above
11296 // replaces its functionality and performance.
11297 //===----------------------------------------------------------------------===//
11298
11299 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11300                         bool hasInt256, unsigned *MaskOut = nullptr) {
11301   MVT EltVT = VT.getVectorElementType();
11302
11303   // There is no blend with immediate in AVX-512.
11304   if (VT.is512BitVector())
11305     return false;
11306
11307   if (!hasSSE41 || EltVT == MVT::i8)
11308     return false;
11309   if (!hasInt256 && VT == MVT::v16i16)
11310     return false;
11311
11312   unsigned MaskValue = 0;
11313   unsigned NumElems = VT.getVectorNumElements();
11314   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11315   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11316   unsigned NumElemsInLane = NumElems / NumLanes;
11317
11318   // Blend for v16i16 should be symetric for the both lanes.
11319   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11320
11321     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11322     int EltIdx = MaskVals[i];
11323
11324     if ((EltIdx < 0 || EltIdx == (int)i) &&
11325         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11326       continue;
11327
11328     if (((unsigned)EltIdx == (i + NumElems)) &&
11329         (SndLaneEltIdx < 0 ||
11330          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11331       MaskValue |= (1 << i);
11332     else
11333       return false;
11334   }
11335
11336   if (MaskOut)
11337     *MaskOut = MaskValue;
11338   return true;
11339 }
11340
11341 // Try to lower a shuffle node into a simple blend instruction.
11342 // This function assumes isBlendMask returns true for this
11343 // SuffleVectorSDNode
11344 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11345                                           unsigned MaskValue,
11346                                           const X86Subtarget *Subtarget,
11347                                           SelectionDAG &DAG) {
11348   MVT VT = SVOp->getSimpleValueType(0);
11349   MVT EltVT = VT.getVectorElementType();
11350   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11351                      Subtarget->hasInt256() && "Trying to lower a "
11352                                                "VECTOR_SHUFFLE to a Blend but "
11353                                                "with the wrong mask"));
11354   SDValue V1 = SVOp->getOperand(0);
11355   SDValue V2 = SVOp->getOperand(1);
11356   SDLoc dl(SVOp);
11357   unsigned NumElems = VT.getVectorNumElements();
11358
11359   // Convert i32 vectors to floating point if it is not AVX2.
11360   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11361   MVT BlendVT = VT;
11362   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11363     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11364                                NumElems);
11365     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11366     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11367   }
11368
11369   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11370                             DAG.getConstant(MaskValue, MVT::i32));
11371   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11372 }
11373
11374 /// In vector type \p VT, return true if the element at index \p InputIdx
11375 /// falls on a different 128-bit lane than \p OutputIdx.
11376 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11377                                      unsigned OutputIdx) {
11378   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11379   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11380 }
11381
11382 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11383 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11384 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11385 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11386 /// zero.
11387 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11388                          SelectionDAG &DAG) {
11389   MVT VT = V1.getSimpleValueType();
11390   assert(VT.is128BitVector() || VT.is256BitVector());
11391
11392   MVT EltVT = VT.getVectorElementType();
11393   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11394   unsigned NumElts = VT.getVectorNumElements();
11395
11396   SmallVector<SDValue, 32> PshufbMask;
11397   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11398     int InputIdx = MaskVals[OutputIdx];
11399     unsigned InputByteIdx;
11400
11401     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11402       InputByteIdx = 0x80;
11403     else {
11404       // Cross lane is not allowed.
11405       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11406         return SDValue();
11407       InputByteIdx = InputIdx * EltSizeInBytes;
11408       // Index is an byte offset within the 128-bit lane.
11409       InputByteIdx &= 0xf;
11410     }
11411
11412     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11413       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11414       if (InputByteIdx != 0x80)
11415         ++InputByteIdx;
11416     }
11417   }
11418
11419   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11420   if (ShufVT != VT)
11421     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11422   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11423                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11424 }
11425
11426 // v8i16 shuffles - Prefer shuffles in the following order:
11427 // 1. [all]   pshuflw, pshufhw, optional move
11428 // 2. [ssse3] 1 x pshufb
11429 // 3. [ssse3] 2 x pshufb + 1 x por
11430 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11431 static SDValue
11432 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11433                          SelectionDAG &DAG) {
11434   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11435   SDValue V1 = SVOp->getOperand(0);
11436   SDValue V2 = SVOp->getOperand(1);
11437   SDLoc dl(SVOp);
11438   SmallVector<int, 8> MaskVals;
11439
11440   // Determine if more than 1 of the words in each of the low and high quadwords
11441   // of the result come from the same quadword of one of the two inputs.  Undef
11442   // mask values count as coming from any quadword, for better codegen.
11443   //
11444   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11445   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11446   unsigned LoQuad[] = { 0, 0, 0, 0 };
11447   unsigned HiQuad[] = { 0, 0, 0, 0 };
11448   // Indices of quads used.
11449   std::bitset<4> InputQuads;
11450   for (unsigned i = 0; i < 8; ++i) {
11451     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11452     int EltIdx = SVOp->getMaskElt(i);
11453     MaskVals.push_back(EltIdx);
11454     if (EltIdx < 0) {
11455       ++Quad[0];
11456       ++Quad[1];
11457       ++Quad[2];
11458       ++Quad[3];
11459       continue;
11460     }
11461     ++Quad[EltIdx / 4];
11462     InputQuads.set(EltIdx / 4);
11463   }
11464
11465   int BestLoQuad = -1;
11466   unsigned MaxQuad = 1;
11467   for (unsigned i = 0; i < 4; ++i) {
11468     if (LoQuad[i] > MaxQuad) {
11469       BestLoQuad = i;
11470       MaxQuad = LoQuad[i];
11471     }
11472   }
11473
11474   int BestHiQuad = -1;
11475   MaxQuad = 1;
11476   for (unsigned i = 0; i < 4; ++i) {
11477     if (HiQuad[i] > MaxQuad) {
11478       BestHiQuad = i;
11479       MaxQuad = HiQuad[i];
11480     }
11481   }
11482
11483   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11484   // of the two input vectors, shuffle them into one input vector so only a
11485   // single pshufb instruction is necessary. If there are more than 2 input
11486   // quads, disable the next transformation since it does not help SSSE3.
11487   bool V1Used = InputQuads[0] || InputQuads[1];
11488   bool V2Used = InputQuads[2] || InputQuads[3];
11489   if (Subtarget->hasSSSE3()) {
11490     if (InputQuads.count() == 2 && V1Used && V2Used) {
11491       BestLoQuad = InputQuads[0] ? 0 : 1;
11492       BestHiQuad = InputQuads[2] ? 2 : 3;
11493     }
11494     if (InputQuads.count() > 2) {
11495       BestLoQuad = -1;
11496       BestHiQuad = -1;
11497     }
11498   }
11499
11500   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11501   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11502   // words from all 4 input quadwords.
11503   SDValue NewV;
11504   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11505     int MaskV[] = {
11506       BestLoQuad < 0 ? 0 : BestLoQuad,
11507       BestHiQuad < 0 ? 1 : BestHiQuad
11508     };
11509     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11510                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11511                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11512     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11513
11514     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11515     // source words for the shuffle, to aid later transformations.
11516     bool AllWordsInNewV = true;
11517     bool InOrder[2] = { true, true };
11518     for (unsigned i = 0; i != 8; ++i) {
11519       int idx = MaskVals[i];
11520       if (idx != (int)i)
11521         InOrder[i/4] = false;
11522       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11523         continue;
11524       AllWordsInNewV = false;
11525       break;
11526     }
11527
11528     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11529     if (AllWordsInNewV) {
11530       for (int i = 0; i != 8; ++i) {
11531         int idx = MaskVals[i];
11532         if (idx < 0)
11533           continue;
11534         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11535         if ((idx != i) && idx < 4)
11536           pshufhw = false;
11537         if ((idx != i) && idx > 3)
11538           pshuflw = false;
11539       }
11540       V1 = NewV;
11541       V2Used = false;
11542       BestLoQuad = 0;
11543       BestHiQuad = 1;
11544     }
11545
11546     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11547     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11548     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11549       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11550       unsigned TargetMask = 0;
11551       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11552                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11553       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11554       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11555                              getShufflePSHUFLWImmediate(SVOp);
11556       V1 = NewV.getOperand(0);
11557       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11558     }
11559   }
11560
11561   // Promote splats to a larger type which usually leads to more efficient code.
11562   // FIXME: Is this true if pshufb is available?
11563   if (SVOp->isSplat())
11564     return PromoteSplat(SVOp, DAG);
11565
11566   // If we have SSSE3, and all words of the result are from 1 input vector,
11567   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11568   // is present, fall back to case 4.
11569   if (Subtarget->hasSSSE3()) {
11570     SmallVector<SDValue,16> pshufbMask;
11571
11572     // If we have elements from both input vectors, set the high bit of the
11573     // shuffle mask element to zero out elements that come from V2 in the V1
11574     // mask, and elements that come from V1 in the V2 mask, so that the two
11575     // results can be OR'd together.
11576     bool TwoInputs = V1Used && V2Used;
11577     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11578     if (!TwoInputs)
11579       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11580
11581     // Calculate the shuffle mask for the second input, shuffle it, and
11582     // OR it with the first shuffled input.
11583     CommuteVectorShuffleMask(MaskVals, 8);
11584     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11585     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11586     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11587   }
11588
11589   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11590   // and update MaskVals with new element order.
11591   std::bitset<8> InOrder;
11592   if (BestLoQuad >= 0) {
11593     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11594     for (int i = 0; i != 4; ++i) {
11595       int idx = MaskVals[i];
11596       if (idx < 0) {
11597         InOrder.set(i);
11598       } else if ((idx / 4) == BestLoQuad) {
11599         MaskV[i] = idx & 3;
11600         InOrder.set(i);
11601       }
11602     }
11603     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11604                                 &MaskV[0]);
11605
11606     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11607       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11608       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11609                                   NewV.getOperand(0),
11610                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11611     }
11612   }
11613
11614   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11615   // and update MaskVals with the new element order.
11616   if (BestHiQuad >= 0) {
11617     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11618     for (unsigned i = 4; i != 8; ++i) {
11619       int idx = MaskVals[i];
11620       if (idx < 0) {
11621         InOrder.set(i);
11622       } else if ((idx / 4) == BestHiQuad) {
11623         MaskV[i] = (idx & 3) + 4;
11624         InOrder.set(i);
11625       }
11626     }
11627     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11628                                 &MaskV[0]);
11629
11630     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11631       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11632       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11633                                   NewV.getOperand(0),
11634                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11635     }
11636   }
11637
11638   // In case BestHi & BestLo were both -1, which means each quadword has a word
11639   // from each of the four input quadwords, calculate the InOrder bitvector now
11640   // before falling through to the insert/extract cleanup.
11641   if (BestLoQuad == -1 && BestHiQuad == -1) {
11642     NewV = V1;
11643     for (int i = 0; i != 8; ++i)
11644       if (MaskVals[i] < 0 || MaskVals[i] == i)
11645         InOrder.set(i);
11646   }
11647
11648   // The other elements are put in the right place using pextrw and pinsrw.
11649   for (unsigned i = 0; i != 8; ++i) {
11650     if (InOrder[i])
11651       continue;
11652     int EltIdx = MaskVals[i];
11653     if (EltIdx < 0)
11654       continue;
11655     SDValue ExtOp = (EltIdx < 8) ?
11656       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11657                   DAG.getIntPtrConstant(EltIdx)) :
11658       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11659                   DAG.getIntPtrConstant(EltIdx - 8));
11660     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11661                        DAG.getIntPtrConstant(i));
11662   }
11663   return NewV;
11664 }
11665
11666 /// \brief v16i16 shuffles
11667 ///
11668 /// FIXME: We only support generation of a single pshufb currently.  We can
11669 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11670 /// well (e.g 2 x pshufb + 1 x por).
11671 static SDValue
11672 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11673   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11674   SDValue V1 = SVOp->getOperand(0);
11675   SDValue V2 = SVOp->getOperand(1);
11676   SDLoc dl(SVOp);
11677
11678   if (V2.getOpcode() != ISD::UNDEF)
11679     return SDValue();
11680
11681   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11682   return getPSHUFB(MaskVals, V1, dl, DAG);
11683 }
11684
11685 // v16i8 shuffles - Prefer shuffles in the following order:
11686 // 1. [ssse3] 1 x pshufb
11687 // 2. [ssse3] 2 x pshufb + 1 x por
11688 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11689 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11690                                         const X86Subtarget* Subtarget,
11691                                         SelectionDAG &DAG) {
11692   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11693   SDValue V1 = SVOp->getOperand(0);
11694   SDValue V2 = SVOp->getOperand(1);
11695   SDLoc dl(SVOp);
11696   ArrayRef<int> MaskVals = SVOp->getMask();
11697
11698   // Promote splats to a larger type which usually leads to more efficient code.
11699   // FIXME: Is this true if pshufb is available?
11700   if (SVOp->isSplat())
11701     return PromoteSplat(SVOp, DAG);
11702
11703   // If we have SSSE3, case 1 is generated when all result bytes come from
11704   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11705   // present, fall back to case 3.
11706
11707   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11708   if (Subtarget->hasSSSE3()) {
11709     SmallVector<SDValue,16> pshufbMask;
11710
11711     // If all result elements are from one input vector, then only translate
11712     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11713     //
11714     // Otherwise, we have elements from both input vectors, and must zero out
11715     // elements that come from V2 in the first mask, and V1 in the second mask
11716     // so that we can OR them together.
11717     for (unsigned i = 0; i != 16; ++i) {
11718       int EltIdx = MaskVals[i];
11719       if (EltIdx < 0 || EltIdx >= 16)
11720         EltIdx = 0x80;
11721       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11722     }
11723     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11724                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11725                                  MVT::v16i8, pshufbMask));
11726
11727     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11728     // the 2nd operand if it's undefined or zero.
11729     if (V2.getOpcode() == ISD::UNDEF ||
11730         ISD::isBuildVectorAllZeros(V2.getNode()))
11731       return V1;
11732
11733     // Calculate the shuffle mask for the second input, shuffle it, and
11734     // OR it with the first shuffled input.
11735     pshufbMask.clear();
11736     for (unsigned i = 0; i != 16; ++i) {
11737       int EltIdx = MaskVals[i];
11738       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11739       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11740     }
11741     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11742                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11743                                  MVT::v16i8, pshufbMask));
11744     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11745   }
11746
11747   // No SSSE3 - Calculate in place words and then fix all out of place words
11748   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11749   // the 16 different words that comprise the two doublequadword input vectors.
11750   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11751   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11752   SDValue NewV = V1;
11753   for (int i = 0; i != 8; ++i) {
11754     int Elt0 = MaskVals[i*2];
11755     int Elt1 = MaskVals[i*2+1];
11756
11757     // This word of the result is all undef, skip it.
11758     if (Elt0 < 0 && Elt1 < 0)
11759       continue;
11760
11761     // This word of the result is already in the correct place, skip it.
11762     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11763       continue;
11764
11765     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11766     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11767     SDValue InsElt;
11768
11769     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11770     // using a single extract together, load it and store it.
11771     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11772       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11773                            DAG.getIntPtrConstant(Elt1 / 2));
11774       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11775                         DAG.getIntPtrConstant(i));
11776       continue;
11777     }
11778
11779     // If Elt1 is defined, extract it from the appropriate source.  If the
11780     // source byte is not also odd, shift the extracted word left 8 bits
11781     // otherwise clear the bottom 8 bits if we need to do an or.
11782     if (Elt1 >= 0) {
11783       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11784                            DAG.getIntPtrConstant(Elt1 / 2));
11785       if ((Elt1 & 1) == 0)
11786         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11787                              DAG.getConstant(8,
11788                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11789       else if (Elt0 >= 0)
11790         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11791                              DAG.getConstant(0xFF00, MVT::i16));
11792     }
11793     // If Elt0 is defined, extract it from the appropriate source.  If the
11794     // source byte is not also even, shift the extracted word right 8 bits. If
11795     // Elt1 was also defined, OR the extracted values together before
11796     // inserting them in the result.
11797     if (Elt0 >= 0) {
11798       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11799                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11800       if ((Elt0 & 1) != 0)
11801         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11802                               DAG.getConstant(8,
11803                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11804       else if (Elt1 >= 0)
11805         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11806                              DAG.getConstant(0x00FF, MVT::i16));
11807       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11808                          : InsElt0;
11809     }
11810     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11811                        DAG.getIntPtrConstant(i));
11812   }
11813   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11814 }
11815
11816 // v32i8 shuffles - Translate to VPSHUFB if possible.
11817 static
11818 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11819                                  const X86Subtarget *Subtarget,
11820                                  SelectionDAG &DAG) {
11821   MVT VT = SVOp->getSimpleValueType(0);
11822   SDValue V1 = SVOp->getOperand(0);
11823   SDValue V2 = SVOp->getOperand(1);
11824   SDLoc dl(SVOp);
11825   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11826
11827   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11828   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11829   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11830
11831   // VPSHUFB may be generated if
11832   // (1) one of input vector is undefined or zeroinitializer.
11833   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11834   // And (2) the mask indexes don't cross the 128-bit lane.
11835   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11836       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11837     return SDValue();
11838
11839   if (V1IsAllZero && !V2IsAllZero) {
11840     CommuteVectorShuffleMask(MaskVals, 32);
11841     V1 = V2;
11842   }
11843   return getPSHUFB(MaskVals, V1, dl, DAG);
11844 }
11845
11846 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11847 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11848 /// done when every pair / quad of shuffle mask elements point to elements in
11849 /// the right sequence. e.g.
11850 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11851 static
11852 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11853                                  SelectionDAG &DAG) {
11854   MVT VT = SVOp->getSimpleValueType(0);
11855   SDLoc dl(SVOp);
11856   unsigned NumElems = VT.getVectorNumElements();
11857   MVT NewVT;
11858   unsigned Scale;
11859   switch (VT.SimpleTy) {
11860   default: llvm_unreachable("Unexpected!");
11861   case MVT::v2i64:
11862   case MVT::v2f64:
11863            return SDValue(SVOp, 0);
11864   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11865   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11866   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11867   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11868   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11869   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11870   }
11871
11872   SmallVector<int, 8> MaskVec;
11873   for (unsigned i = 0; i != NumElems; i += Scale) {
11874     int StartIdx = -1;
11875     for (unsigned j = 0; j != Scale; ++j) {
11876       int EltIdx = SVOp->getMaskElt(i+j);
11877       if (EltIdx < 0)
11878         continue;
11879       if (StartIdx < 0)
11880         StartIdx = (EltIdx / Scale);
11881       if (EltIdx != (int)(StartIdx*Scale + j))
11882         return SDValue();
11883     }
11884     MaskVec.push_back(StartIdx);
11885   }
11886
11887   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11888   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11889   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11890 }
11891
11892 /// getVZextMovL - Return a zero-extending vector move low node.
11893 ///
11894 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11895                             SDValue SrcOp, SelectionDAG &DAG,
11896                             const X86Subtarget *Subtarget, SDLoc dl) {
11897   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11898     LoadSDNode *LD = nullptr;
11899     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11900       LD = dyn_cast<LoadSDNode>(SrcOp);
11901     if (!LD) {
11902       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11903       // instead.
11904       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11905       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11906           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11907           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11908           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11909         // PR2108
11910         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11911         return DAG.getNode(ISD::BITCAST, dl, VT,
11912                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11913                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11914                                                    OpVT,
11915                                                    SrcOp.getOperand(0)
11916                                                           .getOperand(0))));
11917       }
11918     }
11919   }
11920
11921   return DAG.getNode(ISD::BITCAST, dl, VT,
11922                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11923                                  DAG.getNode(ISD::BITCAST, dl,
11924                                              OpVT, SrcOp)));
11925 }
11926
11927 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11928 /// which could not be matched by any known target speficic shuffle
11929 static SDValue
11930 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11931
11932   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11933   if (NewOp.getNode())
11934     return NewOp;
11935
11936   MVT VT = SVOp->getSimpleValueType(0);
11937
11938   unsigned NumElems = VT.getVectorNumElements();
11939   unsigned NumLaneElems = NumElems / 2;
11940
11941   SDLoc dl(SVOp);
11942   MVT EltVT = VT.getVectorElementType();
11943   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11944   SDValue Output[2];
11945
11946   SmallVector<int, 16> Mask;
11947   for (unsigned l = 0; l < 2; ++l) {
11948     // Build a shuffle mask for the output, discovering on the fly which
11949     // input vectors to use as shuffle operands (recorded in InputUsed).
11950     // If building a suitable shuffle vector proves too hard, then bail
11951     // out with UseBuildVector set.
11952     bool UseBuildVector = false;
11953     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11954     unsigned LaneStart = l * NumLaneElems;
11955     for (unsigned i = 0; i != NumLaneElems; ++i) {
11956       // The mask element.  This indexes into the input.
11957       int Idx = SVOp->getMaskElt(i+LaneStart);
11958       if (Idx < 0) {
11959         // the mask element does not index into any input vector.
11960         Mask.push_back(-1);
11961         continue;
11962       }
11963
11964       // The input vector this mask element indexes into.
11965       int Input = Idx / NumLaneElems;
11966
11967       // Turn the index into an offset from the start of the input vector.
11968       Idx -= Input * NumLaneElems;
11969
11970       // Find or create a shuffle vector operand to hold this input.
11971       unsigned OpNo;
11972       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11973         if (InputUsed[OpNo] == Input)
11974           // This input vector is already an operand.
11975           break;
11976         if (InputUsed[OpNo] < 0) {
11977           // Create a new operand for this input vector.
11978           InputUsed[OpNo] = Input;
11979           break;
11980         }
11981       }
11982
11983       if (OpNo >= array_lengthof(InputUsed)) {
11984         // More than two input vectors used!  Give up on trying to create a
11985         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11986         UseBuildVector = true;
11987         break;
11988       }
11989
11990       // Add the mask index for the new shuffle vector.
11991       Mask.push_back(Idx + OpNo * NumLaneElems);
11992     }
11993
11994     if (UseBuildVector) {
11995       SmallVector<SDValue, 16> SVOps;
11996       for (unsigned i = 0; i != NumLaneElems; ++i) {
11997         // The mask element.  This indexes into the input.
11998         int Idx = SVOp->getMaskElt(i+LaneStart);
11999         if (Idx < 0) {
12000           SVOps.push_back(DAG.getUNDEF(EltVT));
12001           continue;
12002         }
12003
12004         // The input vector this mask element indexes into.
12005         int Input = Idx / NumElems;
12006
12007         // Turn the index into an offset from the start of the input vector.
12008         Idx -= Input * NumElems;
12009
12010         // Extract the vector element by hand.
12011         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
12012                                     SVOp->getOperand(Input),
12013                                     DAG.getIntPtrConstant(Idx)));
12014       }
12015
12016       // Construct the output using a BUILD_VECTOR.
12017       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
12018     } else if (InputUsed[0] < 0) {
12019       // No input vectors were used! The result is undefined.
12020       Output[l] = DAG.getUNDEF(NVT);
12021     } else {
12022       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
12023                                         (InputUsed[0] % 2) * NumLaneElems,
12024                                         DAG, dl);
12025       // If only one input was used, use an undefined vector for the other.
12026       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
12027         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
12028                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
12029       // At least one input vector was used. Create a new shuffle vector.
12030       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
12031     }
12032
12033     Mask.clear();
12034   }
12035
12036   // Concatenate the result back
12037   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
12038 }
12039
12040 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
12041 /// 4 elements, and match them with several different shuffle types.
12042 static SDValue
12043 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
12044   SDValue V1 = SVOp->getOperand(0);
12045   SDValue V2 = SVOp->getOperand(1);
12046   SDLoc dl(SVOp);
12047   MVT VT = SVOp->getSimpleValueType(0);
12048
12049   assert(VT.is128BitVector() && "Unsupported vector size");
12050
12051   std::pair<int, int> Locs[4];
12052   int Mask1[] = { -1, -1, -1, -1 };
12053   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
12054
12055   unsigned NumHi = 0;
12056   unsigned NumLo = 0;
12057   for (unsigned i = 0; i != 4; ++i) {
12058     int Idx = PermMask[i];
12059     if (Idx < 0) {
12060       Locs[i] = std::make_pair(-1, -1);
12061     } else {
12062       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
12063       if (Idx < 4) {
12064         Locs[i] = std::make_pair(0, NumLo);
12065         Mask1[NumLo] = Idx;
12066         NumLo++;
12067       } else {
12068         Locs[i] = std::make_pair(1, NumHi);
12069         if (2+NumHi < 4)
12070           Mask1[2+NumHi] = Idx;
12071         NumHi++;
12072       }
12073     }
12074   }
12075
12076   if (NumLo <= 2 && NumHi <= 2) {
12077     // If no more than two elements come from either vector. This can be
12078     // implemented with two shuffles. First shuffle gather the elements.
12079     // The second shuffle, which takes the first shuffle as both of its
12080     // vector operands, put the elements into the right order.
12081     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12082
12083     int Mask2[] = { -1, -1, -1, -1 };
12084
12085     for (unsigned i = 0; i != 4; ++i)
12086       if (Locs[i].first != -1) {
12087         unsigned Idx = (i < 2) ? 0 : 4;
12088         Idx += Locs[i].first * 2 + Locs[i].second;
12089         Mask2[i] = Idx;
12090       }
12091
12092     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
12093   }
12094
12095   if (NumLo == 3 || NumHi == 3) {
12096     // Otherwise, we must have three elements from one vector, call it X, and
12097     // one element from the other, call it Y.  First, use a shufps to build an
12098     // intermediate vector with the one element from Y and the element from X
12099     // that will be in the same half in the final destination (the indexes don't
12100     // matter). Then, use a shufps to build the final vector, taking the half
12101     // containing the element from Y from the intermediate, and the other half
12102     // from X.
12103     if (NumHi == 3) {
12104       // Normalize it so the 3 elements come from V1.
12105       CommuteVectorShuffleMask(PermMask, 4);
12106       std::swap(V1, V2);
12107     }
12108
12109     // Find the element from V2.
12110     unsigned HiIndex;
12111     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
12112       int Val = PermMask[HiIndex];
12113       if (Val < 0)
12114         continue;
12115       if (Val >= 4)
12116         break;
12117     }
12118
12119     Mask1[0] = PermMask[HiIndex];
12120     Mask1[1] = -1;
12121     Mask1[2] = PermMask[HiIndex^1];
12122     Mask1[3] = -1;
12123     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12124
12125     if (HiIndex >= 2) {
12126       Mask1[0] = PermMask[0];
12127       Mask1[1] = PermMask[1];
12128       Mask1[2] = HiIndex & 1 ? 6 : 4;
12129       Mask1[3] = HiIndex & 1 ? 4 : 6;
12130       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
12131     }
12132
12133     Mask1[0] = HiIndex & 1 ? 2 : 0;
12134     Mask1[1] = HiIndex & 1 ? 0 : 2;
12135     Mask1[2] = PermMask[2];
12136     Mask1[3] = PermMask[3];
12137     if (Mask1[2] >= 0)
12138       Mask1[2] += 4;
12139     if (Mask1[3] >= 0)
12140       Mask1[3] += 4;
12141     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
12142   }
12143
12144   // Break it into (shuffle shuffle_hi, shuffle_lo).
12145   int LoMask[] = { -1, -1, -1, -1 };
12146   int HiMask[] = { -1, -1, -1, -1 };
12147
12148   int *MaskPtr = LoMask;
12149   unsigned MaskIdx = 0;
12150   unsigned LoIdx = 0;
12151   unsigned HiIdx = 2;
12152   for (unsigned i = 0; i != 4; ++i) {
12153     if (i == 2) {
12154       MaskPtr = HiMask;
12155       MaskIdx = 1;
12156       LoIdx = 0;
12157       HiIdx = 2;
12158     }
12159     int Idx = PermMask[i];
12160     if (Idx < 0) {
12161       Locs[i] = std::make_pair(-1, -1);
12162     } else if (Idx < 4) {
12163       Locs[i] = std::make_pair(MaskIdx, LoIdx);
12164       MaskPtr[LoIdx] = Idx;
12165       LoIdx++;
12166     } else {
12167       Locs[i] = std::make_pair(MaskIdx, HiIdx);
12168       MaskPtr[HiIdx] = Idx;
12169       HiIdx++;
12170     }
12171   }
12172
12173   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
12174   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
12175   int MaskOps[] = { -1, -1, -1, -1 };
12176   for (unsigned i = 0; i != 4; ++i)
12177     if (Locs[i].first != -1)
12178       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
12179   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
12180 }
12181
12182 static bool MayFoldVectorLoad(SDValue V) {
12183   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
12184     V = V.getOperand(0);
12185
12186   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
12187     V = V.getOperand(0);
12188   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
12189       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
12190     // BUILD_VECTOR (load), undef
12191     V = V.getOperand(0);
12192
12193   return MayFoldLoad(V);
12194 }
12195
12196 static
12197 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
12198   MVT VT = Op.getSimpleValueType();
12199
12200   // Canonizalize to v2f64.
12201   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
12202   return DAG.getNode(ISD::BITCAST, dl, VT,
12203                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
12204                                           V1, DAG));
12205 }
12206
12207 static
12208 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
12209                         bool HasSSE2) {
12210   SDValue V1 = Op.getOperand(0);
12211   SDValue V2 = Op.getOperand(1);
12212   MVT VT = Op.getSimpleValueType();
12213
12214   assert(VT != MVT::v2i64 && "unsupported shuffle type");
12215
12216   if (HasSSE2 && VT == MVT::v2f64)
12217     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
12218
12219   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
12220   return DAG.getNode(ISD::BITCAST, dl, VT,
12221                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
12222                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
12223                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
12224 }
12225
12226 static
12227 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
12228   SDValue V1 = Op.getOperand(0);
12229   SDValue V2 = Op.getOperand(1);
12230   MVT VT = Op.getSimpleValueType();
12231
12232   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12233          "unsupported shuffle type");
12234
12235   if (V2.getOpcode() == ISD::UNDEF)
12236     V2 = V1;
12237
12238   // v4i32 or v4f32
12239   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12240 }
12241
12242 static
12243 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12244   SDValue V1 = Op.getOperand(0);
12245   SDValue V2 = Op.getOperand(1);
12246   MVT VT = Op.getSimpleValueType();
12247   unsigned NumElems = VT.getVectorNumElements();
12248
12249   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12250   // operand of these instructions is only memory, so check if there's a
12251   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12252   // same masks.
12253   bool CanFoldLoad = false;
12254
12255   // Trivial case, when V2 comes from a load.
12256   if (MayFoldVectorLoad(V2))
12257     CanFoldLoad = true;
12258
12259   // When V1 is a load, it can be folded later into a store in isel, example:
12260   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12261   //    turns into:
12262   //  (MOVLPSmr addr:$src1, VR128:$src2)
12263   // So, recognize this potential and also use MOVLPS or MOVLPD
12264   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12265     CanFoldLoad = true;
12266
12267   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12268   if (CanFoldLoad) {
12269     if (HasSSE2 && NumElems == 2)
12270       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12271
12272     if (NumElems == 4)
12273       // If we don't care about the second element, proceed to use movss.
12274       if (SVOp->getMaskElt(1) != -1)
12275         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12276   }
12277
12278   // movl and movlp will both match v2i64, but v2i64 is never matched by
12279   // movl earlier because we make it strict to avoid messing with the movlp load
12280   // folding logic (see the code above getMOVLP call). Match it here then,
12281   // this is horrible, but will stay like this until we move all shuffle
12282   // matching to x86 specific nodes. Note that for the 1st condition all
12283   // types are matched with movsd.
12284   if (HasSSE2) {
12285     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12286     // as to remove this logic from here, as much as possible
12287     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12288       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12289     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12290   }
12291
12292   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12293
12294   // Invert the operand order and use SHUFPS to match it.
12295   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12296                               getShuffleSHUFImmediate(SVOp), DAG);
12297 }
12298
12299 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12300                                          SelectionDAG &DAG) {
12301   SDLoc dl(Load);
12302   MVT VT = Load->getSimpleValueType(0);
12303   MVT EVT = VT.getVectorElementType();
12304   SDValue Addr = Load->getOperand(1);
12305   SDValue NewAddr = DAG.getNode(
12306       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12307       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12308
12309   SDValue NewLoad =
12310       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12311                   DAG.getMachineFunction().getMachineMemOperand(
12312                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12313   return NewLoad;
12314 }
12315
12316 // It is only safe to call this function if isINSERTPSMask is true for
12317 // this shufflevector mask.
12318 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12319                            SelectionDAG &DAG) {
12320   // Generate an insertps instruction when inserting an f32 from memory onto a
12321   // v4f32 or when copying a member from one v4f32 to another.
12322   // We also use it for transferring i32 from one register to another,
12323   // since it simply copies the same bits.
12324   // If we're transferring an i32 from memory to a specific element in a
12325   // register, we output a generic DAG that will match the PINSRD
12326   // instruction.
12327   MVT VT = SVOp->getSimpleValueType(0);
12328   MVT EVT = VT.getVectorElementType();
12329   SDValue V1 = SVOp->getOperand(0);
12330   SDValue V2 = SVOp->getOperand(1);
12331   auto Mask = SVOp->getMask();
12332   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12333          "unsupported vector type for insertps/pinsrd");
12334
12335   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12336   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12337   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12338
12339   SDValue From;
12340   SDValue To;
12341   unsigned DestIndex;
12342   if (FromV1 == 1) {
12343     From = V1;
12344     To = V2;
12345     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12346                 Mask.begin();
12347
12348     // If we have 1 element from each vector, we have to check if we're
12349     // changing V1's element's place. If so, we're done. Otherwise, we
12350     // should assume we're changing V2's element's place and behave
12351     // accordingly.
12352     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12353     assert(DestIndex <= INT32_MAX && "truncated destination index");
12354     if (FromV1 == FromV2 &&
12355         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12356       From = V2;
12357       To = V1;
12358       DestIndex =
12359           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12360     }
12361   } else {
12362     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12363            "More than one element from V1 and from V2, or no elements from one "
12364            "of the vectors. This case should not have returned true from "
12365            "isINSERTPSMask");
12366     From = V2;
12367     To = V1;
12368     DestIndex =
12369         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12370   }
12371
12372   // Get an index into the source vector in the range [0,4) (the mask is
12373   // in the range [0,8) because it can address V1 and V2)
12374   unsigned SrcIndex = Mask[DestIndex] % 4;
12375   if (MayFoldLoad(From)) {
12376     // Trivial case, when From comes from a load and is only used by the
12377     // shuffle. Make it use insertps from the vector that we need from that
12378     // load.
12379     SDValue NewLoad =
12380         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12381     if (!NewLoad.getNode())
12382       return SDValue();
12383
12384     if (EVT == MVT::f32) {
12385       // Create this as a scalar to vector to match the instruction pattern.
12386       SDValue LoadScalarToVector =
12387           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12388       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12389       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12390                          InsertpsMask);
12391     } else { // EVT == MVT::i32
12392       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12393       // instruction, to match the PINSRD instruction, which loads an i32 to a
12394       // certain vector element.
12395       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12396                          DAG.getConstant(DestIndex, MVT::i32));
12397     }
12398   }
12399
12400   // Vector-element-to-vector
12401   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12402   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12403 }
12404
12405 // Reduce a vector shuffle to zext.
12406 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12407                                     SelectionDAG &DAG) {
12408   // PMOVZX is only available from SSE41.
12409   if (!Subtarget->hasSSE41())
12410     return SDValue();
12411
12412   MVT VT = Op.getSimpleValueType();
12413
12414   // Only AVX2 support 256-bit vector integer extending.
12415   if (!Subtarget->hasInt256() && VT.is256BitVector())
12416     return SDValue();
12417
12418   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12419   SDLoc DL(Op);
12420   SDValue V1 = Op.getOperand(0);
12421   SDValue V2 = Op.getOperand(1);
12422   unsigned NumElems = VT.getVectorNumElements();
12423
12424   // Extending is an unary operation and the element type of the source vector
12425   // won't be equal to or larger than i64.
12426   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12427       VT.getVectorElementType() == MVT::i64)
12428     return SDValue();
12429
12430   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12431   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12432   while ((1U << Shift) < NumElems) {
12433     if (SVOp->getMaskElt(1U << Shift) == 1)
12434       break;
12435     Shift += 1;
12436     // The maximal ratio is 8, i.e. from i8 to i64.
12437     if (Shift > 3)
12438       return SDValue();
12439   }
12440
12441   // Check the shuffle mask.
12442   unsigned Mask = (1U << Shift) - 1;
12443   for (unsigned i = 0; i != NumElems; ++i) {
12444     int EltIdx = SVOp->getMaskElt(i);
12445     if ((i & Mask) != 0 && EltIdx != -1)
12446       return SDValue();
12447     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12448       return SDValue();
12449   }
12450
12451   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12452   MVT NeVT = MVT::getIntegerVT(NBits);
12453   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12454
12455   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12456     return SDValue();
12457
12458   return DAG.getNode(ISD::BITCAST, DL, VT,
12459                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12460 }
12461
12462 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12463                                       SelectionDAG &DAG) {
12464   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12465   MVT VT = Op.getSimpleValueType();
12466   SDLoc dl(Op);
12467   SDValue V1 = Op.getOperand(0);
12468   SDValue V2 = Op.getOperand(1);
12469
12470   if (isZeroShuffle(SVOp))
12471     return getZeroVector(VT, Subtarget, DAG, dl);
12472
12473   // Handle splat operations
12474   if (SVOp->isSplat()) {
12475     // Use vbroadcast whenever the splat comes from a foldable load
12476     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12477     if (Broadcast.getNode())
12478       return Broadcast;
12479   }
12480
12481   // Check integer expanding shuffles.
12482   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12483   if (NewOp.getNode())
12484     return NewOp;
12485
12486   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12487   // do it!
12488   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12489       VT == MVT::v32i8) {
12490     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12491     if (NewOp.getNode())
12492       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12493   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12494     // FIXME: Figure out a cleaner way to do this.
12495     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12496       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12497       if (NewOp.getNode()) {
12498         MVT NewVT = NewOp.getSimpleValueType();
12499         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12500                                NewVT, true, false))
12501           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12502                               dl);
12503       }
12504     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12505       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12506       if (NewOp.getNode()) {
12507         MVT NewVT = NewOp.getSimpleValueType();
12508         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12509           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12510                               dl);
12511       }
12512     }
12513   }
12514   return SDValue();
12515 }
12516
12517 SDValue
12518 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12519   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12520   SDValue V1 = Op.getOperand(0);
12521   SDValue V2 = Op.getOperand(1);
12522   MVT VT = Op.getSimpleValueType();
12523   SDLoc dl(Op);
12524   unsigned NumElems = VT.getVectorNumElements();
12525   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12526   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12527   bool V1IsSplat = false;
12528   bool V2IsSplat = false;
12529   bool HasSSE2 = Subtarget->hasSSE2();
12530   bool HasFp256    = Subtarget->hasFp256();
12531   bool HasInt256   = Subtarget->hasInt256();
12532   MachineFunction &MF = DAG.getMachineFunction();
12533   bool OptForSize = MF.getFunction()->getAttributes().
12534     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12535
12536   // Check if we should use the experimental vector shuffle lowering. If so,
12537   // delegate completely to that code path.
12538   if (ExperimentalVectorShuffleLowering)
12539     return lowerVectorShuffle(Op, Subtarget, DAG);
12540
12541   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12542
12543   if (V1IsUndef && V2IsUndef)
12544     return DAG.getUNDEF(VT);
12545
12546   // When we create a shuffle node we put the UNDEF node to second operand,
12547   // but in some cases the first operand may be transformed to UNDEF.
12548   // In this case we should just commute the node.
12549   if (V1IsUndef)
12550     return DAG.getCommutedVectorShuffle(*SVOp);
12551
12552   // Vector shuffle lowering takes 3 steps:
12553   //
12554   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12555   //    narrowing and commutation of operands should be handled.
12556   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12557   //    shuffle nodes.
12558   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12559   //    so the shuffle can be broken into other shuffles and the legalizer can
12560   //    try the lowering again.
12561   //
12562   // The general idea is that no vector_shuffle operation should be left to
12563   // be matched during isel, all of them must be converted to a target specific
12564   // node here.
12565
12566   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12567   // narrowing and commutation of operands should be handled. The actual code
12568   // doesn't include all of those, work in progress...
12569   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12570   if (NewOp.getNode())
12571     return NewOp;
12572
12573   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12574
12575   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12576   // unpckh_undef). Only use pshufd if speed is more important than size.
12577   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12578     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12579   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12580     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12581
12582   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12583       V2IsUndef && MayFoldVectorLoad(V1))
12584     return getMOVDDup(Op, dl, V1, DAG);
12585
12586   if (isMOVHLPS_v_undef_Mask(M, VT))
12587     return getMOVHighToLow(Op, dl, DAG);
12588
12589   // Use to match splats
12590   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12591       (VT == MVT::v2f64 || VT == MVT::v2i64))
12592     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12593
12594   if (isPSHUFDMask(M, VT)) {
12595     // The actual implementation will match the mask in the if above and then
12596     // during isel it can match several different instructions, not only pshufd
12597     // as its name says, sad but true, emulate the behavior for now...
12598     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12599       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12600
12601     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12602
12603     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12604       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12605
12606     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12607       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12608                                   DAG);
12609
12610     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12611                                 TargetMask, DAG);
12612   }
12613
12614   if (isPALIGNRMask(M, VT, Subtarget))
12615     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12616                                 getShufflePALIGNRImmediate(SVOp),
12617                                 DAG);
12618
12619   if (isVALIGNMask(M, VT, Subtarget))
12620     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12621                                 getShuffleVALIGNImmediate(SVOp),
12622                                 DAG);
12623
12624   // Check if this can be converted into a logical shift.
12625   bool isLeft = false;
12626   unsigned ShAmt = 0;
12627   SDValue ShVal;
12628   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12629   if (isShift && ShVal.hasOneUse()) {
12630     // If the shifted value has multiple uses, it may be cheaper to use
12631     // v_set0 + movlhps or movhlps, etc.
12632     MVT EltVT = VT.getVectorElementType();
12633     ShAmt *= EltVT.getSizeInBits();
12634     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12635   }
12636
12637   if (isMOVLMask(M, VT)) {
12638     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12639       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12640     if (!isMOVLPMask(M, VT)) {
12641       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12642         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12643
12644       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12645         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12646     }
12647   }
12648
12649   // FIXME: fold these into legal mask.
12650   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12651     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12652
12653   if (isMOVHLPSMask(M, VT))
12654     return getMOVHighToLow(Op, dl, DAG);
12655
12656   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12657     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12658
12659   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12660     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12661
12662   if (isMOVLPMask(M, VT))
12663     return getMOVLP(Op, dl, DAG, HasSSE2);
12664
12665   if (ShouldXformToMOVHLPS(M, VT) ||
12666       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12667     return DAG.getCommutedVectorShuffle(*SVOp);
12668
12669   if (isShift) {
12670     // No better options. Use a vshldq / vsrldq.
12671     MVT EltVT = VT.getVectorElementType();
12672     ShAmt *= EltVT.getSizeInBits();
12673     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12674   }
12675
12676   bool Commuted = false;
12677   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12678   // 1,1,1,1 -> v8i16 though.
12679   BitVector UndefElements;
12680   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12681     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12682       V1IsSplat = true;
12683   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12684     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12685       V2IsSplat = true;
12686
12687   // Canonicalize the splat or undef, if present, to be on the RHS.
12688   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12689     CommuteVectorShuffleMask(M, NumElems);
12690     std::swap(V1, V2);
12691     std::swap(V1IsSplat, V2IsSplat);
12692     Commuted = true;
12693   }
12694
12695   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12696     // Shuffling low element of v1 into undef, just return v1.
12697     if (V2IsUndef)
12698       return V1;
12699     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12700     // the instruction selector will not match, so get a canonical MOVL with
12701     // swapped operands to undo the commute.
12702     return getMOVL(DAG, dl, VT, V2, V1);
12703   }
12704
12705   if (isUNPCKLMask(M, VT, HasInt256))
12706     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12707
12708   if (isUNPCKHMask(M, VT, HasInt256))
12709     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12710
12711   if (V2IsSplat) {
12712     // Normalize mask so all entries that point to V2 points to its first
12713     // element then try to match unpck{h|l} again. If match, return a
12714     // new vector_shuffle with the corrected mask.p
12715     SmallVector<int, 8> NewMask(M.begin(), M.end());
12716     NormalizeMask(NewMask, NumElems);
12717     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12718       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12719     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12720       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12721   }
12722
12723   if (Commuted) {
12724     // Commute is back and try unpck* again.
12725     // FIXME: this seems wrong.
12726     CommuteVectorShuffleMask(M, NumElems);
12727     std::swap(V1, V2);
12728     std::swap(V1IsSplat, V2IsSplat);
12729
12730     if (isUNPCKLMask(M, VT, HasInt256))
12731       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12732
12733     if (isUNPCKHMask(M, VT, HasInt256))
12734       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12735   }
12736
12737   // Normalize the node to match x86 shuffle ops if needed
12738   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12739     return DAG.getCommutedVectorShuffle(*SVOp);
12740
12741   // The checks below are all present in isShuffleMaskLegal, but they are
12742   // inlined here right now to enable us to directly emit target specific
12743   // nodes, and remove one by one until they don't return Op anymore.
12744
12745   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12746       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12747     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12748       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12749   }
12750
12751   if (isPSHUFHWMask(M, VT, HasInt256))
12752     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12753                                 getShufflePSHUFHWImmediate(SVOp),
12754                                 DAG);
12755
12756   if (isPSHUFLWMask(M, VT, HasInt256))
12757     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12758                                 getShufflePSHUFLWImmediate(SVOp),
12759                                 DAG);
12760
12761   unsigned MaskValue;
12762   if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
12763     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12764
12765   if (isSHUFPMask(M, VT))
12766     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12767                                 getShuffleSHUFImmediate(SVOp), DAG);
12768
12769   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12770     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12771   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12772     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12773
12774   //===--------------------------------------------------------------------===//
12775   // Generate target specific nodes for 128 or 256-bit shuffles only
12776   // supported in the AVX instruction set.
12777   //
12778
12779   // Handle VMOVDDUPY permutations
12780   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12781     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12782
12783   // Handle VPERMILPS/D* permutations
12784   if (isVPERMILPMask(M, VT)) {
12785     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12786       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12787                                   getShuffleSHUFImmediate(SVOp), DAG);
12788     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12789                                 getShuffleSHUFImmediate(SVOp), DAG);
12790   }
12791
12792   unsigned Idx;
12793   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12794     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12795                               Idx*(NumElems/2), DAG, dl);
12796
12797   // Handle VPERM2F128/VPERM2I128 permutations
12798   if (isVPERM2X128Mask(M, VT, HasFp256))
12799     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12800                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12801
12802   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12803     return getINSERTPS(SVOp, dl, DAG);
12804
12805   unsigned Imm8;
12806   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12807     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12808
12809   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12810       VT.is512BitVector()) {
12811     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12812     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12813     SmallVector<SDValue, 16> permclMask;
12814     for (unsigned i = 0; i != NumElems; ++i) {
12815       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12816     }
12817
12818     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12819     if (V2IsUndef)
12820       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12821       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12822                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12823     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12824                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12825   }
12826
12827   //===--------------------------------------------------------------------===//
12828   // Since no target specific shuffle was selected for this generic one,
12829   // lower it into other known shuffles. FIXME: this isn't true yet, but
12830   // this is the plan.
12831   //
12832
12833   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12834   if (VT == MVT::v8i16) {
12835     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12836     if (NewOp.getNode())
12837       return NewOp;
12838   }
12839
12840   if (VT == MVT::v16i16 && HasInt256) {
12841     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12842     if (NewOp.getNode())
12843       return NewOp;
12844   }
12845
12846   if (VT == MVT::v16i8) {
12847     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12848     if (NewOp.getNode())
12849       return NewOp;
12850   }
12851
12852   if (VT == MVT::v32i8) {
12853     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12854     if (NewOp.getNode())
12855       return NewOp;
12856   }
12857
12858   // Handle all 128-bit wide vectors with 4 elements, and match them with
12859   // several different shuffle types.
12860   if (NumElems == 4 && VT.is128BitVector())
12861     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12862
12863   // Handle general 256-bit shuffles
12864   if (VT.is256BitVector())
12865     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12866
12867   return SDValue();
12868 }
12869
12870 // This function assumes its argument is a BUILD_VECTOR of constants or
12871 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12872 // true.
12873 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12874                                     unsigned &MaskValue) {
12875   MaskValue = 0;
12876   unsigned NumElems = BuildVector->getNumOperands();
12877   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12878   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12879   unsigned NumElemsInLane = NumElems / NumLanes;
12880
12881   // Blend for v16i16 should be symetric for the both lanes.
12882   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12883     SDValue EltCond = BuildVector->getOperand(i);
12884     SDValue SndLaneEltCond =
12885         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12886
12887     int Lane1Cond = -1, Lane2Cond = -1;
12888     if (isa<ConstantSDNode>(EltCond))
12889       Lane1Cond = !isZero(EltCond);
12890     if (isa<ConstantSDNode>(SndLaneEltCond))
12891       Lane2Cond = !isZero(SndLaneEltCond);
12892
12893     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12894       // Lane1Cond != 0, means we want the first argument.
12895       // Lane1Cond == 0, means we want the second argument.
12896       // The encoding of this argument is 0 for the first argument, 1
12897       // for the second. Therefore, invert the condition.
12898       MaskValue |= !Lane1Cond << i;
12899     else if (Lane1Cond < 0)
12900       MaskValue |= !Lane2Cond << i;
12901     else
12902       return false;
12903   }
12904   return true;
12905 }
12906
12907 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12908 /// instruction.
12909 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12910                                     SelectionDAG &DAG) {
12911   SDValue Cond = Op.getOperand(0);
12912   SDValue LHS = Op.getOperand(1);
12913   SDValue RHS = Op.getOperand(2);
12914   SDLoc dl(Op);
12915   MVT VT = Op.getSimpleValueType();
12916   MVT EltVT = VT.getVectorElementType();
12917   unsigned NumElems = VT.getVectorNumElements();
12918
12919   // There is no blend with immediate in AVX-512.
12920   if (VT.is512BitVector())
12921     return SDValue();
12922
12923   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12924     return SDValue();
12925   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12926     return SDValue();
12927
12928   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12929     return SDValue();
12930
12931   // Check the mask for BLEND and build the value.
12932   unsigned MaskValue = 0;
12933   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12934     return SDValue();
12935
12936   // Convert i32 vectors to floating point if it is not AVX2.
12937   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12938   MVT BlendVT = VT;
12939   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12940     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12941                                NumElems);
12942     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12943     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12944   }
12945
12946   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12947                             DAG.getConstant(MaskValue, MVT::i32));
12948   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12949 }
12950
12951 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12952   // A vselect where all conditions and data are constants can be optimized into
12953   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12954   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12955       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12956       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12957     return SDValue();
12958
12959   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12960   if (BlendOp.getNode())
12961     return BlendOp;
12962
12963   // Some types for vselect were previously set to Expand, not Legal or
12964   // Custom. Return an empty SDValue so we fall-through to Expand, after
12965   // the Custom lowering phase.
12966   MVT VT = Op.getSimpleValueType();
12967   switch (VT.SimpleTy) {
12968   default:
12969     break;
12970   case MVT::v8i16:
12971   case MVT::v16i16:
12972     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12973       break;
12974     return SDValue();
12975   }
12976
12977   // We couldn't create a "Blend with immediate" node.
12978   // This node should still be legal, but we'll have to emit a blendv*
12979   // instruction.
12980   return Op;
12981 }
12982
12983 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12984   MVT VT = Op.getSimpleValueType();
12985   SDLoc dl(Op);
12986
12987   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12988     return SDValue();
12989
12990   if (VT.getSizeInBits() == 8) {
12991     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12992                                   Op.getOperand(0), Op.getOperand(1));
12993     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12994                                   DAG.getValueType(VT));
12995     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12996   }
12997
12998   if (VT.getSizeInBits() == 16) {
12999     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13000     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
13001     if (Idx == 0)
13002       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13003                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13004                                      DAG.getNode(ISD::BITCAST, dl,
13005                                                  MVT::v4i32,
13006                                                  Op.getOperand(0)),
13007                                      Op.getOperand(1)));
13008     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13009                                   Op.getOperand(0), Op.getOperand(1));
13010     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13011                                   DAG.getValueType(VT));
13012     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13013   }
13014
13015   if (VT == MVT::f32) {
13016     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13017     // the result back to FR32 register. It's only worth matching if the
13018     // result has a single use which is a store or a bitcast to i32.  And in
13019     // the case of a store, it's not worth it if the index is a constant 0,
13020     // because a MOVSSmr can be used instead, which is smaller and faster.
13021     if (!Op.hasOneUse())
13022       return SDValue();
13023     SDNode *User = *Op.getNode()->use_begin();
13024     if ((User->getOpcode() != ISD::STORE ||
13025          (isa<ConstantSDNode>(Op.getOperand(1)) &&
13026           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
13027         (User->getOpcode() != ISD::BITCAST ||
13028          User->getValueType(0) != MVT::i32))
13029       return SDValue();
13030     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13031                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
13032                                               Op.getOperand(0)),
13033                                               Op.getOperand(1));
13034     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
13035   }
13036
13037   if (VT == MVT::i32 || VT == MVT::i64) {
13038     // ExtractPS/pextrq works with constant index.
13039     if (isa<ConstantSDNode>(Op.getOperand(1)))
13040       return Op;
13041   }
13042   return SDValue();
13043 }
13044
13045 /// Extract one bit from mask vector, like v16i1 or v8i1.
13046 /// AVX-512 feature.
13047 SDValue
13048 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13049   SDValue Vec = Op.getOperand(0);
13050   SDLoc dl(Vec);
13051   MVT VecVT = Vec.getSimpleValueType();
13052   SDValue Idx = Op.getOperand(1);
13053   MVT EltVT = Op.getSimpleValueType();
13054
13055   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13056   assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
13057          "Unexpected vector type in ExtractBitFromMaskVector");
13058
13059   // variable index can't be handled in mask registers,
13060   // extend vector to VR512
13061   if (!isa<ConstantSDNode>(Idx)) {
13062     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13063     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13064     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13065                               ExtVT.getVectorElementType(), Ext, Idx);
13066     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13067   }
13068
13069   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13070   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13071   if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
13072     rc = getRegClassFor(MVT::v16i1);
13073   unsigned MaxSift = rc->getSize()*8 - 1;
13074   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13075                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13076   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13077                     DAG.getConstant(MaxSift, MVT::i8));
13078   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13079                        DAG.getIntPtrConstant(0));
13080 }
13081
13082 SDValue
13083 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13084                                            SelectionDAG &DAG) const {
13085   SDLoc dl(Op);
13086   SDValue Vec = Op.getOperand(0);
13087   MVT VecVT = Vec.getSimpleValueType();
13088   SDValue Idx = Op.getOperand(1);
13089
13090   if (Op.getSimpleValueType() == MVT::i1)
13091     return ExtractBitFromMaskVector(Op, DAG);
13092
13093   if (!isa<ConstantSDNode>(Idx)) {
13094     if (VecVT.is512BitVector() ||
13095         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
13096          VecVT.getVectorElementType().getSizeInBits() == 32)) {
13097
13098       MVT MaskEltVT =
13099         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
13100       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13101                                     MaskEltVT.getSizeInBits());
13102
13103       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13104       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13105                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
13106                                 Idx, DAG.getConstant(0, getPointerTy()));
13107       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13108       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
13109                         Perm, DAG.getConstant(0, getPointerTy()));
13110     }
13111     return SDValue();
13112   }
13113
13114   // If this is a 256-bit vector result, first extract the 128-bit vector and
13115   // then extract the element from the 128-bit vector.
13116   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13117
13118     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13119     // Get the 128-bit vector.
13120     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
13121     MVT EltVT = VecVT.getVectorElementType();
13122
13123     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13124
13125     //if (IdxVal >= NumElems/2)
13126     //  IdxVal -= NumElems/2;
13127     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
13128     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13129                        DAG.getConstant(IdxVal, MVT::i32));
13130   }
13131
13132   assert(VecVT.is128BitVector() && "Unexpected vector length");
13133
13134   if (Subtarget->hasSSE41()) {
13135     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
13136     if (Res.getNode())
13137       return Res;
13138   }
13139
13140   MVT VT = Op.getSimpleValueType();
13141   // TODO: handle v16i8.
13142   if (VT.getSizeInBits() == 16) {
13143     SDValue Vec = Op.getOperand(0);
13144     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13145     if (Idx == 0)
13146       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13147                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13148                                      DAG.getNode(ISD::BITCAST, dl,
13149                                                  MVT::v4i32, Vec),
13150                                      Op.getOperand(1)));
13151     // Transform it so it match pextrw which produces a 32-bit result.
13152     MVT EltVT = MVT::i32;
13153     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
13154                                   Op.getOperand(0), Op.getOperand(1));
13155     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
13156                                   DAG.getValueType(VT));
13157     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13158   }
13159
13160   if (VT.getSizeInBits() == 32) {
13161     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13162     if (Idx == 0)
13163       return Op;
13164
13165     // SHUFPS the element to the lowest double word, then movss.
13166     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
13167     MVT VVT = Op.getOperand(0).getSimpleValueType();
13168     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13169                                        DAG.getUNDEF(VVT), Mask);
13170     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13171                        DAG.getIntPtrConstant(0));
13172   }
13173
13174   if (VT.getSizeInBits() == 64) {
13175     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13176     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13177     //        to match extract_elt for f64.
13178     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
13179     if (Idx == 0)
13180       return Op;
13181
13182     // UNPCKHPD the element to the lowest double word, then movsd.
13183     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13184     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13185     int Mask[2] = { 1, -1 };
13186     MVT VVT = Op.getOperand(0).getSimpleValueType();
13187     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
13188                                        DAG.getUNDEF(VVT), Mask);
13189     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13190                        DAG.getIntPtrConstant(0));
13191   }
13192
13193   return SDValue();
13194 }
13195
13196 /// Insert one bit to mask vector, like v16i1 or v8i1.
13197 /// AVX-512 feature.
13198 SDValue
13199 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13200   SDLoc dl(Op);
13201   SDValue Vec = Op.getOperand(0);
13202   SDValue Elt = Op.getOperand(1);
13203   SDValue Idx = Op.getOperand(2);
13204   MVT VecVT = Vec.getSimpleValueType();
13205
13206   if (!isa<ConstantSDNode>(Idx)) {
13207     // Non constant index. Extend source and destination,
13208     // insert element and then truncate the result.
13209     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
13210     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
13211     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13212       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13213       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13214     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13215   }
13216
13217   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13218   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13219   if (Vec.getOpcode() == ISD::UNDEF)
13220     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13221                        DAG.getConstant(IdxVal, MVT::i8));
13222   const TargetRegisterClass* rc = getRegClassFor(VecVT);
13223   unsigned MaxSift = rc->getSize()*8 - 1;
13224   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13225                     DAG.getConstant(MaxSift, MVT::i8));
13226   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
13227                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
13228   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13229 }
13230
13231 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13232                                                   SelectionDAG &DAG) const {
13233   MVT VT = Op.getSimpleValueType();
13234   MVT EltVT = VT.getVectorElementType();
13235
13236   if (EltVT == MVT::i1)
13237     return InsertBitToMaskVector(Op, DAG);
13238
13239   SDLoc dl(Op);
13240   SDValue N0 = Op.getOperand(0);
13241   SDValue N1 = Op.getOperand(1);
13242   SDValue N2 = Op.getOperand(2);
13243   if (!isa<ConstantSDNode>(N2))
13244     return SDValue();
13245   auto *N2C = cast<ConstantSDNode>(N2);
13246   unsigned IdxVal = N2C->getZExtValue();
13247
13248   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13249   // into that, and then insert the subvector back into the result.
13250   if (VT.is256BitVector() || VT.is512BitVector()) {
13251     // Get the desired 128-bit vector half.
13252     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13253
13254     // Insert the element into the desired half.
13255     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13256     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13257
13258     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13259                     DAG.getConstant(IdxIn128, MVT::i32));
13260
13261     // Insert the changed part back to the 256-bit vector
13262     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13263   }
13264   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13265
13266   if (Subtarget->hasSSE41()) {
13267     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13268       unsigned Opc;
13269       if (VT == MVT::v8i16) {
13270         Opc = X86ISD::PINSRW;
13271       } else {
13272         assert(VT == MVT::v16i8);
13273         Opc = X86ISD::PINSRB;
13274       }
13275
13276       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13277       // argument.
13278       if (N1.getValueType() != MVT::i32)
13279         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13280       if (N2.getValueType() != MVT::i32)
13281         N2 = DAG.getIntPtrConstant(IdxVal);
13282       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13283     }
13284
13285     if (EltVT == MVT::f32) {
13286       // Bits [7:6] of the constant are the source select.  This will always be
13287       //  zero here.  The DAG Combiner may combine an extract_elt index into
13288       //  these
13289       //  bits.  For example (insert (extract, 3), 2) could be matched by
13290       //  putting
13291       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13292       // Bits [5:4] of the constant are the destination select.  This is the
13293       //  value of the incoming immediate.
13294       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13295       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13296       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13297       // Create this as a scalar to vector..
13298       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13299       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13300     }
13301
13302     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13303       // PINSR* works with constant index.
13304       return Op;
13305     }
13306   }
13307
13308   if (EltVT == MVT::i8)
13309     return SDValue();
13310
13311   if (EltVT.getSizeInBits() == 16) {
13312     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13313     // as its second argument.
13314     if (N1.getValueType() != MVT::i32)
13315       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13316     if (N2.getValueType() != MVT::i32)
13317       N2 = DAG.getIntPtrConstant(IdxVal);
13318     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13319   }
13320   return SDValue();
13321 }
13322
13323 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13324   SDLoc dl(Op);
13325   MVT OpVT = Op.getSimpleValueType();
13326
13327   // If this is a 256-bit vector result, first insert into a 128-bit
13328   // vector and then insert into the 256-bit vector.
13329   if (!OpVT.is128BitVector()) {
13330     // Insert into a 128-bit vector.
13331     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13332     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13333                                  OpVT.getVectorNumElements() / SizeFactor);
13334
13335     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13336
13337     // Insert the 128-bit vector.
13338     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13339   }
13340
13341   if (OpVT == MVT::v1i64 &&
13342       Op.getOperand(0).getValueType() == MVT::i64)
13343     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13344
13345   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13346   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13347   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13348                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13349 }
13350
13351 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13352 // a simple subregister reference or explicit instructions to grab
13353 // upper bits of a vector.
13354 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13355                                       SelectionDAG &DAG) {
13356   SDLoc dl(Op);
13357   SDValue In =  Op.getOperand(0);
13358   SDValue Idx = Op.getOperand(1);
13359   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13360   MVT ResVT   = Op.getSimpleValueType();
13361   MVT InVT    = In.getSimpleValueType();
13362
13363   if (Subtarget->hasFp256()) {
13364     if (ResVT.is128BitVector() &&
13365         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13366         isa<ConstantSDNode>(Idx)) {
13367       return Extract128BitVector(In, IdxVal, DAG, dl);
13368     }
13369     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13370         isa<ConstantSDNode>(Idx)) {
13371       return Extract256BitVector(In, IdxVal, DAG, dl);
13372     }
13373   }
13374   return SDValue();
13375 }
13376
13377 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13378 // simple superregister reference or explicit instructions to insert
13379 // the upper bits of a vector.
13380 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13381                                      SelectionDAG &DAG) {
13382   if (!Subtarget->hasAVX())
13383     return SDValue();
13384
13385   SDLoc dl(Op);
13386   SDValue Vec = Op.getOperand(0);
13387   SDValue SubVec = Op.getOperand(1);
13388   SDValue Idx = Op.getOperand(2);
13389
13390   if (!isa<ConstantSDNode>(Idx))
13391     return SDValue();
13392
13393   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13394   MVT OpVT = Op.getSimpleValueType();
13395   MVT SubVecVT = SubVec.getSimpleValueType();
13396
13397   // Fold two 16-byte subvector loads into one 32-byte load:
13398   // (insert_subvector (insert_subvector undef, (load addr), 0),
13399   //                   (load addr + 16), Elts/2)
13400   // --> load32 addr
13401   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13402       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13403       OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
13404       !Subtarget->isUnalignedMem32Slow()) {
13405     SDValue SubVec2 = Vec.getOperand(1);
13406     if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
13407       if (Idx2->getZExtValue() == 0) {
13408         SDValue Ops[] = { SubVec2, SubVec };
13409         SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
13410         if (LD.getNode())
13411           return LD;
13412       }
13413     }
13414   }
13415
13416   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13417       SubVecVT.is128BitVector())
13418     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13419
13420   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
13421     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13422
13423   return SDValue();
13424 }
13425
13426 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13427 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13428 // one of the above mentioned nodes. It has to be wrapped because otherwise
13429 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13430 // be used to form addressing mode. These wrapped nodes will be selected
13431 // into MOV32ri.
13432 SDValue
13433 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13434   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13435
13436   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13437   // global base reg.
13438   unsigned char OpFlag = 0;
13439   unsigned WrapperKind = X86ISD::Wrapper;
13440   CodeModel::Model M = DAG.getTarget().getCodeModel();
13441
13442   if (Subtarget->isPICStyleRIPRel() &&
13443       (M == CodeModel::Small || M == CodeModel::Kernel))
13444     WrapperKind = X86ISD::WrapperRIP;
13445   else if (Subtarget->isPICStyleGOT())
13446     OpFlag = X86II::MO_GOTOFF;
13447   else if (Subtarget->isPICStyleStubPIC())
13448     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13449
13450   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13451                                              CP->getAlignment(),
13452                                              CP->getOffset(), OpFlag);
13453   SDLoc DL(CP);
13454   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13455   // With PIC, the address is actually $g + Offset.
13456   if (OpFlag) {
13457     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13458                          DAG.getNode(X86ISD::GlobalBaseReg,
13459                                      SDLoc(), getPointerTy()),
13460                          Result);
13461   }
13462
13463   return Result;
13464 }
13465
13466 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13467   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13468
13469   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13470   // global base reg.
13471   unsigned char OpFlag = 0;
13472   unsigned WrapperKind = X86ISD::Wrapper;
13473   CodeModel::Model M = DAG.getTarget().getCodeModel();
13474
13475   if (Subtarget->isPICStyleRIPRel() &&
13476       (M == CodeModel::Small || M == CodeModel::Kernel))
13477     WrapperKind = X86ISD::WrapperRIP;
13478   else if (Subtarget->isPICStyleGOT())
13479     OpFlag = X86II::MO_GOTOFF;
13480   else if (Subtarget->isPICStyleStubPIC())
13481     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13482
13483   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13484                                           OpFlag);
13485   SDLoc DL(JT);
13486   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13487
13488   // With PIC, the address is actually $g + Offset.
13489   if (OpFlag)
13490     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13491                          DAG.getNode(X86ISD::GlobalBaseReg,
13492                                      SDLoc(), getPointerTy()),
13493                          Result);
13494
13495   return Result;
13496 }
13497
13498 SDValue
13499 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13500   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13501
13502   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13503   // global base reg.
13504   unsigned char OpFlag = 0;
13505   unsigned WrapperKind = X86ISD::Wrapper;
13506   CodeModel::Model M = DAG.getTarget().getCodeModel();
13507
13508   if (Subtarget->isPICStyleRIPRel() &&
13509       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13510     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13511       OpFlag = X86II::MO_GOTPCREL;
13512     WrapperKind = X86ISD::WrapperRIP;
13513   } else if (Subtarget->isPICStyleGOT()) {
13514     OpFlag = X86II::MO_GOT;
13515   } else if (Subtarget->isPICStyleStubPIC()) {
13516     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13517   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13518     OpFlag = X86II::MO_DARWIN_NONLAZY;
13519   }
13520
13521   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13522
13523   SDLoc DL(Op);
13524   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13525
13526   // With PIC, the address is actually $g + Offset.
13527   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13528       !Subtarget->is64Bit()) {
13529     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13530                          DAG.getNode(X86ISD::GlobalBaseReg,
13531                                      SDLoc(), getPointerTy()),
13532                          Result);
13533   }
13534
13535   // For symbols that require a load from a stub to get the address, emit the
13536   // load.
13537   if (isGlobalStubReference(OpFlag))
13538     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13539                          MachinePointerInfo::getGOT(), false, false, false, 0);
13540
13541   return Result;
13542 }
13543
13544 SDValue
13545 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13546   // Create the TargetBlockAddressAddress node.
13547   unsigned char OpFlags =
13548     Subtarget->ClassifyBlockAddressReference();
13549   CodeModel::Model M = DAG.getTarget().getCodeModel();
13550   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13551   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13552   SDLoc dl(Op);
13553   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13554                                              OpFlags);
13555
13556   if (Subtarget->isPICStyleRIPRel() &&
13557       (M == CodeModel::Small || M == CodeModel::Kernel))
13558     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13559   else
13560     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13561
13562   // With PIC, the address is actually $g + Offset.
13563   if (isGlobalRelativeToPICBase(OpFlags)) {
13564     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13565                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13566                          Result);
13567   }
13568
13569   return Result;
13570 }
13571
13572 SDValue
13573 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13574                                       int64_t Offset, SelectionDAG &DAG) const {
13575   // Create the TargetGlobalAddress node, folding in the constant
13576   // offset if it is legal.
13577   unsigned char OpFlags =
13578       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13579   CodeModel::Model M = DAG.getTarget().getCodeModel();
13580   SDValue Result;
13581   if (OpFlags == X86II::MO_NO_FLAG &&
13582       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13583     // A direct static reference to a global.
13584     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13585     Offset = 0;
13586   } else {
13587     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13588   }
13589
13590   if (Subtarget->isPICStyleRIPRel() &&
13591       (M == CodeModel::Small || M == CodeModel::Kernel))
13592     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13593   else
13594     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13595
13596   // With PIC, the address is actually $g + Offset.
13597   if (isGlobalRelativeToPICBase(OpFlags)) {
13598     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13599                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13600                          Result);
13601   }
13602
13603   // For globals that require a load from a stub to get the address, emit the
13604   // load.
13605   if (isGlobalStubReference(OpFlags))
13606     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13607                          MachinePointerInfo::getGOT(), false, false, false, 0);
13608
13609   // If there was a non-zero offset that we didn't fold, create an explicit
13610   // addition for it.
13611   if (Offset != 0)
13612     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13613                          DAG.getConstant(Offset, getPointerTy()));
13614
13615   return Result;
13616 }
13617
13618 SDValue
13619 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13620   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13621   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13622   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13623 }
13624
13625 static SDValue
13626 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13627            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13628            unsigned char OperandFlags, bool LocalDynamic = false) {
13629   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13630   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13631   SDLoc dl(GA);
13632   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13633                                            GA->getValueType(0),
13634                                            GA->getOffset(),
13635                                            OperandFlags);
13636
13637   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13638                                            : X86ISD::TLSADDR;
13639
13640   if (InFlag) {
13641     SDValue Ops[] = { Chain,  TGA, *InFlag };
13642     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13643   } else {
13644     SDValue Ops[]  = { Chain, TGA };
13645     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13646   }
13647
13648   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13649   MFI->setAdjustsStack(true);
13650   MFI->setHasCalls(true);
13651
13652   SDValue Flag = Chain.getValue(1);
13653   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13654 }
13655
13656 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13657 static SDValue
13658 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13659                                 const EVT PtrVT) {
13660   SDValue InFlag;
13661   SDLoc dl(GA);  // ? function entry point might be better
13662   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13663                                    DAG.getNode(X86ISD::GlobalBaseReg,
13664                                                SDLoc(), PtrVT), InFlag);
13665   InFlag = Chain.getValue(1);
13666
13667   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13668 }
13669
13670 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13671 static SDValue
13672 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13673                                 const EVT PtrVT) {
13674   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13675                     X86::RAX, X86II::MO_TLSGD);
13676 }
13677
13678 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13679                                            SelectionDAG &DAG,
13680                                            const EVT PtrVT,
13681                                            bool is64Bit) {
13682   SDLoc dl(GA);
13683
13684   // Get the start address of the TLS block for this module.
13685   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13686       .getInfo<X86MachineFunctionInfo>();
13687   MFI->incNumLocalDynamicTLSAccesses();
13688
13689   SDValue Base;
13690   if (is64Bit) {
13691     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13692                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13693   } else {
13694     SDValue InFlag;
13695     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13696         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13697     InFlag = Chain.getValue(1);
13698     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13699                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13700   }
13701
13702   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13703   // of Base.
13704
13705   // Build x@dtpoff.
13706   unsigned char OperandFlags = X86II::MO_DTPOFF;
13707   unsigned WrapperKind = X86ISD::Wrapper;
13708   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13709                                            GA->getValueType(0),
13710                                            GA->getOffset(), OperandFlags);
13711   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13712
13713   // Add x@dtpoff with the base.
13714   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13715 }
13716
13717 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13718 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13719                                    const EVT PtrVT, TLSModel::Model model,
13720                                    bool is64Bit, bool isPIC) {
13721   SDLoc dl(GA);
13722
13723   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13724   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13725                                                          is64Bit ? 257 : 256));
13726
13727   SDValue ThreadPointer =
13728       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13729                   MachinePointerInfo(Ptr), false, false, false, 0);
13730
13731   unsigned char OperandFlags = 0;
13732   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13733   // initialexec.
13734   unsigned WrapperKind = X86ISD::Wrapper;
13735   if (model == TLSModel::LocalExec) {
13736     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13737   } else if (model == TLSModel::InitialExec) {
13738     if (is64Bit) {
13739       OperandFlags = X86II::MO_GOTTPOFF;
13740       WrapperKind = X86ISD::WrapperRIP;
13741     } else {
13742       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13743     }
13744   } else {
13745     llvm_unreachable("Unexpected model");
13746   }
13747
13748   // emit "addl x@ntpoff,%eax" (local exec)
13749   // or "addl x@indntpoff,%eax" (initial exec)
13750   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13751   SDValue TGA =
13752       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13753                                  GA->getOffset(), OperandFlags);
13754   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13755
13756   if (model == TLSModel::InitialExec) {
13757     if (isPIC && !is64Bit) {
13758       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13759                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13760                            Offset);
13761     }
13762
13763     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13764                          MachinePointerInfo::getGOT(), false, false, false, 0);
13765   }
13766
13767   // The address of the thread local variable is the add of the thread
13768   // pointer with the offset of the variable.
13769   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13770 }
13771
13772 SDValue
13773 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13774
13775   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13776   const GlobalValue *GV = GA->getGlobal();
13777
13778   if (Subtarget->isTargetELF()) {
13779     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13780
13781     switch (model) {
13782       case TLSModel::GeneralDynamic:
13783         if (Subtarget->is64Bit())
13784           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13785         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13786       case TLSModel::LocalDynamic:
13787         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13788                                            Subtarget->is64Bit());
13789       case TLSModel::InitialExec:
13790       case TLSModel::LocalExec:
13791         return LowerToTLSExecModel(
13792             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13793             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13794     }
13795     llvm_unreachable("Unknown TLS model.");
13796   }
13797
13798   if (Subtarget->isTargetDarwin()) {
13799     // Darwin only has one model of TLS.  Lower to that.
13800     unsigned char OpFlag = 0;
13801     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13802                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13803
13804     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13805     // global base reg.
13806     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13807                  !Subtarget->is64Bit();
13808     if (PIC32)
13809       OpFlag = X86II::MO_TLVP_PIC_BASE;
13810     else
13811       OpFlag = X86II::MO_TLVP;
13812     SDLoc DL(Op);
13813     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13814                                                 GA->getValueType(0),
13815                                                 GA->getOffset(), OpFlag);
13816     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13817
13818     // With PIC32, the address is actually $g + Offset.
13819     if (PIC32)
13820       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13821                            DAG.getNode(X86ISD::GlobalBaseReg,
13822                                        SDLoc(), getPointerTy()),
13823                            Offset);
13824
13825     // Lowering the machine isd will make sure everything is in the right
13826     // location.
13827     SDValue Chain = DAG.getEntryNode();
13828     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13829     SDValue Args[] = { Chain, Offset };
13830     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13831
13832     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13833     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13834     MFI->setAdjustsStack(true);
13835
13836     // And our return value (tls address) is in the standard call return value
13837     // location.
13838     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13839     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13840                               Chain.getValue(1));
13841   }
13842
13843   if (Subtarget->isTargetKnownWindowsMSVC() ||
13844       Subtarget->isTargetWindowsGNU()) {
13845     // Just use the implicit TLS architecture
13846     // Need to generate someting similar to:
13847     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13848     //                                  ; from TEB
13849     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13850     //   mov     rcx, qword [rdx+rcx*8]
13851     //   mov     eax, .tls$:tlsvar
13852     //   [rax+rcx] contains the address
13853     // Windows 64bit: gs:0x58
13854     // Windows 32bit: fs:__tls_array
13855
13856     SDLoc dl(GA);
13857     SDValue Chain = DAG.getEntryNode();
13858
13859     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13860     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13861     // use its literal value of 0x2C.
13862     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13863                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13864                                                              256)
13865                                         : Type::getInt32PtrTy(*DAG.getContext(),
13866                                                               257));
13867
13868     SDValue TlsArray =
13869         Subtarget->is64Bit()
13870             ? DAG.getIntPtrConstant(0x58)
13871             : (Subtarget->isTargetWindowsGNU()
13872                    ? DAG.getIntPtrConstant(0x2C)
13873                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13874
13875     SDValue ThreadPointer =
13876         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13877                     MachinePointerInfo(Ptr), false, false, false, 0);
13878
13879     // Load the _tls_index variable
13880     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13881     if (Subtarget->is64Bit())
13882       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13883                            IDX, MachinePointerInfo(), MVT::i32,
13884                            false, false, false, 0);
13885     else
13886       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13887                         false, false, false, 0);
13888
13889     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13890                                     getPointerTy());
13891     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13892
13893     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13894     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13895                       false, false, false, 0);
13896
13897     // Get the offset of start of .tls section
13898     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13899                                              GA->getValueType(0),
13900                                              GA->getOffset(), X86II::MO_SECREL);
13901     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13902
13903     // The address of the thread local variable is the add of the thread
13904     // pointer with the offset of the variable.
13905     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13906   }
13907
13908   llvm_unreachable("TLS not implemented for this target.");
13909 }
13910
13911 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13912 /// and take a 2 x i32 value to shift plus a shift amount.
13913 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13914   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13915   MVT VT = Op.getSimpleValueType();
13916   unsigned VTBits = VT.getSizeInBits();
13917   SDLoc dl(Op);
13918   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13919   SDValue ShOpLo = Op.getOperand(0);
13920   SDValue ShOpHi = Op.getOperand(1);
13921   SDValue ShAmt  = Op.getOperand(2);
13922   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13923   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13924   // during isel.
13925   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13926                                   DAG.getConstant(VTBits - 1, MVT::i8));
13927   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13928                                      DAG.getConstant(VTBits - 1, MVT::i8))
13929                        : DAG.getConstant(0, VT);
13930
13931   SDValue Tmp2, Tmp3;
13932   if (Op.getOpcode() == ISD::SHL_PARTS) {
13933     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13934     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13935   } else {
13936     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13937     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13938   }
13939
13940   // If the shift amount is larger or equal than the width of a part we can't
13941   // rely on the results of shld/shrd. Insert a test and select the appropriate
13942   // values for large shift amounts.
13943   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13944                                 DAG.getConstant(VTBits, MVT::i8));
13945   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13946                              AndNode, DAG.getConstant(0, MVT::i8));
13947
13948   SDValue Hi, Lo;
13949   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13950   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13951   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13952
13953   if (Op.getOpcode() == ISD::SHL_PARTS) {
13954     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13955     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13956   } else {
13957     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13958     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13959   }
13960
13961   SDValue Ops[2] = { Lo, Hi };
13962   return DAG.getMergeValues(Ops, dl);
13963 }
13964
13965 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13966                                            SelectionDAG &DAG) const {
13967   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13968   SDLoc dl(Op);
13969
13970   if (SrcVT.isVector()) {
13971     if (SrcVT.getVectorElementType() == MVT::i1) {
13972       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13973       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13974                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13975                                      Op.getOperand(0)));
13976     }
13977     return SDValue();
13978   }
13979
13980   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13981          "Unknown SINT_TO_FP to lower!");
13982
13983   // These are really Legal; return the operand so the caller accepts it as
13984   // Legal.
13985   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13986     return Op;
13987   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13988       Subtarget->is64Bit()) {
13989     return Op;
13990   }
13991
13992   unsigned Size = SrcVT.getSizeInBits()/8;
13993   MachineFunction &MF = DAG.getMachineFunction();
13994   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13995   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13996   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13997                                StackSlot,
13998                                MachinePointerInfo::getFixedStack(SSFI),
13999                                false, false, 0);
14000   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14001 }
14002
14003 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14004                                      SDValue StackSlot,
14005                                      SelectionDAG &DAG) const {
14006   // Build the FILD
14007   SDLoc DL(Op);
14008   SDVTList Tys;
14009   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14010   if (useSSE)
14011     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14012   else
14013     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14014
14015   unsigned ByteSize = SrcVT.getSizeInBits()/8;
14016
14017   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14018   MachineMemOperand *MMO;
14019   if (FI) {
14020     int SSFI = FI->getIndex();
14021     MMO =
14022       DAG.getMachineFunction()
14023       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14024                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
14025   } else {
14026     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14027     StackSlot = StackSlot.getOperand(1);
14028   }
14029   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14030   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14031                                            X86ISD::FILD, DL,
14032                                            Tys, Ops, SrcVT, MMO);
14033
14034   if (useSSE) {
14035     Chain = Result.getValue(1);
14036     SDValue InFlag = Result.getValue(2);
14037
14038     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14039     // shouldn't be necessary except that RFP cannot be live across
14040     // multiple blocks. When stackifier is fixed, they can be uncoupled.
14041     MachineFunction &MF = DAG.getMachineFunction();
14042     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
14043     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
14044     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14045     Tys = DAG.getVTList(MVT::Other);
14046     SDValue Ops[] = {
14047       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14048     };
14049     MachineMemOperand *MMO =
14050       DAG.getMachineFunction()
14051       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14052                             MachineMemOperand::MOStore, SSFISize, SSFISize);
14053
14054     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14055                                     Ops, Op.getValueType(), MMO);
14056     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
14057                          MachinePointerInfo::getFixedStack(SSFI),
14058                          false, false, false, 0);
14059   }
14060
14061   return Result;
14062 }
14063
14064 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
14065 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14066                                                SelectionDAG &DAG) const {
14067   // This algorithm is not obvious. Here it is what we're trying to output:
14068   /*
14069      movq       %rax,  %xmm0
14070      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14071      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14072      #ifdef __SSE3__
14073        haddpd   %xmm0, %xmm0
14074      #else
14075        pshufd   $0x4e, %xmm0, %xmm1
14076        addpd    %xmm1, %xmm0
14077      #endif
14078   */
14079
14080   SDLoc dl(Op);
14081   LLVMContext *Context = DAG.getContext();
14082
14083   // Build some magic constants.
14084   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14085   Constant *C0 = ConstantDataVector::get(*Context, CV0);
14086   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
14087
14088   SmallVector<Constant*,2> CV1;
14089   CV1.push_back(
14090     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14091                                       APInt(64, 0x4330000000000000ULL))));
14092   CV1.push_back(
14093     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
14094                                       APInt(64, 0x4530000000000000ULL))));
14095   Constant *C1 = ConstantVector::get(CV1);
14096   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
14097
14098   // Load the 64-bit value into an XMM register.
14099   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14100                             Op.getOperand(0));
14101   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14102                               MachinePointerInfo::getConstantPool(),
14103                               false, false, false, 16);
14104   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
14105                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
14106                               CLod0);
14107
14108   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14109                               MachinePointerInfo::getConstantPool(),
14110                               false, false, false, 16);
14111   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
14112   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14113   SDValue Result;
14114
14115   if (Subtarget->hasSSE3()) {
14116     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14117     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14118   } else {
14119     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
14120     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
14121                                            S2F, 0x4E, DAG);
14122     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14123                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
14124                          Sub);
14125   }
14126
14127   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14128                      DAG.getIntPtrConstant(0));
14129 }
14130
14131 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
14132 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14133                                                SelectionDAG &DAG) const {
14134   SDLoc dl(Op);
14135   // FP constant to bias correct the final result.
14136   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
14137                                    MVT::f64);
14138
14139   // Load the 32-bit value into an XMM register.
14140   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14141                              Op.getOperand(0));
14142
14143   // Zero out the upper parts of the register.
14144   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14145
14146   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14147                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
14148                      DAG.getIntPtrConstant(0));
14149
14150   // Or the load with the bias.
14151   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
14152                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14153                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14154                                                    MVT::v2f64, Load)),
14155                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
14156                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
14157                                                    MVT::v2f64, Bias)));
14158   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14159                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
14160                    DAG.getIntPtrConstant(0));
14161
14162   // Subtract the bias.
14163   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14164
14165   // Handle final rounding.
14166   EVT DestVT = Op.getValueType();
14167
14168   if (DestVT.bitsLT(MVT::f64))
14169     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14170                        DAG.getIntPtrConstant(0));
14171   if (DestVT.bitsGT(MVT::f64))
14172     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14173
14174   // Handle final rounding.
14175   return Sub;
14176 }
14177
14178 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14179                                      const X86Subtarget &Subtarget) {
14180   // The algorithm is the following:
14181   // #ifdef __SSE4_1__
14182   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14183   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14184   //                                 (uint4) 0x53000000, 0xaa);
14185   // #else
14186   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14187   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14188   // #endif
14189   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14190   //     return (float4) lo + fhi;
14191
14192   SDLoc DL(Op);
14193   SDValue V = Op->getOperand(0);
14194   EVT VecIntVT = V.getValueType();
14195   bool Is128 = VecIntVT == MVT::v4i32;
14196   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14197   // If we convert to something else than the supported type, e.g., to v4f64,
14198   // abort early.
14199   if (VecFloatVT != Op->getValueType(0))
14200     return SDValue();
14201
14202   unsigned NumElts = VecIntVT.getVectorNumElements();
14203   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14204          "Unsupported custom type");
14205   assert(NumElts <= 8 && "The size of the constant array must be fixed");
14206
14207   // In the #idef/#else code, we have in common:
14208   // - The vector of constants:
14209   // -- 0x4b000000
14210   // -- 0x53000000
14211   // - A shift:
14212   // -- v >> 16
14213
14214   // Create the splat vector for 0x4b000000.
14215   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
14216   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
14217                            CstLow, CstLow, CstLow, CstLow};
14218   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14219                                   makeArrayRef(&CstLowArray[0], NumElts));
14220   // Create the splat vector for 0x53000000.
14221   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
14222   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
14223                             CstHigh, CstHigh, CstHigh, CstHigh};
14224   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14225                                    makeArrayRef(&CstHighArray[0], NumElts));
14226
14227   // Create the right shift.
14228   SDValue CstShift = DAG.getConstant(16, MVT::i32);
14229   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
14230                              CstShift, CstShift, CstShift, CstShift};
14231   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
14232                                     makeArrayRef(&CstShiftArray[0], NumElts));
14233   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14234
14235   SDValue Low, High;
14236   if (Subtarget.hasSSE41()) {
14237     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14238     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14239     SDValue VecCstLowBitcast =
14240         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
14241     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
14242     // Low will be bitcasted right away, so do not bother bitcasting back to its
14243     // original type.
14244     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14245                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
14246     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14247     //                                 (uint4) 0x53000000, 0xaa);
14248     SDValue VecCstHighBitcast =
14249         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
14250     SDValue VecShiftBitcast =
14251         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
14252     // High will be bitcasted right away, so do not bother bitcasting back to
14253     // its original type.
14254     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14255                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14256   } else {
14257     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14258     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14259                                      CstMask, CstMask, CstMask);
14260     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14261     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14262     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14263
14264     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14265     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14266   }
14267
14268   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14269   SDValue CstFAdd = DAG.getConstantFP(
14270       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14271   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14272                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14273   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14274                                    makeArrayRef(&CstFAddArray[0], NumElts));
14275
14276   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14277   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14278   SDValue FHigh =
14279       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14280   //     return (float4) lo + fhi;
14281   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14282   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14283 }
14284
14285 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14286                                                SelectionDAG &DAG) const {
14287   SDValue N0 = Op.getOperand(0);
14288   MVT SVT = N0.getSimpleValueType();
14289   SDLoc dl(Op);
14290
14291   switch (SVT.SimpleTy) {
14292   default:
14293     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14294   case MVT::v4i8:
14295   case MVT::v4i16:
14296   case MVT::v8i8:
14297   case MVT::v8i16: {
14298     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14299     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14300                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14301   }
14302   case MVT::v4i32:
14303   case MVT::v8i32:
14304     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14305   }
14306   llvm_unreachable(nullptr);
14307 }
14308
14309 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14310                                            SelectionDAG &DAG) const {
14311   SDValue N0 = Op.getOperand(0);
14312   SDLoc dl(Op);
14313
14314   if (Op.getValueType().isVector())
14315     return lowerUINT_TO_FP_vec(Op, DAG);
14316
14317   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14318   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14319   // the optimization here.
14320   if (DAG.SignBitIsZero(N0))
14321     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14322
14323   MVT SrcVT = N0.getSimpleValueType();
14324   MVT DstVT = Op.getSimpleValueType();
14325   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14326     return LowerUINT_TO_FP_i64(Op, DAG);
14327   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14328     return LowerUINT_TO_FP_i32(Op, DAG);
14329   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14330     return SDValue();
14331
14332   // Make a 64-bit buffer, and use it to build an FILD.
14333   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14334   if (SrcVT == MVT::i32) {
14335     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14336     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14337                                      getPointerTy(), StackSlot, WordOff);
14338     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14339                                   StackSlot, MachinePointerInfo(),
14340                                   false, false, 0);
14341     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14342                                   OffsetSlot, MachinePointerInfo(),
14343                                   false, false, 0);
14344     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14345     return Fild;
14346   }
14347
14348   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14349   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14350                                StackSlot, MachinePointerInfo(),
14351                                false, false, 0);
14352   // For i64 source, we need to add the appropriate power of 2 if the input
14353   // was negative.  This is the same as the optimization in
14354   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14355   // we must be careful to do the computation in x87 extended precision, not
14356   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14357   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14358   MachineMemOperand *MMO =
14359     DAG.getMachineFunction()
14360     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14361                           MachineMemOperand::MOLoad, 8, 8);
14362
14363   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14364   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14365   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14366                                          MVT::i64, MMO);
14367
14368   APInt FF(32, 0x5F800000ULL);
14369
14370   // Check whether the sign bit is set.
14371   SDValue SignSet = DAG.getSetCC(dl,
14372                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14373                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14374                                  ISD::SETLT);
14375
14376   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14377   SDValue FudgePtr = DAG.getConstantPool(
14378                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14379                                          getPointerTy());
14380
14381   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14382   SDValue Zero = DAG.getIntPtrConstant(0);
14383   SDValue Four = DAG.getIntPtrConstant(4);
14384   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14385                                Zero, Four);
14386   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14387
14388   // Load the value out, extending it from f32 to f80.
14389   // FIXME: Avoid the extend by constructing the right constant pool?
14390   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14391                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14392                                  MVT::f32, false, false, false, 4);
14393   // Extend everything to 80 bits to force it to be done on x87.
14394   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14395   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14396 }
14397
14398 std::pair<SDValue,SDValue>
14399 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14400                                     bool IsSigned, bool IsReplace) const {
14401   SDLoc DL(Op);
14402
14403   EVT DstTy = Op.getValueType();
14404
14405   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14406     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14407     DstTy = MVT::i64;
14408   }
14409
14410   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14411          DstTy.getSimpleVT() >= MVT::i16 &&
14412          "Unknown FP_TO_INT to lower!");
14413
14414   // These are really Legal.
14415   if (DstTy == MVT::i32 &&
14416       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14417     return std::make_pair(SDValue(), SDValue());
14418   if (Subtarget->is64Bit() &&
14419       DstTy == MVT::i64 &&
14420       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14421     return std::make_pair(SDValue(), SDValue());
14422
14423   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14424   // stack slot, or into the FTOL runtime function.
14425   MachineFunction &MF = DAG.getMachineFunction();
14426   unsigned MemSize = DstTy.getSizeInBits()/8;
14427   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14428   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14429
14430   unsigned Opc;
14431   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14432     Opc = X86ISD::WIN_FTOL;
14433   else
14434     switch (DstTy.getSimpleVT().SimpleTy) {
14435     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14436     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14437     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14438     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14439     }
14440
14441   SDValue Chain = DAG.getEntryNode();
14442   SDValue Value = Op.getOperand(0);
14443   EVT TheVT = Op.getOperand(0).getValueType();
14444   // FIXME This causes a redundant load/store if the SSE-class value is already
14445   // in memory, such as if it is on the callstack.
14446   if (isScalarFPTypeInSSEReg(TheVT)) {
14447     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14448     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14449                          MachinePointerInfo::getFixedStack(SSFI),
14450                          false, false, 0);
14451     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14452     SDValue Ops[] = {
14453       Chain, StackSlot, DAG.getValueType(TheVT)
14454     };
14455
14456     MachineMemOperand *MMO =
14457       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14458                               MachineMemOperand::MOLoad, MemSize, MemSize);
14459     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14460     Chain = Value.getValue(1);
14461     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14462     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14463   }
14464
14465   MachineMemOperand *MMO =
14466     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14467                             MachineMemOperand::MOStore, MemSize, MemSize);
14468
14469   if (Opc != X86ISD::WIN_FTOL) {
14470     // Build the FP_TO_INT*_IN_MEM
14471     SDValue Ops[] = { Chain, Value, StackSlot };
14472     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14473                                            Ops, DstTy, MMO);
14474     return std::make_pair(FIST, StackSlot);
14475   } else {
14476     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14477       DAG.getVTList(MVT::Other, MVT::Glue),
14478       Chain, Value);
14479     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14480       MVT::i32, ftol.getValue(1));
14481     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14482       MVT::i32, eax.getValue(2));
14483     SDValue Ops[] = { eax, edx };
14484     SDValue pair = IsReplace
14485       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14486       : DAG.getMergeValues(Ops, DL);
14487     return std::make_pair(pair, SDValue());
14488   }
14489 }
14490
14491 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14492                               const X86Subtarget *Subtarget) {
14493   MVT VT = Op->getSimpleValueType(0);
14494   SDValue In = Op->getOperand(0);
14495   MVT InVT = In.getSimpleValueType();
14496   SDLoc dl(Op);
14497
14498   // Optimize vectors in AVX mode:
14499   //
14500   //   v8i16 -> v8i32
14501   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14502   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14503   //   Concat upper and lower parts.
14504   //
14505   //   v4i32 -> v4i64
14506   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14507   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14508   //   Concat upper and lower parts.
14509   //
14510
14511   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14512       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14513       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14514     return SDValue();
14515
14516   if (Subtarget->hasInt256())
14517     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14518
14519   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14520   SDValue Undef = DAG.getUNDEF(InVT);
14521   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14522   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14523   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14524
14525   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14526                              VT.getVectorNumElements()/2);
14527
14528   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14529   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14530
14531   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14532 }
14533
14534 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14535                                         SelectionDAG &DAG) {
14536   MVT VT = Op->getSimpleValueType(0);
14537   SDValue In = Op->getOperand(0);
14538   MVT InVT = In.getSimpleValueType();
14539   SDLoc DL(Op);
14540   unsigned int NumElts = VT.getVectorNumElements();
14541   if (NumElts != 8 && NumElts != 16)
14542     return SDValue();
14543
14544   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14545     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14546
14547   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14548   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14549   // Now we have only mask extension
14550   assert(InVT.getVectorElementType() == MVT::i1);
14551   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14552   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14553   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14554   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14555   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14556                            MachinePointerInfo::getConstantPool(),
14557                            false, false, false, Alignment);
14558
14559   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14560   if (VT.is512BitVector())
14561     return Brcst;
14562   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14563 }
14564
14565 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14566                                SelectionDAG &DAG) {
14567   if (Subtarget->hasFp256()) {
14568     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14569     if (Res.getNode())
14570       return Res;
14571   }
14572
14573   return SDValue();
14574 }
14575
14576 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14577                                 SelectionDAG &DAG) {
14578   SDLoc DL(Op);
14579   MVT VT = Op.getSimpleValueType();
14580   SDValue In = Op.getOperand(0);
14581   MVT SVT = In.getSimpleValueType();
14582
14583   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14584     return LowerZERO_EXTEND_AVX512(Op, DAG);
14585
14586   if (Subtarget->hasFp256()) {
14587     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14588     if (Res.getNode())
14589       return Res;
14590   }
14591
14592   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14593          VT.getVectorNumElements() != SVT.getVectorNumElements());
14594   return SDValue();
14595 }
14596
14597 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14598   SDLoc DL(Op);
14599   MVT VT = Op.getSimpleValueType();
14600   SDValue In = Op.getOperand(0);
14601   MVT InVT = In.getSimpleValueType();
14602
14603   if (VT == MVT::i1) {
14604     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14605            "Invalid scalar TRUNCATE operation");
14606     if (InVT.getSizeInBits() >= 32)
14607       return SDValue();
14608     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14609     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14610   }
14611   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14612          "Invalid TRUNCATE operation");
14613
14614   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14615     if (VT.getVectorElementType().getSizeInBits() >=8)
14616       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14617
14618     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14619     unsigned NumElts = InVT.getVectorNumElements();
14620     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14621     if (InVT.getSizeInBits() < 512) {
14622       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14623       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14624       InVT = ExtVT;
14625     }
14626
14627     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14628     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14629     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14630     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14631     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14632                            MachinePointerInfo::getConstantPool(),
14633                            false, false, false, Alignment);
14634     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14635     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14636     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14637   }
14638
14639   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14640     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14641     if (Subtarget->hasInt256()) {
14642       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14643       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14644       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14645                                 ShufMask);
14646       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14647                          DAG.getIntPtrConstant(0));
14648     }
14649
14650     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14651                                DAG.getIntPtrConstant(0));
14652     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14653                                DAG.getIntPtrConstant(2));
14654     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14655     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14656     static const int ShufMask[] = {0, 2, 4, 6};
14657     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14658   }
14659
14660   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14661     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14662     if (Subtarget->hasInt256()) {
14663       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14664
14665       SmallVector<SDValue,32> pshufbMask;
14666       for (unsigned i = 0; i < 2; ++i) {
14667         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14668         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14669         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14670         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14671         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14672         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14673         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14674         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14675         for (unsigned j = 0; j < 8; ++j)
14676           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14677       }
14678       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14679       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14680       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14681
14682       static const int ShufMask[] = {0,  2,  -1,  -1};
14683       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14684                                 &ShufMask[0]);
14685       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14686                        DAG.getIntPtrConstant(0));
14687       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14688     }
14689
14690     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14691                                DAG.getIntPtrConstant(0));
14692
14693     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14694                                DAG.getIntPtrConstant(4));
14695
14696     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14697     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14698
14699     // The PSHUFB mask:
14700     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14701                                    -1, -1, -1, -1, -1, -1, -1, -1};
14702
14703     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14704     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14705     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14706
14707     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14708     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14709
14710     // The MOVLHPS Mask:
14711     static const int ShufMask2[] = {0, 1, 4, 5};
14712     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14713     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14714   }
14715
14716   // Handle truncation of V256 to V128 using shuffles.
14717   if (!VT.is128BitVector() || !InVT.is256BitVector())
14718     return SDValue();
14719
14720   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14721
14722   unsigned NumElems = VT.getVectorNumElements();
14723   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14724
14725   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14726   // Prepare truncation shuffle mask
14727   for (unsigned i = 0; i != NumElems; ++i)
14728     MaskVec[i] = i * 2;
14729   SDValue V = DAG.getVectorShuffle(NVT, DL,
14730                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14731                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14732   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14733                      DAG.getIntPtrConstant(0));
14734 }
14735
14736 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14737                                            SelectionDAG &DAG) const {
14738   assert(!Op.getSimpleValueType().isVector());
14739
14740   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14741     /*IsSigned=*/ true, /*IsReplace=*/ false);
14742   SDValue FIST = Vals.first, StackSlot = Vals.second;
14743   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14744   if (!FIST.getNode()) return Op;
14745
14746   if (StackSlot.getNode())
14747     // Load the result.
14748     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14749                        FIST, StackSlot, MachinePointerInfo(),
14750                        false, false, false, 0);
14751
14752   // The node is the result.
14753   return FIST;
14754 }
14755
14756 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14757                                            SelectionDAG &DAG) const {
14758   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14759     /*IsSigned=*/ false, /*IsReplace=*/ false);
14760   SDValue FIST = Vals.first, StackSlot = Vals.second;
14761   assert(FIST.getNode() && "Unexpected failure");
14762
14763   if (StackSlot.getNode())
14764     // Load the result.
14765     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14766                        FIST, StackSlot, MachinePointerInfo(),
14767                        false, false, false, 0);
14768
14769   // The node is the result.
14770   return FIST;
14771 }
14772
14773 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14774   SDLoc DL(Op);
14775   MVT VT = Op.getSimpleValueType();
14776   SDValue In = Op.getOperand(0);
14777   MVT SVT = In.getSimpleValueType();
14778
14779   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14780
14781   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14782                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14783                                  In, DAG.getUNDEF(SVT)));
14784 }
14785
14786 /// The only differences between FABS and FNEG are the mask and the logic op.
14787 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14788 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14789   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14790          "Wrong opcode for lowering FABS or FNEG.");
14791
14792   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14793
14794   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14795   // into an FNABS. We'll lower the FABS after that if it is still in use.
14796   if (IsFABS)
14797     for (SDNode *User : Op->uses())
14798       if (User->getOpcode() == ISD::FNEG)
14799         return Op;
14800
14801   SDValue Op0 = Op.getOperand(0);
14802   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14803
14804   SDLoc dl(Op);
14805   MVT VT = Op.getSimpleValueType();
14806   // Assume scalar op for initialization; update for vector if needed.
14807   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14808   // generate a 16-byte vector constant and logic op even for the scalar case.
14809   // Using a 16-byte mask allows folding the load of the mask with
14810   // the logic op, so it can save (~4 bytes) on code size.
14811   MVT EltVT = VT;
14812   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14813   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14814   // decide if we should generate a 16-byte constant mask when we only need 4 or
14815   // 8 bytes for the scalar case.
14816   if (VT.isVector()) {
14817     EltVT = VT.getVectorElementType();
14818     NumElts = VT.getVectorNumElements();
14819   }
14820
14821   unsigned EltBits = EltVT.getSizeInBits();
14822   LLVMContext *Context = DAG.getContext();
14823   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14824   APInt MaskElt =
14825     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14826   Constant *C = ConstantInt::get(*Context, MaskElt);
14827   C = ConstantVector::getSplat(NumElts, C);
14828   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14829   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14830   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14831   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14832                              MachinePointerInfo::getConstantPool(),
14833                              false, false, false, Alignment);
14834
14835   if (VT.isVector()) {
14836     // For a vector, cast operands to a vector type, perform the logic op,
14837     // and cast the result back to the original value type.
14838     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14839     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14840     SDValue Operand = IsFNABS ?
14841       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14842       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14843     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14844     return DAG.getNode(ISD::BITCAST, dl, VT,
14845                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14846   }
14847
14848   // If not vector, then scalar.
14849   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14850   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14851   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14852 }
14853
14854 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14855   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14856   LLVMContext *Context = DAG.getContext();
14857   SDValue Op0 = Op.getOperand(0);
14858   SDValue Op1 = Op.getOperand(1);
14859   SDLoc dl(Op);
14860   MVT VT = Op.getSimpleValueType();
14861   MVT SrcVT = Op1.getSimpleValueType();
14862
14863   // If second operand is smaller, extend it first.
14864   if (SrcVT.bitsLT(VT)) {
14865     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14866     SrcVT = VT;
14867   }
14868   // And if it is bigger, shrink it first.
14869   if (SrcVT.bitsGT(VT)) {
14870     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14871     SrcVT = VT;
14872   }
14873
14874   // At this point the operands and the result should have the same
14875   // type, and that won't be f80 since that is not custom lowered.
14876
14877   const fltSemantics &Sem =
14878       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14879   const unsigned SizeInBits = VT.getSizeInBits();
14880
14881   SmallVector<Constant *, 4> CV(
14882       VT == MVT::f64 ? 2 : 4,
14883       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14884
14885   // First, clear all bits but the sign bit from the second operand (sign).
14886   CV[0] = ConstantFP::get(*Context,
14887                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14888   Constant *C = ConstantVector::get(CV);
14889   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14890   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14891                               MachinePointerInfo::getConstantPool(),
14892                               false, false, false, 16);
14893   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14894
14895   // Next, clear the sign bit from the first operand (magnitude).
14896   // If it's a constant, we can clear it here.
14897   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14898     APFloat APF = Op0CN->getValueAPF();
14899     // If the magnitude is a positive zero, the sign bit alone is enough.
14900     if (APF.isPosZero())
14901       return SignBit;
14902     APF.clearSign();
14903     CV[0] = ConstantFP::get(*Context, APF);
14904   } else {
14905     CV[0] = ConstantFP::get(
14906         *Context,
14907         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14908   }
14909   C = ConstantVector::get(CV);
14910   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14911   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14912                             MachinePointerInfo::getConstantPool(),
14913                             false, false, false, 16);
14914   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14915   if (!isa<ConstantFPSDNode>(Op0))
14916     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14917
14918   // OR the magnitude value with the sign bit.
14919   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14920 }
14921
14922 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14923   SDValue N0 = Op.getOperand(0);
14924   SDLoc dl(Op);
14925   MVT VT = Op.getSimpleValueType();
14926
14927   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14928   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14929                                   DAG.getConstant(1, VT));
14930   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14931 }
14932
14933 // Check whether an OR'd tree is PTEST-able.
14934 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14935                                       SelectionDAG &DAG) {
14936   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14937
14938   if (!Subtarget->hasSSE41())
14939     return SDValue();
14940
14941   if (!Op->hasOneUse())
14942     return SDValue();
14943
14944   SDNode *N = Op.getNode();
14945   SDLoc DL(N);
14946
14947   SmallVector<SDValue, 8> Opnds;
14948   DenseMap<SDValue, unsigned> VecInMap;
14949   SmallVector<SDValue, 8> VecIns;
14950   EVT VT = MVT::Other;
14951
14952   // Recognize a special case where a vector is casted into wide integer to
14953   // test all 0s.
14954   Opnds.push_back(N->getOperand(0));
14955   Opnds.push_back(N->getOperand(1));
14956
14957   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14958     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14959     // BFS traverse all OR'd operands.
14960     if (I->getOpcode() == ISD::OR) {
14961       Opnds.push_back(I->getOperand(0));
14962       Opnds.push_back(I->getOperand(1));
14963       // Re-evaluate the number of nodes to be traversed.
14964       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14965       continue;
14966     }
14967
14968     // Quit if a non-EXTRACT_VECTOR_ELT
14969     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14970       return SDValue();
14971
14972     // Quit if without a constant index.
14973     SDValue Idx = I->getOperand(1);
14974     if (!isa<ConstantSDNode>(Idx))
14975       return SDValue();
14976
14977     SDValue ExtractedFromVec = I->getOperand(0);
14978     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14979     if (M == VecInMap.end()) {
14980       VT = ExtractedFromVec.getValueType();
14981       // Quit if not 128/256-bit vector.
14982       if (!VT.is128BitVector() && !VT.is256BitVector())
14983         return SDValue();
14984       // Quit if not the same type.
14985       if (VecInMap.begin() != VecInMap.end() &&
14986           VT != VecInMap.begin()->first.getValueType())
14987         return SDValue();
14988       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14989       VecIns.push_back(ExtractedFromVec);
14990     }
14991     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14992   }
14993
14994   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14995          "Not extracted from 128-/256-bit vector.");
14996
14997   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14998
14999   for (DenseMap<SDValue, unsigned>::const_iterator
15000         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15001     // Quit if not all elements are used.
15002     if (I->second != FullMask)
15003       return SDValue();
15004   }
15005
15006   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15007
15008   // Cast all vectors into TestVT for PTEST.
15009   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15010     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
15011
15012   // If more than one full vectors are evaluated, OR them first before PTEST.
15013   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15014     // Each iteration will OR 2 nodes and append the result until there is only
15015     // 1 node left, i.e. the final OR'd value of all vectors.
15016     SDValue LHS = VecIns[Slot];
15017     SDValue RHS = VecIns[Slot + 1];
15018     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15019   }
15020
15021   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15022                      VecIns.back(), VecIns.back());
15023 }
15024
15025 /// \brief return true if \c Op has a use that doesn't just read flags.
15026 static bool hasNonFlagsUse(SDValue Op) {
15027   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15028        ++UI) {
15029     SDNode *User = *UI;
15030     unsigned UOpNo = UI.getOperandNo();
15031     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15032       // Look pass truncate.
15033       UOpNo = User->use_begin().getOperandNo();
15034       User = *User->use_begin();
15035     }
15036
15037     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15038         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15039       return true;
15040   }
15041   return false;
15042 }
15043
15044 /// Emit nodes that will be selected as "test Op0,Op0", or something
15045 /// equivalent.
      ///
      /// \param Op    the value to be compared against zero.
      /// \param X86CC the X86 condition code that will be evaluated on the result;
      ///              it determines whether the carry and/or overflow flag have to
      ///              be valid.
      /// \param dl    location attached to every node created here.
      /// \param DAG   the selection DAG the nodes are created in.
      /// \returns one of: an X86ISD::CMP of \p Op against zero, the node returned
      ///          by LowerVectorAllZeroTest, or result #1 of a two-result X86ISD
      ///          arithmetic node (value, i32) that replaces \p Op.
15046 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
15047                                     SelectionDAG &DAG) const {
15048   if (Op.getValueType() == MVT::i1)
15049     // KORTEST instruction should be selected
15050     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15051                        DAG.getConstant(0, Op.getValueType()));
15052
15053   // CF and OF aren't always set the way we want. Determine which
15054   // of these we need.
15055   bool NeedCF = false;
15056   bool NeedOF = false;
15057   switch (X86CC) {
15058   default: break;
15059   case X86::COND_A: case X86::COND_AE:
15060   case X86::COND_B: case X86::COND_BE:
15061     NeedCF = true;
15062     break;
15063   case X86::COND_G: case X86::COND_GE:
15064   case X86::COND_L: case X86::COND_LE:
15065   case X86::COND_O: case X86::COND_NO: {
15066     // Check if we really need to set the
15067     // Overflow flag. If NoSignedWrap is present
15068     // that is not actually needed.
15069     switch (Op->getOpcode()) {
15070     case ISD::ADD:
15071     case ISD::SUB:
15072     case ISD::MUL:
15073     case ISD::SHL: {
15074       const BinaryWithFlagsSDNode *BinNode =
15075           cast<BinaryWithFlagsSDNode>(Op.getNode());
15076       if (BinNode->hasNoSignedWrap())
15077         break;
15078     }
            // Without NoSignedWrap this deliberately falls through to the default
            // case, which requests the overflow flag.
15079     default:
15080       NeedOF = true;
15081       break;
15082     }
15083     break;
15084   }
15085   }
15086   // See if we can use the EFLAGS value from the operand instead of
15087   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15088   // we prove that the arithmetic won't overflow, we can't use OF or CF.
15089   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15090     // Emit a CMP with 0, which is the TEST pattern.
15091     //if (Op.getValueType() == MVT::i1)
15092     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
15093     //                     DAG.getConstant(0, MVT::i1));
15094     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15095                        DAG.getConstant(0, Op.getValueType()));
15096   }
        // Opcode is the X86ISD node that will replace the arithmetic node; it
        // stays 0 when no replacement is chosen, in which case a CMP against zero
        // is emitted at the end. NumOperands is how many operands of Op are
        // copied into the replacement node.
15097   unsigned Opcode = 0;
15098   unsigned NumOperands = 0;
15099
15100   // Truncate operations may prevent the merge of the SETCC instruction
15101   // and the arithmetic instruction before it. Attempt to truncate the operands
15102   // of the arithmetic instruction and use a reduced bit-width instruction.
15103   bool NeedTruncation = false;
15104   SDValue ArithOp = Op;
15105   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15106     SDValue Arith = Op->getOperand(0);
15107     // Both the trunc and the arithmetic op need to have one user each.
15108     if (Arith->hasOneUse())
15109       switch (Arith.getOpcode()) {
15110         default: break;
15111         case ISD::ADD:
15112         case ISD::SUB:
15113         case ISD::AND:
15114         case ISD::OR:
15115         case ISD::XOR: {
15116           NeedTruncation = true;
15117           ArithOp = Arith;
15118         }
15119       }
15120   }
15121
15122   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15123   // which may be the result of a CAST.  We use the variable 'Op', which is the
15124   // non-casted variable when we check for possible users.
15125   switch (ArithOp.getOpcode()) {
15126   case ISD::ADD:
15127     // Due to an isel shortcoming, be conservative if this add is likely to be
15128     // selected as part of a load-modify-store instruction. When the root node
15129     // in a match is a store, isel doesn't know how to remap non-chain non-flag
15130     // uses of other nodes in the match, such as the ADD in this case. This
15131     // leads to the ADD being left around and reselected, with the result being
15132     // two adds in the output.  Alas, even if none of our users are stores, that
15133     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
15134     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
15135     // climbing the DAG back to the root, and it doesn't seem to be worth the
15136     // effort.
          // NOTE(review): the loop below also lets STORE users through, which the
          // comment above does not mention; confirm this is intended.
15137     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15138          UE = Op.getNode()->use_end(); UI != UE; ++UI)
15139       if (UI->getOpcode() != ISD::CopyToReg &&
15140           UI->getOpcode() != ISD::SETCC &&
15141           UI->getOpcode() != ISD::STORE)
15142         goto default_case;
15143
15144     if (ConstantSDNode *C =
15145         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
15146       // An add of one will be selected as an INC.
15147       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
15148         Opcode = X86ISD::INC;
15149         NumOperands = 1;
15150         break;
15151       }
15152
15153       // An add of negative one (subtract of one) will be selected as a DEC.
15154       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
15155         Opcode = X86ISD::DEC;
15156         NumOperands = 1;
15157         break;
15158       }
15159     }
15160
15161     // Otherwise use a regular EFLAGS-setting add.
15162     Opcode = X86ISD::ADD;
15163     NumOperands = 2;
15164     break;
15165   case ISD::SHL:
15166   case ISD::SRL:
15167     // If we have a constant logical shift that's only used in a comparison
15168     // against zero turn it into an equivalent AND. This allows turning it into
15169     // a TEST instruction later.
15170     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15171         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15172       EVT VT = Op.getValueType();
15173       unsigned BitWidth = VT.getSizeInBits();
15174       unsigned ShAmt = Op->getConstantOperandVal(1);
15175       if (ShAmt >= BitWidth) // Avoid undefined shifts.
15176         break;
            // The shifted value is zero exactly when the bits that survive the
            // shift are zero: the high (BitWidth - ShAmt) bits for SRL, the low
            // (BitWidth - ShAmt) bits for SHL.
15177       APInt Mask = ArithOp.getOpcode() == ISD::SRL
15178                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15179                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15180       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15181         break;
15182       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15183                                 DAG.getConstant(Mask, VT));
15184       DAG.ReplaceAllUsesWith(Op, New);
15185       Op = New;
15186     }
          // Opcode is still 0 here, so a CMP of (the possibly rewritten) Op against
          // zero is emitted below.
15187     break;
15188
15189   case ISD::AND:
15190     // If the primary AND result isn't used, don't bother using X86ISD::AND,
15191     // because a TEST instruction will be better.
15192     if (!hasNonFlagsUse(Op))
15193       break;
15194     // FALL THROUGH
15195   case ISD::SUB:
15196   case ISD::OR:
15197   case ISD::XOR:
15198     // Due to the ISEL shortcoming noted above, be conservative if this op is
15199     // likely to be selected as part of a load-modify-store instruction.
15200     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15201            UE = Op.getNode()->use_end(); UI != UE; ++UI)
15202       if (UI->getOpcode() == ISD::STORE)
15203         goto default_case;
15204
15205     // Otherwise use a regular EFLAGS-setting instruction.
15206     switch (ArithOp.getOpcode()) {
15207     default: llvm_unreachable("unexpected operator!");
15208     case ISD::SUB: Opcode = X86ISD::SUB; break;
15209     case ISD::XOR: Opcode = X86ISD::XOR; break;
15210     case ISD::AND: Opcode = X86ISD::AND; break;
15211     case ISD::OR: {
            // For an equality test of an untruncated OR, first try to match the
            // whole OR tree with LowerVectorAllZeroTest.
15212       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15213         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
15214         if (EFLAGS.getNode())
15215           return EFLAGS;
15216       }
15217       Opcode = X86ISD::OR;
15218       break;
15219     }
15220     }
15221
15222     NumOperands = 2;
15223     break;
15224   case X86ISD::ADD:
15225   case X86ISD::SUB:
15226   case X86ISD::INC:
15227   case X86ISD::DEC:
15228   case X86ISD::OR:
15229   case X86ISD::XOR:
15230   case X86ISD::AND:
          // Op is already one of the X86ISD arithmetic nodes: hand back its
          // result #1 (presumably the EFLAGS result, matching the (value, i32)
          // node built at the end of this function).
15231     return SDValue(Op.getNode(), 1);
15232   default:
15233   default_case:
15234     break;
15235   }
15236
15237   // If we found that truncation is beneficial, perform the truncation and
15238   // update 'Op'.
15239   if (NeedTruncation) {
15240     EVT VT = Op.getValueType();
15241     SDValue WideVal = Op->getOperand(0);
15242     EVT WideVT = WideVal.getValueType();
15243     unsigned ConvertedOp = 0;
15244     // Use a target machine opcode to prevent further DAGCombine
15245     // optimizations that may separate the arithmetic operations
15246     // from the setcc node.
15247     switch (WideVal.getOpcode()) {
15248       default: break;
15249       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15250       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15251       case ISD::AND: ConvertedOp = X86ISD::AND; break;
15252       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
15253       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15254     }
15255
15256     if (ConvertedOp) {
15257       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15258       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
              // Truncate both operands of the wide operation and redo the
              // operation at the narrow type.
15259         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15260         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15261         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15262       }
15263     }
15264   }
15265
15266   if (Opcode == 0)
15267     // Emit a CMP with 0, which is the TEST pattern.
15268     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15269                        DAG.getConstant(0, Op.getValueType()));
15270
        // Rebuild Op as a two-result node (original value type plus i32) using the
        // first NumOperands operands of Op, redirect all users of Op to the new
        // node, and return the new node's result #1.
15271   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15272   SmallVector<SDValue, 4> Ops;
15273   for (unsigned i = 0; i != NumOperands; ++i)
15274     Ops.push_back(Op.getOperand(i));
15275
15276   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15277   DAG.ReplaceAllUsesWith(Op, New);
15278   return SDValue(New.getNode(), 1);
15279 }
15280
15281 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15282 /// equivalent.
15283 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15284                                    SDLoc dl, SelectionDAG &DAG) const {
15285   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15286     if (C->getAPIntValue() == 0)
15287       return EmitTest(Op0, X86CC, dl, DAG);
15288
15289      if (Op0.getValueType() == MVT::i1)
15290        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15291   }
15292
15293   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15294        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15295     // Do the comparison at i32 if it's smaller, besides the Atom case.
15296     // This avoids subregister aliasing issues. Keep the smaller reference
15297     // if we're optimizing for size, however, as that'll allow better folding
15298     // of memory operations.
15299     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15300         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15301              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15302         !Subtarget->isAtom()) {
15303       unsigned ExtendOp =
15304           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15305       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15306       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15307     }
15308     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15309     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15310     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15311                               Op0, Op1);
15312     return SDValue(Sub.getNode(), 1);
15313   }
15314   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15315 }
15316
15317 /// Convert a comparison if required by the subtarget.
15318 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15319                                                  SelectionDAG &DAG) const {
15320   // If the subtarget does not support the FUCOMI instruction, floating-point
15321   // comparisons have to be converted.
15322   if (Subtarget->hasCMov() ||
15323       Cmp.getOpcode() != X86ISD::CMP ||
15324       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15325       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15326     return Cmp;
15327
15328   // The instruction selector will select an FUCOM instruction instead of
15329   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15330   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15331   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15332   SDLoc dl(Cmp);
15333   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15334   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15335   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15336                             DAG.getConstant(8, MVT::i8));
15337   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15338   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15339 }
15340
15341 /// The minimum architected relative accuracy is 2^-12. We need one
15342 /// Newton-Raphson step to have a good float result (24 bits of precision).
15343 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15344                                             DAGCombinerInfo &DCI,
15345                                             unsigned &RefinementSteps,
15346                                             bool &UseOneConstNR) const {
15347   // FIXME: We should use instruction latency models to calculate the cost of
15348   // each potential sequence, but this is very hard to do reliably because
15349   // at least Intel's Core* chips have variable timing based on the number of
15350   // significant digits in the divisor and/or sqrt operand.
15351   if (!Subtarget->useSqrtEst())
15352     return SDValue();
15353
15354   EVT VT = Op.getValueType();
15355
15356   // SSE1 has rsqrtss and rsqrtps.
15357   // TODO: Add support for AVX512 (v16f32).
15358   // It is likely not profitable to do this for f64 because a double-precision
15359   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15360   // instructions: convert to single, rsqrtss, convert back to double, refine
15361   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15362   // along with FMA, this could be a throughput win.
15363   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15364       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15365     RefinementSteps = 1;
15366     UseOneConstNR = false;
15367     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15368   }
15369   return SDValue();
15370 }
15371
15372 /// The minimum architected relative accuracy is 2^-12. We need one
15373 /// Newton-Raphson step to have a good float result (24 bits of precision).
15374 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15375                                             DAGCombinerInfo &DCI,
15376                                             unsigned &RefinementSteps) const {
15377   // FIXME: We should use instruction latency models to calculate the cost of
15378   // each potential sequence, but this is very hard to do reliably because
15379   // at least Intel's Core* chips have variable timing based on the number of
15380   // significant digits in the divisor.
15381   if (!Subtarget->useReciprocalEst())
15382     return SDValue();
15383
15384   EVT VT = Op.getValueType();
15385
15386   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15387   // TODO: Add support for AVX512 (v16f32).
15388   // It is likely not profitable to do this for f64 because a double-precision
15389   // reciprocal estimate with refinement on x86 prior to FMA requires
15390   // 15 instructions: convert to single, rcpss, convert back to double, refine
15391   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15392   // along with FMA, this could be a throughput win.
15393   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15394       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15395     RefinementSteps = ReciprocalEstimateRefinementSteps;
15396     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15397   }
15398   return SDValue();
15399 }
15400
15401 static bool isAllOnes(SDValue V) {
15402   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15403   return C && C->isAllOnesValue();
15404 }
15405
15406 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15407 /// if it's possible.
15408 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15409                                      SDLoc dl, SelectionDAG &DAG) const {
15410   SDValue Op0 = And.getOperand(0);
15411   SDValue Op1 = And.getOperand(1);
15412   if (Op0.getOpcode() == ISD::TRUNCATE)
15413     Op0 = Op0.getOperand(0);
15414   if (Op1.getOpcode() == ISD::TRUNCATE)
15415     Op1 = Op1.getOperand(0);
15416
15417   SDValue LHS, RHS;
15418   if (Op1.getOpcode() == ISD::SHL)
15419     std::swap(Op0, Op1);
15420   if (Op0.getOpcode() == ISD::SHL) {
15421     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15422       if (And00C->getZExtValue() == 1) {
15423         // If we looked past a truncate, check that it's only truncating away
15424         // known zeros.
15425         unsigned BitWidth = Op0.getValueSizeInBits();
15426         unsigned AndBitWidth = And.getValueSizeInBits();
15427         if (BitWidth > AndBitWidth) {
15428           APInt Zeros, Ones;
15429           DAG.computeKnownBits(Op0, Zeros, Ones);
15430           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15431             return SDValue();
15432         }
15433         LHS = Op1;
15434         RHS = Op0.getOperand(1);
15435       }
15436   } else if (Op1.getOpcode() == ISD::Constant) {
15437     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15438     uint64_t AndRHSVal = AndRHS->getZExtValue();
15439     SDValue AndLHS = Op0;
15440
15441     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15442       LHS = AndLHS.getOperand(0);
15443       RHS = AndLHS.getOperand(1);
15444     }
15445
15446     // Use BT if the immediate can't be encoded in a TEST instruction.
15447     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15448       LHS = AndLHS;
15449       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15450     }
15451   }
15452
15453   if (LHS.getNode()) {
15454     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15455     // instruction.  Since the shift amount is in-range-or-undefined, we know
15456     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15457     // the encoding for the i16 version is larger than the i32 version.
15458     // Also promote i16 to i32 for performance / code size reason.
15459     if (LHS.getValueType() == MVT::i8 ||
15460         LHS.getValueType() == MVT::i16)
15461       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15462
15463     // If the operand types disagree, extend the shift amount to match.  Since
15464     // BT ignores high bits (like shifts) we can use anyextend.
15465     if (LHS.getValueType() != RHS.getValueType())
15466       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15467
15468     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15469     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15470     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15471                        DAG.getConstant(Cond, MVT::i8), BT);
15472   }
15473
15474   return SDValue();
15475 }
15476
15477 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15478 /// mask CMPs.
15479 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15480                               SDValue &Op1) {
15481   unsigned SSECC;
15482   bool Swap = false;
15483
15484   // SSE Condition code mapping:
15485   //  0 - EQ
15486   //  1 - LT
15487   //  2 - LE
15488   //  3 - UNORD
15489   //  4 - NEQ
15490   //  5 - NLT
15491   //  6 - NLE
15492   //  7 - ORD
15493   switch (SetCCOpcode) {
15494   default: llvm_unreachable("Unexpected SETCC condition");
15495   case ISD::SETOEQ:
15496   case ISD::SETEQ:  SSECC = 0; break;
15497   case ISD::SETOGT:
15498   case ISD::SETGT:  Swap = true; // Fallthrough
15499   case ISD::SETLT:
15500   case ISD::SETOLT: SSECC = 1; break;
15501   case ISD::SETOGE:
15502   case ISD::SETGE:  Swap = true; // Fallthrough
15503   case ISD::SETLE:
15504   case ISD::SETOLE: SSECC = 2; break;
15505   case ISD::SETUO:  SSECC = 3; break;
15506   case ISD::SETUNE:
15507   case ISD::SETNE:  SSECC = 4; break;
15508   case ISD::SETULE: Swap = true; // Fallthrough
15509   case ISD::SETUGE: SSECC = 5; break;
15510   case ISD::SETULT: Swap = true; // Fallthrough
15511   case ISD::SETUGT: SSECC = 6; break;
15512   case ISD::SETO:   SSECC = 7; break;
15513   case ISD::SETUEQ:
15514   case ISD::SETONE: SSECC = 8; break;
15515   }
15516   if (Swap)
15517     std::swap(Op0, Op1);
15518
15519   return SSECC;
15520 }
15521
15522 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15523 // ones, and then concatenate the result back.
15524 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15525   MVT VT = Op.getSimpleValueType();
15526
15527   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15528          "Unsupported value type for operation");
15529
15530   unsigned NumElems = VT.getVectorNumElements();
15531   SDLoc dl(Op);
15532   SDValue CC = Op.getOperand(2);
15533
15534   // Extract the LHS vectors
15535   SDValue LHS = Op.getOperand(0);
15536   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15537   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15538
15539   // Extract the RHS vectors
15540   SDValue RHS = Op.getOperand(1);
15541   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15542   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15543
15544   // Issue the operation on the smaller types and concatenate the result back
15545   MVT EltVT = VT.getVectorElementType();
15546   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15547   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15548                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15549                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15550 }
15551
15552 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15553                                      const X86Subtarget *Subtarget) {
15554   SDValue Op0 = Op.getOperand(0);
15555   SDValue Op1 = Op.getOperand(1);
15556   SDValue CC = Op.getOperand(2);
15557   MVT VT = Op.getSimpleValueType();
15558   SDLoc dl(Op);
15559
15560   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15561          Op.getValueType().getScalarType() == MVT::i1 &&
15562          "Cannot set masked compare for this operation");
15563
15564   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15565   unsigned  Opc = 0;
15566   bool Unsigned = false;
15567   bool Swap = false;
15568   unsigned SSECC;
15569   switch (SetCCOpcode) {
15570   default: llvm_unreachable("Unexpected SETCC condition");
15571   case ISD::SETNE:  SSECC = 4; break;
15572   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15573   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15574   case ISD::SETLT:  Swap = true; //fall-through
15575   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15576   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15577   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15578   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15579   case ISD::SETULE: Unsigned = true; //fall-through
15580   case ISD::SETLE:  SSECC = 2; break;
15581   }
15582
15583   if (Swap)
15584     std::swap(Op0, Op1);
15585   if (Opc)
15586     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15587   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15588   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15589                      DAG.getConstant(SSECC, MVT::i8));
15590 }
15591
15592 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15593 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15594 /// return an empty value.
15595 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15596 {
15597   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15598   if (!BV)
15599     return SDValue();
15600
15601   MVT VT = Op1.getSimpleValueType();
15602   MVT EVT = VT.getVectorElementType();
15603   unsigned n = VT.getVectorNumElements();
15604   SmallVector<SDValue, 8> ULTOp1;
15605
15606   for (unsigned i = 0; i < n; ++i) {
15607     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15608     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15609       return SDValue();
15610
15611     // Avoid underflow.
15612     APInt Val = Elt->getAPIntValue();
15613     if (Val == 0)
15614       return SDValue();
15615
15616     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15617   }
15618
15619   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15620 }
15621
/// Lower a vector ISD::SETCC node \p Op (operands: LHS, RHS, condition code).
///
/// Floating-point compares become X86ISD::CMPP, or X86ISD::CMPM when AVX-512
/// is available and the result has i1 elements.  Integer compares are either
/// split into 128-bit halves, handed to LowerIntVSETCC_AVX512, or built from
/// PCMPEQ / PCMPGT / UMIN / UMAX / SUBUS, combined with an optional operand
/// swap, sign-bit flip and final inversion.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    // Op0/Op1 are passed by reference and may come back exchanged.
    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        // SETUEQ: UNORD (3) or EQ (0).
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        // SETONE: ORD (7) and NEQ (4).
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // MaskResult: the compare produces a vector of i1 elements.
  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);

    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    // We are not talking about 512-bit operands in this case, these
    // types are illegal.
    // Instead, compare in the operand type and truncate that result to VT.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  //   Swap      - exchange Op0 and Op1 before emitting the compare.
  //   Invert    - apply a logical-not to the compare result.
  //   FlipSigns - XOR the sign bit into both operands (unsigned compares).
  //   MinMax    - compare Op0 against UMIN/UMAX(Op0, Op1) for equality.
  //   Subus     - compare SUBUS(Op0, Op1) against zero for equality.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true; // Fallthrough
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true; // Fallthrough
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  // (a u<= b  <=>  a == umin(a, b);  a u>= b  <=>  a == umax(a, b)).
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
    || (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    // The min/max form replaces the swap/invert/flip recipe chosen above.
    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule.  With psubus, setule does not require a swap.  This is
      // beneficial because the constant in the register is no longer
      // destructed as the destination so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget->hasAVX())
        break;
      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
      if (ULEOp1.getNode()) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      // Without FlipSigns only elements 0 and 2 (the ones MaskLo selects
      // below) get their sign bit flipped; with FlipSigns all four do.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      // The shuffle exchanges the two 32-bit elements of each 64-bit pair, so
      // the AND leaves all-ones only where both elements compared equal.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  // Emit the primary node: PCMPEQ, PCMPGT, UMIN, UMAX or SUBUS.
  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  // Min/max form: the condition holds where Op0 equals the min/max.
  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  // Saturating-subtract form: the condition holds where the difference is 0.
  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
15859
15860 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15861
15862   MVT VT = Op.getSimpleValueType();
15863
15864   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15865
15866   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15867          && "SetCC type must be 8-bit or 1-bit integer");
15868   SDValue Op0 = Op.getOperand(0);
15869   SDValue Op1 = Op.getOperand(1);
15870   SDLoc dl(Op);
15871   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15872
15873   // Optimize to BT if possible.
15874   // Lower (X & (1 << N)) == 0 to BT(X, N).
15875   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15876   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15877   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15878       Op1.getOpcode() == ISD::Constant &&
15879       cast<ConstantSDNode>(Op1)->isNullValue() &&
15880       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15881     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15882     if (NewSetCC.getNode()) {
15883       if (VT == MVT::i1)
15884         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15885       return NewSetCC;
15886     }
15887   }
15888
15889   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15890   // these.
15891   if (Op1.getOpcode() == ISD::Constant &&
15892       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15893        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15894       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15895
15896     // If the input is a setcc, then reuse the input setcc or use a new one with
15897     // the inverted condition.
15898     if (Op0.getOpcode() == X86ISD::SETCC) {
15899       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15900       bool Invert = (CC == ISD::SETNE) ^
15901         cast<ConstantSDNode>(Op1)->isNullValue();
15902       if (!Invert)
15903         return Op0;
15904
15905       CCode = X86::GetOppositeBranchCondition(CCode);
15906       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15907                                   DAG.getConstant(CCode, MVT::i8),
15908                                   Op0.getOperand(1));
15909       if (VT == MVT::i1)
15910         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15911       return SetCC;
15912     }
15913   }
15914   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15915       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15916       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15917
15918     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15919     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15920   }
15921
15922   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15923   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15924   if (X86CC == X86::COND_INVALID)
15925     return SDValue();
15926
15927   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15928   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15929   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15930                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15931   if (VT == MVT::i1)
15932     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15933   return SetCC;
15934 }
15935
15936 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15937 static bool isX86LogicalCmp(SDValue Op) {
15938   unsigned Opc = Op.getNode()->getOpcode();
15939   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15940       Opc == X86ISD::SAHF)
15941     return true;
15942   if (Op.getResNo() == 1 &&
15943       (Opc == X86ISD::ADD ||
15944        Opc == X86ISD::SUB ||
15945        Opc == X86ISD::ADC ||
15946        Opc == X86ISD::SBB ||
15947        Opc == X86ISD::SMUL ||
15948        Opc == X86ISD::UMUL ||
15949        Opc == X86ISD::INC ||
15950        Opc == X86ISD::DEC ||
15951        Opc == X86ISD::OR ||
15952        Opc == X86ISD::XOR ||
15953        Opc == X86ISD::AND))
15954     return true;
15955
15956   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15957     return true;
15958
15959   return false;
15960 }
15961
15962 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15963   if (V.getOpcode() != ISD::TRUNCATE)
15964     return false;
15965
15966   SDValue VOp0 = V.getOperand(0);
15967   unsigned InBits = VOp0.getValueSizeInBits();
15968   unsigned Bits = V.getValueSizeInBits();
15969   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15970 }
15971
15972 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15973   bool addTest = true;
15974   SDValue Cond  = Op.getOperand(0);
15975   SDValue Op1 = Op.getOperand(1);
15976   SDValue Op2 = Op.getOperand(2);
15977   SDLoc DL(Op);
15978   EVT VT = Op1.getValueType();
15979   SDValue CC;
15980
15981   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15982   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15983   // sequence later on.
15984   if (Cond.getOpcode() == ISD::SETCC &&
15985       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15986        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15987       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15988     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15989     int SSECC = translateX86FSETCC(
15990         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15991
15992     if (SSECC != 8) {
15993       if (Subtarget->hasAVX512()) {
15994         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15995                                   DAG.getConstant(SSECC, MVT::i8));
15996         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15997       }
15998       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15999                                 DAG.getConstant(SSECC, MVT::i8));
16000       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16001       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16002       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16003     }
16004   }
16005
16006   if (Cond.getOpcode() == ISD::SETCC) {
16007     SDValue NewCond = LowerSETCC(Cond, DAG);
16008     if (NewCond.getNode())
16009       Cond = NewCond;
16010   }
16011
16012   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16013   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16014   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16015   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16016   if (Cond.getOpcode() == X86ISD::SETCC &&
16017       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16018       isZero(Cond.getOperand(1).getOperand(1))) {
16019     SDValue Cmp = Cond.getOperand(1);
16020
16021     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
16022
16023     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
16024         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
16025       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
16026
16027       SDValue CmpOp0 = Cmp.getOperand(0);
16028       // Apply further optimizations for special cases
16029       // (select (x != 0), -1, 0) -> neg & sbb
16030       // (select (x == 0), 0, -1) -> neg & sbb
16031       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
16032         if (YC->isNullValue() &&
16033             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
16034           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
16035           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
16036                                     DAG.getConstant(0, CmpOp0.getValueType()),
16037                                     CmpOp0);
16038           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16039                                     DAG.getConstant(X86::COND_B, MVT::i8),
16040                                     SDValue(Neg.getNode(), 1));
16041           return Res;
16042         }
16043
16044       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
16045                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
16046       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16047
16048       SDValue Res =   // Res = 0 or -1.
16049         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16050                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
16051
16052       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
16053         Res = DAG.getNOT(DL, Res, Res.getValueType());
16054
16055       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
16056       if (!N2C || !N2C->isNullValue())
16057         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
16058       return Res;
16059     }
16060   }
16061
16062   // Look past (and (setcc_carry (cmp ...)), 1).
16063   if (Cond.getOpcode() == ISD::AND &&
16064       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16065     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16066     if (C && C->getAPIntValue() == 1)
16067       Cond = Cond.getOperand(0);
16068   }
16069
16070   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16071   // setting operand in place of the X86ISD::SETCC.
16072   unsigned CondOpcode = Cond.getOpcode();
16073   if (CondOpcode == X86ISD::SETCC ||
16074       CondOpcode == X86ISD::SETCC_CARRY) {
16075     CC = Cond.getOperand(0);
16076
16077     SDValue Cmp = Cond.getOperand(1);
16078     unsigned Opc = Cmp.getOpcode();
16079     MVT VT = Op.getSimpleValueType();
16080
16081     bool IllegalFPCMov = false;
16082     if (VT.isFloatingPoint() && !VT.isVector() &&
16083         !isScalarFPTypeInSSEReg(VT))  // FPStack?
16084       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
16085
16086     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
16087         Opc == X86ISD::BT) { // FIXME
16088       Cond = Cmp;
16089       addTest = false;
16090     }
16091   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16092              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16093              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16094               Cond.getOperand(0).getValueType() != MVT::i8)) {
16095     SDValue LHS = Cond.getOperand(0);
16096     SDValue RHS = Cond.getOperand(1);
16097     unsigned X86Opcode;
16098     unsigned X86Cond;
16099     SDVTList VTs;
16100     switch (CondOpcode) {
16101     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16102     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16103     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16104     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16105     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16106     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16107     default: llvm_unreachable("unexpected overflowing operator");
16108     }
16109     if (CondOpcode == ISD::UMULO)
16110       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16111                           MVT::i32);
16112     else
16113       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16114
16115     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
16116
16117     if (CondOpcode == ISD::UMULO)
16118       Cond = X86Op.getValue(2);
16119     else
16120       Cond = X86Op.getValue(1);
16121
16122     CC = DAG.getConstant(X86Cond, MVT::i8);
16123     addTest = false;
16124   }
16125
16126   if (addTest) {
16127     // Look pass the truncate if the high bits are known zero.
16128     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16129         Cond = Cond.getOperand(0);
16130
16131     // We know the result of AND is compared against zero. Try to match
16132     // it to BT.
16133     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16134       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
16135       if (NewSetCC.getNode()) {
16136         CC = NewSetCC.getOperand(0);
16137         Cond = NewSetCC.getOperand(1);
16138         addTest = false;
16139       }
16140     }
16141   }
16142
16143   if (addTest) {
16144     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16145     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
16146   }
16147
16148   // a <  b ? -1 :  0 -> RES = ~setcc_carry
16149   // a <  b ?  0 : -1 -> RES = setcc_carry
16150   // a >= b ? -1 :  0 -> RES = setcc_carry
16151   // a >= b ?  0 : -1 -> RES = ~setcc_carry
16152   if (Cond.getOpcode() == X86ISD::SUB) {
16153     Cond = ConvertCmpIfNecessary(Cond, DAG);
16154     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
16155
16156     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
16157         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
16158       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
16159                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
16160       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
16161         return DAG.getNOT(DL, Res, Res.getValueType());
16162       return Res;
16163     }
16164   }
16165
16166   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
16167   // widen the cmov and push the truncate through. This avoids introducing a new
16168   // branch during isel and doesn't add any extensions.
16169   if (Op.getValueType() == MVT::i8 &&
16170       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
16171     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
16172     if (T1.getValueType() == T2.getValueType() &&
16173         // Blacklist CopyFromReg to avoid partial register stalls.
16174         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
16175       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
16176       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
16177       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
16178     }
16179   }
16180
16181   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
16182   // condition is true.
16183   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
16184   SDValue Ops[] = { Op2, Op1, CC, Cond };
16185   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
16186 }
16187
16188 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
16189                                        SelectionDAG &DAG) {
16190   MVT VT = Op->getSimpleValueType(0);
16191   SDValue In = Op->getOperand(0);
16192   MVT InVT = In.getSimpleValueType();
16193   MVT VTElt = VT.getVectorElementType();
16194   MVT InVTElt = InVT.getVectorElementType();
16195   SDLoc dl(Op);
16196
16197   // SKX processor
16198   if ((InVTElt == MVT::i1) &&
16199       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
16200         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16201
16202        ((Subtarget->hasBWI() && VT.is512BitVector() &&
16203         VTElt.getSizeInBits() <= 16)) ||
16204
16205        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
16206         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16207
16208        ((Subtarget->hasDQI() && VT.is512BitVector() &&
16209         VTElt.getSizeInBits() >= 32))))
16210     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16211
16212   unsigned int NumElts = VT.getVectorNumElements();
16213
16214   if (NumElts != 8 && NumElts != 16)
16215     return SDValue();
16216
16217   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16218     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16219       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16220     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16221   }
16222
16223   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16224   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16225
16226   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
16227   Constant *C = ConstantInt::get(*DAG.getContext(),
16228     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
16229
16230   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
16231   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
16232   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
16233                           MachinePointerInfo::getConstantPool(),
16234                           false, false, false, Alignment);
16235   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
16236   if (VT.is512BitVector())
16237     return Brcst;
16238   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
16239 }
16240
16241 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
16242                                 SelectionDAG &DAG) {
16243   MVT VT = Op->getSimpleValueType(0);
16244   SDValue In = Op->getOperand(0);
16245   MVT InVT = In.getSimpleValueType();
16246   SDLoc dl(Op);
16247
16248   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16249     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16250
16251   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16252       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16253       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16254     return SDValue();
16255
16256   if (Subtarget->hasInt256())
16257     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16258
16259   // Optimize vectors in AVX mode
16260   // Sign extend  v8i16 to v8i32 and
16261   //              v4i32 to v4i64
16262   //
16263   // Divide input vector into two parts
16264   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16265   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16266   // concat the vectors to original VT
16267
16268   unsigned NumElems = InVT.getVectorNumElements();
16269   SDValue Undef = DAG.getUNDEF(InVT);
16270
16271   SmallVector<int,8> ShufMask1(NumElems, -1);
16272   for (unsigned i = 0; i != NumElems/2; ++i)
16273     ShufMask1[i] = i;
16274
16275   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16276
16277   SmallVector<int,8> ShufMask2(NumElems, -1);
16278   for (unsigned i = 0; i != NumElems/2; ++i)
16279     ShufMask2[i] = i + NumElems/2;
16280
16281   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16282
16283   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16284                                 VT.getVectorNumElements()/2);
16285
16286   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16287   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16288
16289   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16290 }
16291
16292 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16293 // may emit an illegal shuffle but the expansion is still better than scalar
16294 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16295 // we'll emit a shuffle and a arithmetic shift.
16296 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16297 // TODO: It is possible to support ZExt by zeroing the undef values during
16298 // the shuffle phase or after the shuffle.
16299 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16300                                  SelectionDAG &DAG) {
16301   MVT RegVT = Op.getSimpleValueType();
16302   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16303   assert(RegVT.isInteger() &&
16304          "We only custom lower integer vector sext loads.");
16305
16306   // Nothing useful we can do without SSE2 shuffles.
16307   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16308
16309   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16310   SDLoc dl(Ld);
16311   EVT MemVT = Ld->getMemoryVT();
16312   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16313   unsigned RegSz = RegVT.getSizeInBits();
16314
16315   ISD::LoadExtType Ext = Ld->getExtensionType();
16316
16317   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16318          && "Only anyext and sext are currently implemented.");
16319   assert(MemVT != RegVT && "Cannot extend to the same type");
16320   assert(MemVT.isVector() && "Must load a vector from memory");
16321
16322   unsigned NumElems = RegVT.getVectorNumElements();
16323   unsigned MemSz = MemVT.getSizeInBits();
16324   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16325
16326   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16327     // The only way in which we have a legal 256-bit vector result but not the
16328     // integer 256-bit operations needed to directly lower a sextload is if we
16329     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16330     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16331     // correctly legalized. We do this late to allow the canonical form of
16332     // sextload to persist throughout the rest of the DAG combiner -- it wants
16333     // to fold together any extensions it can, and so will fuse a sign_extend
16334     // of an sextload into a sextload targeting a wider value.
16335     SDValue Load;
16336     if (MemSz == 128) {
16337       // Just switch this to a normal load.
16338       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16339                                        "it must be a legal 128-bit vector "
16340                                        "type!");
16341       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16342                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16343                   Ld->isInvariant(), Ld->getAlignment());
16344     } else {
16345       assert(MemSz < 128 &&
16346              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16347       // Do an sext load to a 128-bit vector type. We want to use the same
16348       // number of elements, but elements half as wide. This will end up being
16349       // recursively lowered by this routine, but will succeed as we definitely
16350       // have all the necessary features if we're using AVX1.
16351       EVT HalfEltVT =
16352           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16353       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16354       Load =
16355           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16356                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16357                          Ld->isNonTemporal(), Ld->isInvariant(),
16358                          Ld->getAlignment());
16359     }
16360
16361     // Replace chain users with the new chain.
16362     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16363     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16364
16365     // Finally, do a normal sign-extend to the desired register.
16366     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16367   }
16368
16369   // All sizes must be a power of two.
16370   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16371          "Non-power-of-two elements are not custom lowered!");
16372
16373   // Attempt to load the original value using scalar loads.
16374   // Find the largest scalar type that divides the total loaded size.
16375   MVT SclrLoadTy = MVT::i8;
16376   for (MVT Tp : MVT::integer_valuetypes()) {
16377     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16378       SclrLoadTy = Tp;
16379     }
16380   }
16381
16382   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16383   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16384       (64 <= MemSz))
16385     SclrLoadTy = MVT::f64;
16386
16387   // Calculate the number of scalar loads that we need to perform
16388   // in order to load our vector from memory.
16389   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16390
16391   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16392          "Can only lower sext loads with a single scalar load!");
16393
16394   unsigned loadRegZize = RegSz;
16395   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16396     loadRegZize /= 2;
16397
16398   // Represent our vector as a sequence of elements which are the
16399   // largest scalar that we can load.
16400   EVT LoadUnitVecVT = EVT::getVectorVT(
16401       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16402
16403   // Represent the data using the same element type that is stored in
16404   // memory. In practice, we ''widen'' MemVT.
16405   EVT WideVecVT =
16406       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16407                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16408
16409   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16410          "Invalid vector type");
16411
16412   // We can't shuffle using an illegal type.
16413   assert(TLI.isTypeLegal(WideVecVT) &&
16414          "We only lower types that form legal widened vector types");
16415
16416   SmallVector<SDValue, 8> Chains;
16417   SDValue Ptr = Ld->getBasePtr();
16418   SDValue Increment =
16419       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16420   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16421
16422   for (unsigned i = 0; i < NumLoads; ++i) {
16423     // Perform a single load.
16424     SDValue ScalarLoad =
16425         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16426                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16427                     Ld->getAlignment());
16428     Chains.push_back(ScalarLoad.getValue(1));
16429     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16430     // another round of DAGCombining.
16431     if (i == 0)
16432       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16433     else
16434       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16435                         ScalarLoad, DAG.getIntPtrConstant(i));
16436
16437     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16438   }
16439
16440   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16441
16442   // Bitcast the loaded value to a vector of the original element type, in
16443   // the size of the target vector type.
16444   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16445   unsigned SizeRatio = RegSz / MemSz;
16446
16447   if (Ext == ISD::SEXTLOAD) {
16448     // If we have SSE4.1, we can directly emit a VSEXT node.
16449     if (Subtarget->hasSSE41()) {
16450       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16451       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16452       return Sext;
16453     }
16454
16455     // Otherwise we'll shuffle the small elements in the high bits of the
16456     // larger type and perform an arithmetic shift. If the shift is not legal
16457     // it's better to scalarize.
16458     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16459            "We can't implement a sext load without an arithmetic right shift!");
16460
16461     // Redistribute the loaded elements into the different locations.
16462     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16463     for (unsigned i = 0; i != NumElems; ++i)
16464       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16465
16466     SDValue Shuff = DAG.getVectorShuffle(
16467         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16468
16469     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16470
16471     // Build the arithmetic shift.
16472     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16473                    MemVT.getVectorElementType().getSizeInBits();
16474     Shuff =
16475         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16476
16477     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16478     return Shuff;
16479   }
16480
16481   // Redistribute the loaded elements into the different locations.
16482   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16483   for (unsigned i = 0; i != NumElems; ++i)
16484     ShuffleVec[i * SizeRatio] = i;
16485
16486   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16487                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16488
16489   // Bitcast to the requested type.
16490   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16491   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16492   return Shuff;
16493 }
16494
16495 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
16496 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16497 // from the AND / OR.
16498 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16499   Opc = Op.getOpcode();
16500   if (Opc != ISD::OR && Opc != ISD::AND)
16501     return false;
16502   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16503           Op.getOperand(0).hasOneUse() &&
16504           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16505           Op.getOperand(1).hasOneUse());
16506 }
16507
16508 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16509 // 1 and that the SETCC node has a single use.
16510 static bool isXor1OfSetCC(SDValue Op) {
16511   if (Op.getOpcode() != ISD::XOR)
16512     return false;
16513   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16514   if (N1C && N1C->getAPIntValue() == 1) {
16515     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16516       Op.getOperand(0).hasOneUse();
16517   }
16518   return false;
16519 }
16520
16521 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16522   bool addTest = true;
16523   SDValue Chain = Op.getOperand(0);
16524   SDValue Cond  = Op.getOperand(1);
16525   SDValue Dest  = Op.getOperand(2);
16526   SDLoc dl(Op);
16527   SDValue CC;
16528   bool Inverted = false;
16529
16530   if (Cond.getOpcode() == ISD::SETCC) {
16531     // Check for setcc([su]{add,sub,mul}o == 0).
16532     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16533         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16534         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16535         Cond.getOperand(0).getResNo() == 1 &&
16536         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16537          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16538          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16539          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16540          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16541          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16542       Inverted = true;
16543       Cond = Cond.getOperand(0);
16544     } else {
16545       SDValue NewCond = LowerSETCC(Cond, DAG);
16546       if (NewCond.getNode())
16547         Cond = NewCond;
16548     }
16549   }
16550 #if 0
16551   // FIXME: LowerXALUO doesn't handle these!!
16552   else if (Cond.getOpcode() == X86ISD::ADD  ||
16553            Cond.getOpcode() == X86ISD::SUB  ||
16554            Cond.getOpcode() == X86ISD::SMUL ||
16555            Cond.getOpcode() == X86ISD::UMUL)
16556     Cond = LowerXALUO(Cond, DAG);
16557 #endif
16558
16559   // Look pass (and (setcc_carry (cmp ...)), 1).
16560   if (Cond.getOpcode() == ISD::AND &&
16561       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16562     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16563     if (C && C->getAPIntValue() == 1)
16564       Cond = Cond.getOperand(0);
16565   }
16566
16567   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16568   // setting operand in place of the X86ISD::SETCC.
16569   unsigned CondOpcode = Cond.getOpcode();
16570   if (CondOpcode == X86ISD::SETCC ||
16571       CondOpcode == X86ISD::SETCC_CARRY) {
16572     CC = Cond.getOperand(0);
16573
16574     SDValue Cmp = Cond.getOperand(1);
16575     unsigned Opc = Cmp.getOpcode();
16576     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16577     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16578       Cond = Cmp;
16579       addTest = false;
16580     } else {
16581       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16582       default: break;
16583       case X86::COND_O:
16584       case X86::COND_B:
16585         // These can only come from an arithmetic instruction with overflow,
16586         // e.g. SADDO, UADDO.
16587         Cond = Cond.getNode()->getOperand(1);
16588         addTest = false;
16589         break;
16590       }
16591     }
16592   }
16593   CondOpcode = Cond.getOpcode();
16594   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16595       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16596       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16597        Cond.getOperand(0).getValueType() != MVT::i8)) {
16598     SDValue LHS = Cond.getOperand(0);
16599     SDValue RHS = Cond.getOperand(1);
16600     unsigned X86Opcode;
16601     unsigned X86Cond;
16602     SDVTList VTs;
16603     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16604     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16605     // X86ISD::INC).
16606     switch (CondOpcode) {
16607     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16608     case ISD::SADDO:
16609       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16610         if (C->isOne()) {
16611           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16612           break;
16613         }
16614       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16615     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16616     case ISD::SSUBO:
16617       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16618         if (C->isOne()) {
16619           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16620           break;
16621         }
16622       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16623     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16624     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16625     default: llvm_unreachable("unexpected overflowing operator");
16626     }
16627     if (Inverted)
16628       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16629     if (CondOpcode == ISD::UMULO)
16630       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16631                           MVT::i32);
16632     else
16633       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16634
16635     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16636
16637     if (CondOpcode == ISD::UMULO)
16638       Cond = X86Op.getValue(2);
16639     else
16640       Cond = X86Op.getValue(1);
16641
16642     CC = DAG.getConstant(X86Cond, MVT::i8);
16643     addTest = false;
16644   } else {
16645     unsigned CondOpc;
16646     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16647       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16648       if (CondOpc == ISD::OR) {
16649         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16650         // two branches instead of an explicit OR instruction with a
16651         // separate test.
16652         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16653             isX86LogicalCmp(Cmp)) {
16654           CC = Cond.getOperand(0).getOperand(0);
16655           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16656                               Chain, Dest, CC, Cmp);
16657           CC = Cond.getOperand(1).getOperand(0);
16658           Cond = Cmp;
16659           addTest = false;
16660         }
16661       } else { // ISD::AND
16662         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16663         // two branches instead of an explicit AND instruction with a
16664         // separate test. However, we only do this if this block doesn't
16665         // have a fall-through edge, because this requires an explicit
16666         // jmp when the condition is false.
16667         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16668             isX86LogicalCmp(Cmp) &&
16669             Op.getNode()->hasOneUse()) {
16670           X86::CondCode CCode =
16671             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16672           CCode = X86::GetOppositeBranchCondition(CCode);
16673           CC = DAG.getConstant(CCode, MVT::i8);
16674           SDNode *User = *Op.getNode()->use_begin();
16675           // Look for an unconditional branch following this conditional branch.
16676           // We need this because we need to reverse the successors in order
16677           // to implement FCMP_OEQ.
16678           if (User->getOpcode() == ISD::BR) {
16679             SDValue FalseBB = User->getOperand(1);
16680             SDNode *NewBR =
16681               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16682             assert(NewBR == User);
16683             (void)NewBR;
16684             Dest = FalseBB;
16685
16686             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16687                                 Chain, Dest, CC, Cmp);
16688             X86::CondCode CCode =
16689               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16690             CCode = X86::GetOppositeBranchCondition(CCode);
16691             CC = DAG.getConstant(CCode, MVT::i8);
16692             Cond = Cmp;
16693             addTest = false;
16694           }
16695         }
16696       }
16697     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16698       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16699       // It should be transformed during dag combiner except when the condition
16700       // is set by a arithmetics with overflow node.
16701       X86::CondCode CCode =
16702         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16703       CCode = X86::GetOppositeBranchCondition(CCode);
16704       CC = DAG.getConstant(CCode, MVT::i8);
16705       Cond = Cond.getOperand(0).getOperand(1);
16706       addTest = false;
16707     } else if (Cond.getOpcode() == ISD::SETCC &&
16708                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16709       // For FCMP_OEQ, we can emit
16710       // two branches instead of an explicit AND instruction with a
16711       // separate test. However, we only do this if this block doesn't
16712       // have a fall-through edge, because this requires an explicit
16713       // jmp when the condition is false.
16714       if (Op.getNode()->hasOneUse()) {
16715         SDNode *User = *Op.getNode()->use_begin();
16716         // Look for an unconditional branch following this conditional branch.
16717         // We need this because we need to reverse the successors in order
16718         // to implement FCMP_OEQ.
16719         if (User->getOpcode() == ISD::BR) {
16720           SDValue FalseBB = User->getOperand(1);
16721           SDNode *NewBR =
16722             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16723           assert(NewBR == User);
16724           (void)NewBR;
16725           Dest = FalseBB;
16726
16727           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16728                                     Cond.getOperand(0), Cond.getOperand(1));
16729           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16730           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16731           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16732                               Chain, Dest, CC, Cmp);
16733           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16734           Cond = Cmp;
16735           addTest = false;
16736         }
16737       }
16738     } else if (Cond.getOpcode() == ISD::SETCC &&
16739                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16740       // For FCMP_UNE, we can emit
16741       // two branches instead of an explicit AND instruction with a
16742       // separate test. However, we only do this if this block doesn't
16743       // have a fall-through edge, because this requires an explicit
16744       // jmp when the condition is false.
16745       if (Op.getNode()->hasOneUse()) {
16746         SDNode *User = *Op.getNode()->use_begin();
16747         // Look for an unconditional branch following this conditional branch.
16748         // We need this because we need to reverse the successors in order
16749         // to implement FCMP_UNE.
16750         if (User->getOpcode() == ISD::BR) {
16751           SDValue FalseBB = User->getOperand(1);
16752           SDNode *NewBR =
16753             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16754           assert(NewBR == User);
16755           (void)NewBR;
16756
16757           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16758                                     Cond.getOperand(0), Cond.getOperand(1));
16759           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16760           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16761           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16762                               Chain, Dest, CC, Cmp);
16763           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16764           Cond = Cmp;
16765           addTest = false;
16766           Dest = FalseBB;
16767         }
16768       }
16769     }
16770   }
16771
16772   if (addTest) {
16773     // Look pass the truncate if the high bits are known zero.
16774     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16775         Cond = Cond.getOperand(0);
16776
16777     // We know the result of AND is compared against zero. Try to match
16778     // it to BT.
16779     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16780       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16781       if (NewSetCC.getNode()) {
16782         CC = NewSetCC.getOperand(0);
16783         Cond = NewSetCC.getOperand(1);
16784         addTest = false;
16785       }
16786     }
16787   }
16788
16789   if (addTest) {
16790     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16791     CC = DAG.getConstant(X86Cond, MVT::i8);
16792     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16793   }
16794   Cond = ConvertCmpIfNecessary(Cond, DAG);
16795   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16796                      Chain, Dest, CC, Cond);
16797 }
16798
16799 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16800 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16801 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16802 // that the guard pages used by the OS virtual memory manager are allocated in
16803 // correct sequence.
      //
      // Depending on the target and the function, one of three sequences is built:
      //   1. Neither a non-MachO Windows target nor a split-stack function: the
      //      stack pointer is decremented inline, bracketed by
      //      CALLSEQ_START/CALLSEQ_END.
      //   2. Split-stack function: an X86ISD::SEG_ALLOCA node is emitted.
      //   3. Otherwise: the size is copied into RAX/EAX and an
      //      X86ISD::WIN_ALLOCA node is emitted.
      // Each path returns a two-value merge of the result value and a chain.
16804 SDValue
16805 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16806                                            SelectionDAG &DAG) const {
16807   MachineFunction &MF = DAG.getMachineFunction();
16808   bool SplitStack = MF.shouldSplitStack();
        // The WIN_ALLOCA/SEG_ALLOCA lowering is used on Windows targets that are
        // not MachO, and for every function that uses split stacks.
16809   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16810                SplitStack;
16811   SDLoc dl(Op);
16812
        // Path 1: adjust the stack pointer inline.
16813   if (!Lower) {
16814     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16815     SDNode* Node = Op.getNode();
16816
16817     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16818     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16819         " not tell us which reg is the stack pointer!");
16820     EVT VT = Node->getValueType(0);
          // Tmp1/Tmp2 name results 0 and 1 of Node; Tmp3 is operand 2, which is
          // read further down as the constant alignment.
          // NOTE(review): Tmp1.getOperand(0) and Tmp2.getOperand(1) appear to read
          // Node's own operands 0 and 1 (chain and size), matching the
          // Op.getOperand(0)/(1) reads in the other paths -- confirm.
16821     SDValue Tmp1 = SDValue(Node, 0);
16822     SDValue Tmp2 = SDValue(Node, 1);
16823     SDValue Tmp3 = Node->getOperand(2);
16824     SDValue Chain = Tmp1.getOperand(0);
16825
16826     // Chain the dynamic stack allocation so that it doesn't modify the stack
16827     // pointer when other instructions are using the stack.
16828     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16829         SDLoc(Node));
16830
16831     SDValue Size = Tmp2.getOperand(1);
16832     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16833     Chain = SP.getValue(1);
16834     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16835     const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
16836     unsigned StackAlign = TFI.getStackAlignment();
16837     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask the new stack pointer down when the requested alignment
          // exceeds the stack alignment reported by the frame lowering.
16838     if (Align > StackAlign)
16839       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16840           DAG.getConstant(-(uint64_t)Align, VT));
16841     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16842
16843     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16844         DAG.getIntPtrConstant(0, true), SDValue(),
16845         SDLoc(Node));
16846
          // Result 0 is the new stack pointer value, result 1 the CALLSEQ_END node.
16847     SDValue Ops[2] = { Tmp1, Tmp2 };
16848     return DAG.getMergeValues(Ops, dl);
16849   }
16850
16851   // Get the inputs.
16852   SDValue Chain = Op.getOperand(0);
16853   SDValue Size  = Op.getOperand(1);
16854   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16855   EVT VT = Op.getNode()->getValueType(0);
16856
16857   bool Is64Bit = Subtarget->is64Bit();
16858   EVT SPTy = getPointerTy();
16859
        // Path 2: segmented (split) stacks.
16860   if (SplitStack) {
16861     MachineRegisterInfo &MRI = MF.getRegInfo();
16862
16863     if (Is64Bit) {
16864       // The 64-bit implementation of segmented stacks needs to clobber both r10
16865       // and r11. This makes it impossible to use it along with nested parameters.
16866       const Function *F = MF.getFunction();
16867
            // Abort compilation if any argument carries the 'nest' attribute.
16868       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16869            I != E; ++I)
16870         if (I->hasNestAttr())
16871           report_fatal_error("Cannot use segmented stacks with functions that "
16872                              "have nested arguments.");
16873     }
16874
          // Copy the size into a fresh pointer-class virtual register and hand that
          // register to SEG_ALLOCA.
16875     const TargetRegisterClass *AddrRegClass =
16876       getRegClassFor(getPointerTy());
16877     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16878     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16879     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16880                                 DAG.getRegister(Vreg, SPTy));
          // The returned chain is the CopyToReg chain; the SEG_ALLOCA node is
          // created with a single result type (SPTy).
16881     SDValue Ops1[2] = { Value, Chain };
16882     return DAG.getMergeValues(Ops1, dl);
16883   } else {
          // Path 3: WIN_ALLOCA. The size is copied into RAX (64-bit LP64 targets)
          // or EAX, and that copy is glued to the WIN_ALLOCA node.
16884     SDValue Flag;
16885     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16886
16887     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16888     Flag = Chain.getValue(1);
16889     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16890
16891     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16892
          // Read back the stack register after WIN_ALLOCA; its value is the result.
16893     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
16894     unsigned SPReg = RegInfo->getStackRegister();
16895     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16896     Chain = SP.getValue(1);
16897
          // Any non-zero alignment masks the stack pointer and writes it back.
          // NOTE(review): unlike the inline path above, this is not compared
          // against the stack alignment, and the AND uses VT rather than SPTy.
16898     if (Align) {
16899       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16900                        DAG.getConstant(-(uint64_t)Align, VT));
16901       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16902     }
16903
16904     SDValue Ops1[2] = { SP, Chain };
16905     return DAG.getMergeValues(Ops1, dl);
16906   }
16907 }
16908
16909 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
        // Lower a va_start node. As read below, operand 0 is the incoming chain,
        // operand 1 is the memory location to initialize, and operand 2 carries
        // the IR source value used to build the stores' MachinePointerInfo.
        // 32-bit and Win64 targets store a single pointer; the remaining 64-bit
        // targets fill in the four fields of __va_list_tag and return a
        // TokenFactor over the four stores.
16910   MachineFunction &MF = DAG.getMachineFunction();
16911   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16912
16913   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16914   SDLoc DL(Op);
16915
16916   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16917     // vastart just stores the address of the VarArgsFrameIndex slot into the
16918     // memory location argument.
16919     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16920                                    getPointerTy());
16921     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16922                         MachinePointerInfo(SV), false, false, 0);
16923   }
16924
16925   // __va_list_tag:
16926   //   gp_offset         (0 - 6 * 8)
16927   //   fp_offset         (48 - 48 + 8 * 16)
16928   //   overflow_arg_area (point to parameters coming in memory).
16929   //   reg_save_area
        // The fields are written at byte offsets 0, 4, 8 and 16 from operand 1.
        // Every store below uses the original chain (operand 0), so the stores
        // are not ordered relative to one another.
16930   SmallVector<SDValue, 8> MemOps;
16931   SDValue FIN = Op.getOperand(1);
16932   // Store gp_offset
16933   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16934                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16935                                                MVT::i32),
16936                                FIN, MachinePointerInfo(SV), false, false, 0);
16937   MemOps.push_back(Store);
16938
16939   // Store fp_offset
        // FIN is advanced cumulatively: +4 here gives offset 4.
16940   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16941                     FIN, DAG.getIntPtrConstant(4));
16942   Store = DAG.getStore(Op.getOperand(0), DL,
16943                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16944                                        MVT::i32),
16945                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16946   MemOps.push_back(Store);
16947
16948   // Store ptr to overflow_arg_area
        // A further +4 gives offset 8; the value stored is the address of the
        // VarArgsFrameIndex slot.
16949   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16950                     FIN, DAG.getIntPtrConstant(4));
16951   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16952                                     getPointerTy());
16953   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16954                        MachinePointerInfo(SV, 8),
16955                        false, false, 0);
16956   MemOps.push_back(Store);
16957
16958   // Store ptr to reg_save_area.
        // A further +8 gives offset 16; the value stored is the address of the
        // RegSaveFrameIndex slot.
16959   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16960                     FIN, DAG.getIntPtrConstant(8));
16961   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16962                                     getPointerTy());
16963   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16964                        MachinePointerInfo(SV, 16), false, false, 0);
16965   MemOps.push_back(Store);
        // Join the four independent stores into a single output chain.
16966   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16967 }
16968
16969 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
        // Lower a va_arg node for 64-bit Linux/Darwin targets (asserted below).
        // The node must have exactly four operands, read here as: chain, pointer
        // to the va_list, IR source value, and a constant alignment.
        // An X86ISD::VAARG_64 node is emitted to produce the address of the next
        // argument, and the function returns a load of ArgVT from that address.
16970   assert(Subtarget->is64Bit() &&
16971          "LowerVAARG only handles 64-bit va_arg!");
16972   assert((Subtarget->isTargetLinux() ||
16973           Subtarget->isTargetDarwin()) &&
16974           "Unhandled target in LowerVAARG");
16975   assert(Op.getNode()->getNumOperands() == 4);
16976   SDValue Chain = Op.getOperand(0);
16977   SDValue SrcPtr = Op.getOperand(1);
16978   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16979   unsigned Align = Op.getConstantOperandVal(3);
16980   SDLoc dl(Op);
16981
16982   EVT ArgVT = Op.getNode()->getValueType(0);
16983   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16984   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
        // ArgMode is assigned on every path that does not hit llvm_unreachable:
        // 1 selects gp_offset (GPR), 2 selects fp_offset (XMM).
16985   uint8_t ArgMode;
16986
16987   // Decide which area this value should be read from.
16988   // TODO: Implement the AMD64 ABI in its entirety. This simple
16989   // selection mechanism works only for the basic types.
16990   if (ArgVT == MVT::f80) {
16991     llvm_unreachable("va_arg for f80 not yet implemented");
16992   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16993     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16994   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16995     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16996   } else {
16997     llvm_unreachable("Unhandled argument type in LowerVAARG");
16998   }
16999
17000   if (ArgMode == 2) {
17001     // Sanity Check: Make sure using fp_offset makes sense.
          // That is: soft-float is off, the function is not marked
          // NoImplicitFloat, and the subtarget has SSE1.
17002     assert(!DAG.getTarget().Options.UseSoftFloat &&
17003            !(DAG.getMachineFunction()
17004                 .getFunction()->getAttributes()
17005                 .hasAttribute(AttributeSet::FunctionIndex,
17006                               Attribute::NoImplicitFloat)) &&
17007            Subtarget->hasSSE1());
17008   }
17009
17010   // Insert VAARG_64 node into the DAG
17011   // VAARG_64 returns two values: Variable Argument Address, Chain
        // Its operands are, in order: chain, va_list pointer, argument size (i32),
        // argument mode (i8) and alignment (i32).
17012   SmallVector<SDValue, 11> InstOps;
17013   InstOps.push_back(Chain);
17014   InstOps.push_back(SrcPtr);
17015   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
17016   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
17017   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
17018   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
        // The node is marked as both reading and writing memory at SV.
17019   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
17020                                           VTs, InstOps, MVT::i64,
17021                                           MachinePointerInfo(SV),
17022                                           /*Align=*/0,
17023                                           /*Volatile=*/false,
17024                                           /*ReadMem=*/true,
17025                                           /*WriteMem=*/true);
17026   Chain = VAARG.getValue(1);
17027
17028   // Load the next argument and return it
        // The load is chained after VAARG_64 and uses an empty MachinePointerInfo.
17029   return DAG.getLoad(ArgVT, dl,
17030                      Chain,
17031                      VAARG,
17032                      MachinePointerInfo(),
17033                      false, false, false, 0);
17034 }
17035
17036 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
17037                            SelectionDAG &DAG) {
        // Lower a va_copy node on 64-bit targets to a fixed-size memcpy.
        // Operands as read below: 0 = chain, 1 = destination pointer,
        // 2 = source pointer, 3/4 = IR source values for destination/source.
17038   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
17039   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
17040   SDValue Chain = Op.getOperand(0);
17041   SDValue DstPtr = Op.getOperand(1);
17042   SDValue SrcPtr = Op.getOperand(2);
17043   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17044   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17045   SDLoc DL(Op);
17046
        // 24 bytes = 4 + 4 + 8 + 8 for the struct described above.
        // NOTE(review): the literal 8 before isVolatile is presumably getMemcpy's
        // alignment argument, and the unnamed 'false' its always-inline flag --
        // confirm against the getMemcpy declaration.
17047   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17048                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
17049                        false,
17050                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17051 }
17052
17053 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
17054 // amount is a constant. Takes immediate version of shift as input.
      //
      // Opc is expected to be X86ISD::VSHLI, VSRLI or VSRAI. The result is one of:
      //   * SrcOp itself, when ShiftAmt is 0;
      //   * an all-zero constant of type VT, when a VSHLI/VSRLI amount is at least
      //     the element width (a VSRAI amount is clamped to width - 1 instead);
      //   * a BUILD_VECTOR of pre-shifted constants, when SrcOp is a build vector
      //     of constants/undefs whose type equals VT;
      //   * otherwise an Opc node with the amount as an i8 constant.
17055 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
17056                                           SDValue SrcOp, uint64_t ShiftAmt,
17057                                           SelectionDAG &DAG) {
17058   MVT ElementType = VT.getVectorElementType();
17059
17060   // Fold this packed shift into its first operand if ShiftAmt is 0.
17061   if (ShiftAmt == 0)
17062     return SrcOp;
17063
17064   // Check for ShiftAmt >= element width
17065   if (ShiftAmt >= ElementType.getSizeInBits()) {
17066     if (Opc == X86ISD::VSRAI)
17067       ShiftAmt = ElementType.getSizeInBits() - 1;
17068     else
17069       return DAG.getConstant(0, VT);
17070   }
17071
        // NOTE(review): this assert runs after the early returns above, so an
        // unexpected Opc combined with a zero or oversized ShiftAmt returns
        // without tripping it.
17072   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17073          && "Unknown target vector shift-by-constant node");
17074
17075   // Fold this packed vector shift into a build vector if SrcOp is a
17076   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
17077   if (VT == SrcOp.getSimpleValueType() &&
17078       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17079     SmallVector<SDValue, 8> Elts;
17080     unsigned NumElts = SrcOp->getNumOperands();
17081     ConstantSDNode *ND;
17082
          // The three cases differ only in the APInt shift applied (shl, lshr,
          // ashr); UNDEF elements are passed through unchanged in each.
17083     switch(Opc) {
17084     default: llvm_unreachable(nullptr);
17085     case X86ISD::VSHLI:
17086       for (unsigned i=0; i!=NumElts; ++i) {
17087         SDValue CurrentOp = SrcOp->getOperand(i);
17088         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17089           Elts.push_back(CurrentOp);
17090           continue;
17091         }
17092         ND = cast<ConstantSDNode>(CurrentOp);
17093         const APInt &C = ND->getAPIntValue();
17094         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
17095       }
17096       break;
17097     case X86ISD::VSRLI:
17098       for (unsigned i=0; i!=NumElts; ++i) {
17099         SDValue CurrentOp = SrcOp->getOperand(i);
17100         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17101           Elts.push_back(CurrentOp);
17102           continue;
17103         }
17104         ND = cast<ConstantSDNode>(CurrentOp);
17105         const APInt &C = ND->getAPIntValue();
17106         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
17107       }
17108       break;
17109     case X86ISD::VSRAI:
17110       for (unsigned i=0; i!=NumElts; ++i) {
17111         SDValue CurrentOp = SrcOp->getOperand(i);
17112         if (CurrentOp->getOpcode() == ISD::UNDEF) {
17113           Elts.push_back(CurrentOp);
17114           continue;
17115         }
17116         ND = cast<ConstantSDNode>(CurrentOp);
17117         const APInt &C = ND->getAPIntValue();
17118         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
17119       }
17120       break;
17121     }
17122
17123     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
17124   }
17125
        // No folding applied: emit the immediate-form shift node.
17126   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
17127 }
17128
17129 // getTargetVShiftNode - Handle vector element shifts where the shift amount
17130 // may or may not be a constant. Takes immediate version of shift as input.
      //
      // ShAmt must be of type i32 or i64 (asserted). A constant amount is
      // forwarded to getTargetVShiftByConstNode. Otherwise Opc is mapped from
      // VSHLI/VSRLI/VSRAI to VSHL/VSRL/VSRA and the scalar amount is placed in a
      // 128-bit vector, which is bitcast to VT's element type and used as the
      // second operand of the new node.
17131 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
17132                                    SDValue SrcOp, SDValue ShAmt,
17133                                    SelectionDAG &DAG) {
17134   MVT SVT = ShAmt.getSimpleValueType();
17135   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17136
17137   // Catch shift-by-constant.
17138   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17139     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17140                                       CShAmt->getZExtValue(), DAG);
17141
17142   // Change opcode to non-immediate version
17143   switch (Opc) {
17144     default: llvm_unreachable("Unknown target vector shift node");
17145     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17146     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17147     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17148   }
17149
17150   const X86Subtarget &Subtarget =
17151       static_cast<const X86Subtarget &>(DAG.getSubtarget());
        // Special case: with SSE4.1, an amount that is a zero-extension of an i16
        // is rebuilt from the narrow i16 source instead of the extended value.
17152   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17153       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17154     // Let the shuffle legalizer expand this shift amount node.
17155     SDValue Op0 = ShAmt.getOperand(0);
17156     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17157     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
17158   } else {
17159     // Need to build a vector containing shift amount.
17160     // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
          // For an i32 amount the vector is <ShAmt, 0, undef, undef> (v4i32), so
          // the low 64 bits hold the zero-extended amount; for an i64 amount it
          // is <ShAmt, undef> (v2i64).
17161     SmallVector<SDValue, 4> ShOps;
17162     ShOps.push_back(ShAmt);
17163     if (SVT == MVT::i32) {
17164       ShOps.push_back(DAG.getConstant(0, SVT));
17165       ShOps.push_back(DAG.getUNDEF(SVT));
17166     }
17167     ShOps.push_back(DAG.getUNDEF(SVT));
17168
17169     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17170     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
17171   }
17172
17173   // The return type has to be a 128-bit type with the same element
17174   // type as the input type.
17175   MVT EltVT = VT.getVectorElementType();
17176   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17177
17178   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
17179   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17180 }
17181
17182 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17183 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17184 /// necessary casting for \p Mask when lowering masking intrinsics.
      ///
      /// \p Op is returned unchanged when \p Mask is all ones. An UNDEF
      /// \p PreservedSrc is replaced by a zero vector of \p Op's type before the
      /// vselect is built.
17185 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17186                                     SDValue PreservedSrc,
17187                                     const X86Subtarget *Subtarget,
17188                                     SelectionDAG &DAG) {
17189     EVT VT = Op.getValueType();
          // MaskVT: one i1 per element of Op.
17190     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
17191                                   MVT::i1, VT.getVectorNumElements());
          // BitcastVT: one i1 per bit of the incoming Mask value.
17192     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17193                                      Mask.getValueType().getSizeInBits());
17194     SDLoc dl(Op);
17195
17196     assert(MaskVT.isSimple() && "invalid mask type");
17197
          // An all-ones mask selects every element, so no masking node is needed.
17198     if (isAllOnes(Mask))
17199       return Op;
17200
17201     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
17202     // are extracted by EXTRACT_SUBVECTOR.
17203     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17204                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17205                               DAG.getIntPtrConstant(0));
17206
          // Compare nodes are combined with the mask by AND; every other opcode
          // falls through to the vselect below.
17207     switch (Op.getOpcode()) {
17208       default: break;
17209       case X86ISD::PCMPEQM:
17210       case X86ISD::PCMPGTM:
17211       case X86ISD::CMPM:
17212       case X86ISD::CMPMU:
17213         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17214     }
17215     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17216       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17217     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
17218 }
17219
17220 /// \brief Creates an SDNode for a predicated scalar operation.
17221 /// \returns (X86select \p Mask, \p Op, \p PreservedSrc).
17222 /// The mask is coming as MVT::i8 and it should be truncated
17223 /// to MVT::i1 while lowering masking intrinsics.
17224 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17225 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
17226 /// a scalar instruction.
      ///
      /// \p Op is returned unchanged when \p Mask is all ones. An UNDEF
      /// \p PreservedSrc is replaced by the result of getZeroVector for \p Op's
      /// type.
17227 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17228                                     SDValue PreservedSrc,
17229                                     const X86Subtarget *Subtarget,
17230                                     SelectionDAG &DAG) {
          // An all-ones mask needs no select.
17231     if (isAllOnes(Mask))
17232       return Op;
17233
17234     EVT VT = Op.getValueType();
17235     SDLoc dl(Op);
17236     // The mask should be of type MVT::i1
17237     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17238
17239     if (PreservedSrc.getOpcode() == ISD::UNDEF)
17240       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17241     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17242 }
17243
17244 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17245                                        SelectionDAG &DAG) {
17246   SDLoc dl(Op);
17247   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17248   EVT VT = Op.getValueType();
17249   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17250   if (IntrData) {
17251     switch(IntrData->Type) {
17252     case INTR_TYPE_1OP:
17253       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17254     case INTR_TYPE_2OP:
17255       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17256         Op.getOperand(2));
17257     case INTR_TYPE_3OP:
17258       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17259         Op.getOperand(2), Op.getOperand(3));
17260     case INTR_TYPE_1OP_MASK_RM: {
17261       SDValue Src = Op.getOperand(1);
17262       SDValue Src0 = Op.getOperand(2);
17263       SDValue Mask = Op.getOperand(3);
17264       SDValue RoundingMode = Op.getOperand(4);
17265       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17266                                               RoundingMode),
17267                                   Mask, Src0, Subtarget, DAG);
17268     }
17269     case INTR_TYPE_SCALAR_MASK_RM: {
17270       SDValue Src1 = Op.getOperand(1);
17271       SDValue Src2 = Op.getOperand(2);
17272       SDValue Src0 = Op.getOperand(3);
17273       SDValue Mask = Op.getOperand(4);
17274       SDValue RoundingMode = Op.getOperand(5);
17275       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17276                                               RoundingMode),
17277                                   Mask, Src0, Subtarget, DAG);
17278     }
17279     case INTR_TYPE_2OP_MASK: {
17280       SDValue Mask = Op.getOperand(4);
17281       SDValue PassThru = Op.getOperand(3);
17282       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17283       if (IntrWithRoundingModeOpcode != 0) {
17284         unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
17285         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17286           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17287                                       dl, Op.getValueType(),
17288                                       Op.getOperand(1), Op.getOperand(2),
17289                                       Op.getOperand(3), Op.getOperand(5)),
17290                                       Mask, PassThru, Subtarget, DAG);
17291         }
17292       }
17293       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17294                                               Op.getOperand(1),
17295                                               Op.getOperand(2)),
17296                                   Mask, PassThru, Subtarget, DAG);
17297     }
17298     case FMA_OP_MASK: {
17299       SDValue Src1 = Op.getOperand(1);
17300       SDValue Src2 = Op.getOperand(2);
17301       SDValue Src3 = Op.getOperand(3);
17302       SDValue Mask = Op.getOperand(4);
17303       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17304       if (IntrWithRoundingModeOpcode != 0) {
17305         SDValue Rnd = Op.getOperand(5);
17306         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17307             X86::STATIC_ROUNDING::CUR_DIRECTION)
17308           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17309                                                   dl, Op.getValueType(),
17310                                                   Src1, Src2, Src3, Rnd),
17311                                       Mask, Src1, Subtarget, DAG);
17312       }
17313       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17314                                               dl, Op.getValueType(),
17315                                               Src1, Src2, Src3),
17316                                   Mask, Src1, Subtarget, DAG);
17317     }
17318     case CMP_MASK:
17319     case CMP_MASK_CC: {
17320       // Comparison intrinsics with masks.
17321       // Example of transformation:
17322       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17323       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17324       // (i8 (bitcast
17325       //   (v8i1 (insert_subvector undef,
17326       //           (v2i1 (and (PCMPEQM %a, %b),
17327       //                      (extract_subvector
17328       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17329       EVT VT = Op.getOperand(1).getValueType();
17330       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17331                                     VT.getVectorNumElements());
17332       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17333       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17334                                        Mask.getValueType().getSizeInBits());
17335       SDValue Cmp;
17336       if (IntrData->Type == CMP_MASK_CC) {
17337         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17338                     Op.getOperand(2), Op.getOperand(3));
17339       } else {
17340         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17341         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17342                     Op.getOperand(2));
17343       }
17344       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17345                                              DAG.getTargetConstant(0, MaskVT),
17346                                              Subtarget, DAG);
17347       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17348                                 DAG.getUNDEF(BitcastVT), CmpMask,
17349                                 DAG.getIntPtrConstant(0));
17350       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17351     }
17352     case COMI: { // Comparison intrinsics
17353       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17354       SDValue LHS = Op.getOperand(1);
17355       SDValue RHS = Op.getOperand(2);
17356       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17357       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17358       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17359       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17360                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17361       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17362     }
17363     case VSHIFT:
17364       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17365                                  Op.getOperand(1), Op.getOperand(2), DAG);
17366     case VSHIFT_MASK:
17367       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17368                                                       Op.getSimpleValueType(),
17369                                                       Op.getOperand(1),
17370                                                       Op.getOperand(2), DAG),
17371                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17372                                   DAG);
17373     case COMPRESS_EXPAND_IN_REG: {
17374       SDValue Mask = Op.getOperand(3);
17375       SDValue DataToCompress = Op.getOperand(1);
17376       SDValue PassThru = Op.getOperand(2);
17377       if (isAllOnes(Mask)) // return data as is
17378         return Op.getOperand(1);
17379       EVT VT = Op.getValueType();
17380       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17381                                     VT.getVectorNumElements());
17382       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17383                                        Mask.getValueType().getSizeInBits());
17384       SDLoc dl(Op);
17385       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17386                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17387                                   DAG.getIntPtrConstant(0));
17388
17389       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17390                          PassThru);
17391     }
17392     case BLEND: {
17393       SDValue Mask = Op.getOperand(3);
17394       EVT VT = Op.getValueType();
17395       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17396                                     VT.getVectorNumElements());
17397       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17398                                        Mask.getValueType().getSizeInBits());
17399       SDLoc dl(Op);
17400       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17401                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17402                                   DAG.getIntPtrConstant(0));
17403       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17404                          Op.getOperand(2));
17405     }
17406     default:
17407       break;
17408     }
17409   }
17410
17411   switch (IntNo) {
17412   default: return SDValue();    // Don't custom lower most intrinsics.
17413
17414   case Intrinsic::x86_avx512_mask_valign_q_512:
17415   case Intrinsic::x86_avx512_mask_valign_d_512:
17416     // Vector source operands are swapped.
17417     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17418                                             Op.getValueType(), Op.getOperand(2),
17419                                             Op.getOperand(1),
17420                                             Op.getOperand(3)),
17421                                 Op.getOperand(5), Op.getOperand(4),
17422                                 Subtarget, DAG);
17423
17424   // ptest and testp intrinsics. The intrinsic these come from are designed to
17425   // return an integer value, not just an instruction so lower it to the ptest
17426   // or testp pattern and a setcc for the result.
17427   case Intrinsic::x86_sse41_ptestz:
17428   case Intrinsic::x86_sse41_ptestc:
17429   case Intrinsic::x86_sse41_ptestnzc:
17430   case Intrinsic::x86_avx_ptestz_256:
17431   case Intrinsic::x86_avx_ptestc_256:
17432   case Intrinsic::x86_avx_ptestnzc_256:
17433   case Intrinsic::x86_avx_vtestz_ps:
17434   case Intrinsic::x86_avx_vtestc_ps:
17435   case Intrinsic::x86_avx_vtestnzc_ps:
17436   case Intrinsic::x86_avx_vtestz_pd:
17437   case Intrinsic::x86_avx_vtestc_pd:
17438   case Intrinsic::x86_avx_vtestnzc_pd:
17439   case Intrinsic::x86_avx_vtestz_ps_256:
17440   case Intrinsic::x86_avx_vtestc_ps_256:
17441   case Intrinsic::x86_avx_vtestnzc_ps_256:
17442   case Intrinsic::x86_avx_vtestz_pd_256:
17443   case Intrinsic::x86_avx_vtestc_pd_256:
17444   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17445     bool IsTestPacked = false;
17446     unsigned X86CC;
17447     switch (IntNo) {
17448     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17449     case Intrinsic::x86_avx_vtestz_ps:
17450     case Intrinsic::x86_avx_vtestz_pd:
17451     case Intrinsic::x86_avx_vtestz_ps_256:
17452     case Intrinsic::x86_avx_vtestz_pd_256:
17453       IsTestPacked = true; // Fallthrough
17454     case Intrinsic::x86_sse41_ptestz:
17455     case Intrinsic::x86_avx_ptestz_256:
17456       // ZF = 1
17457       X86CC = X86::COND_E;
17458       break;
17459     case Intrinsic::x86_avx_vtestc_ps:
17460     case Intrinsic::x86_avx_vtestc_pd:
17461     case Intrinsic::x86_avx_vtestc_ps_256:
17462     case Intrinsic::x86_avx_vtestc_pd_256:
17463       IsTestPacked = true; // Fallthrough
17464     case Intrinsic::x86_sse41_ptestc:
17465     case Intrinsic::x86_avx_ptestc_256:
17466       // CF = 1
17467       X86CC = X86::COND_B;
17468       break;
17469     case Intrinsic::x86_avx_vtestnzc_ps:
17470     case Intrinsic::x86_avx_vtestnzc_pd:
17471     case Intrinsic::x86_avx_vtestnzc_ps_256:
17472     case Intrinsic::x86_avx_vtestnzc_pd_256:
17473       IsTestPacked = true; // Fallthrough
17474     case Intrinsic::x86_sse41_ptestnzc:
17475     case Intrinsic::x86_avx_ptestnzc_256:
17476       // ZF and CF = 0
17477       X86CC = X86::COND_A;
17478       break;
17479     }
17480
17481     SDValue LHS = Op.getOperand(1);
17482     SDValue RHS = Op.getOperand(2);
17483     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17484     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17485     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17486     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17487     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17488   }
17489   case Intrinsic::x86_avx512_kortestz_w:
17490   case Intrinsic::x86_avx512_kortestc_w: {
17491     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17492     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17493     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17494     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17495     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17496     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17497     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17498   }
17499
17500   case Intrinsic::x86_sse42_pcmpistria128:
17501   case Intrinsic::x86_sse42_pcmpestria128:
17502   case Intrinsic::x86_sse42_pcmpistric128:
17503   case Intrinsic::x86_sse42_pcmpestric128:
17504   case Intrinsic::x86_sse42_pcmpistrio128:
17505   case Intrinsic::x86_sse42_pcmpestrio128:
17506   case Intrinsic::x86_sse42_pcmpistris128:
17507   case Intrinsic::x86_sse42_pcmpestris128:
17508   case Intrinsic::x86_sse42_pcmpistriz128:
17509   case Intrinsic::x86_sse42_pcmpestriz128: {
17510     unsigned Opcode;
17511     unsigned X86CC;
17512     switch (IntNo) {
17513     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17514     case Intrinsic::x86_sse42_pcmpistria128:
17515       Opcode = X86ISD::PCMPISTRI;
17516       X86CC = X86::COND_A;
17517       break;
17518     case Intrinsic::x86_sse42_pcmpestria128:
17519       Opcode = X86ISD::PCMPESTRI;
17520       X86CC = X86::COND_A;
17521       break;
17522     case Intrinsic::x86_sse42_pcmpistric128:
17523       Opcode = X86ISD::PCMPISTRI;
17524       X86CC = X86::COND_B;
17525       break;
17526     case Intrinsic::x86_sse42_pcmpestric128:
17527       Opcode = X86ISD::PCMPESTRI;
17528       X86CC = X86::COND_B;
17529       break;
17530     case Intrinsic::x86_sse42_pcmpistrio128:
17531       Opcode = X86ISD::PCMPISTRI;
17532       X86CC = X86::COND_O;
17533       break;
17534     case Intrinsic::x86_sse42_pcmpestrio128:
17535       Opcode = X86ISD::PCMPESTRI;
17536       X86CC = X86::COND_O;
17537       break;
17538     case Intrinsic::x86_sse42_pcmpistris128:
17539       Opcode = X86ISD::PCMPISTRI;
17540       X86CC = X86::COND_S;
17541       break;
17542     case Intrinsic::x86_sse42_pcmpestris128:
17543       Opcode = X86ISD::PCMPESTRI;
17544       X86CC = X86::COND_S;
17545       break;
17546     case Intrinsic::x86_sse42_pcmpistriz128:
17547       Opcode = X86ISD::PCMPISTRI;
17548       X86CC = X86::COND_E;
17549       break;
17550     case Intrinsic::x86_sse42_pcmpestriz128:
17551       Opcode = X86ISD::PCMPESTRI;
17552       X86CC = X86::COND_E;
17553       break;
17554     }
17555     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17556     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17557     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17558     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17559                                 DAG.getConstant(X86CC, MVT::i8),
17560                                 SDValue(PCMP.getNode(), 1));
17561     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17562   }
17563
17564   case Intrinsic::x86_sse42_pcmpistri128:
17565   case Intrinsic::x86_sse42_pcmpestri128: {
17566     unsigned Opcode;
17567     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17568       Opcode = X86ISD::PCMPISTRI;
17569     else
17570       Opcode = X86ISD::PCMPESTRI;
17571
17572     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17573     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17574     return DAG.getNode(Opcode, dl, VTs, NewOps);
17575   }
17576   }
17577 }
17578
// getGatherNode - Builds the machine node (opcode Opc) for a masked gather and
// returns, merged, the node's first result (typed like Op) and its chain.
//   Src     - value operand of the node; replaced by a zero vector if undef.
//   Mask    - element mask; turned into a vector of i1 (see below).
//   Base, Index, ScaleOp - address components; ScaleOp must be a constant.
//   Chain   - incoming chain, passed as the last operand of the node.
17579 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17580                               SDValue Src, SDValue Mask, SDValue Base,
17581                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17582                               const X86Subtarget * Subtarget) {
17583   SDLoc dl(Op);
        // The scale has to be a constant; this is only checked by the assert.
17584   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17585   assert(C && "Invalid scale type");
17586   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
        // The mask type has one i1 element per element of Index.
17587   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17588                              Index.getSimpleValueType().getVectorNumElements());
        // A constant mask becomes a target constant of the mask type; any other
        // mask is bitcast to it.
17589   SDValue MaskInReg;
17590   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17591   if (MaskC)
17592     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17593   else
17594     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
        // The node has three results: the value, a mask-typed result and a chain.
17595   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
        // Displacement is the constant 0 and the segment operand is register 0
        // (presumably "no segment register" - confirm).
17596   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17597   SDValue Segment = DAG.getRegister(0, MVT::i32);
17598   if (Src.getOpcode() == ISD::UNDEF)
17599     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17600   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17601   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
        // Return the value (result 0) and the chain (result 2); the mask-typed
        // result 1 is not returned.
17602   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17603   return DAG.getMergeValues(RetOps, dl);
17604 }
17605
// getScatterNode - Builds the machine node (opcode Opc) for a masked scatter
// of Src and returns the node's chain result.
//   Mask    - element mask; turned into a vector of i1 (see below).
//   Base, Index, ScaleOp - address components; ScaleOp must be a constant.
//   Chain   - incoming chain, passed as the last operand of the node.
17606 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17607                                SDValue Src, SDValue Mask, SDValue Base,
17608                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17609   SDLoc dl(Op);
        // The scale has to be a constant; this is only checked by the assert.
17610   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17611   assert(C && "Invalid scale type");
17612   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17613   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17614   SDValue Segment = DAG.getRegister(0, MVT::i32);
        // The mask type has one i1 element per element of Index.
17615   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17616                              Index.getSimpleValueType().getVectorNumElements());
        // A constant mask becomes a target constant of the mask type; any other
        // mask is bitcast to it.
17617   SDValue MaskInReg;
17618   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17619   if (MaskC)
17620     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17621   else
17622     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
        // The node has two results: a mask-typed result and a chain.
17623   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17624   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17625   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
        // Only the chain (result 1) is handed back to the caller.
17626   return SDValue(Res, 1);
17627 }
17628
// getPrefetchNode - Builds the machine node (opcode Opc) for a masked
// gather/scatter prefetch and returns its single result, the chain.
//   Mask    - element mask; turned into a vector of i1 (see below).
//   Base, Index, ScaleOp - address components; ScaleOp must be a constant.
//   Chain   - incoming chain, passed as the last operand of the node.
17629 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17630                                SDValue Mask, SDValue Base, SDValue Index,
17631                                SDValue ScaleOp, SDValue Chain) {
17632   SDLoc dl(Op);
        // The scale has to be a constant; this is only checked by the assert.
17633   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17634   assert(C && "Invalid scale type");
17635   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17636   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17637   SDValue Segment = DAG.getRegister(0, MVT::i32);
        // The mask type has one i1 element per element of Index.
17638   EVT MaskVT =
17639     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
        // A constant mask becomes a target constant of the mask type; any other
        // mask is bitcast to it.
17640   SDValue MaskInReg;
17641   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17642   if (MaskC)
17643     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17644   else
17645     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17646   // The node produces a single result of type MVT::Other (the chain).
17647   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17648   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17649   return SDValue(Res, 0);
17650 }
17651
17652 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17653 // read performance monitor counters (x86_rdpmc).
//   N       - node being lowered; must have exactly three operands. Operand 0
//             is the chain and operand 2 the counter index.
//   Results - receives two values: the 64-bit counter value, then the chain.
17654 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17655                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17656                               SmallVectorImpl<SDValue> &Results) {
17657   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17658   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17659   SDValue LO, HI;
17660
17661   // The ECX register is used to select the index of the performance counter
17662   // to read.
17663   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17664                                    N->getOperand(2));
17665   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17666
17667   // Reads the content of a 64-bit performance counter and returns it in the
17668   // registers EDX:EAX.
        // Each register copy is glued to the previous node (the glue result of
        // rd, then of LO). In 64-bit mode the copies use RAX/RDX as i64 values.
17669   if (Subtarget->is64Bit()) {
17670     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17671     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17672                             LO.getValue(2));
17673   } else {
17674     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17675     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17676                             LO.getValue(2));
17677   }
        // From here on the chain is the one produced by the copy of the high half.
17678   Chain = HI.getValue(1);
17679
17680   if (Subtarget->is64Bit()) {
17681     // The EAX register is loaded with the low-order 32 bits. The EDX register
17682     // is loaded with the supported high-order bits of the counter.
          // Combine the halves as LO | (HI << 32).
17683     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17684                               DAG.getConstant(32, MVT::i8));
17685     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17686     Results.push_back(Chain);
17687     return;
17688   }
17689
17690   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17691   SDValue Ops[] = { LO, HI };
17692   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17693   Results.push_back(Pair);
17694   Results.push_back(Chain);
17695 }
17696
17697 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17698 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17699 // also used to custom lower READCYCLECOUNTER nodes.
//   N       - node being lowered; operand 0 is the chain. For
//             X86ISD::RDTSCP_DAG it must have three operands, and operand 2 is
//             the address the ECX value is stored to.
//   Opcode  - node opcode to emit for the read.
//   Results - receives two values: the 64-bit counter value, then the chain.
17700 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17701                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17702                               SmallVectorImpl<SDValue> &Results) {
17703   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17704   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17705   SDValue LO, HI;
17706
17707   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17708   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17709   // and the EAX register is loaded with the low-order 32 bits.
        // Each register copy is glued to the previous node (the glue result of
        // rd, then of LO). In 64-bit mode the copies use RAX/RDX as i64 values.
17710   if (Subtarget->is64Bit()) {
17711     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17712     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17713                             LO.getValue(2));
17714   } else {
17715     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17716     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17717                             LO.getValue(2));
17718   }
17719   SDValue Chain = HI.getValue(1);
17720
17721   if (Opcode == X86ISD::RDTSCP_DAG) {
17722     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17723
17724     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17725     // the ECX register. Add 'ecx' explicitly to the chain.
17726     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17727                                      HI.getValue(2));
17728     // Explicitly store the content of ECX at the location passed in input
17729     // to the 'rdtscp' intrinsic.
          // The store's chain becomes the chain reported in Results.
17730     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17731                          MachinePointerInfo(), false, false, 0);
17732   }
17733
17734   if (Subtarget->is64Bit()) {
17735     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17736     // the EAX register is loaded with the low-order 32 bits.
          // Combine the halves as LO | (HI << 32).
17737     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17738                               DAG.getConstant(32, MVT::i8));
17739     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17740     Results.push_back(Chain);
17741     return;
17742   }
17743
17744   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17745   SDValue Ops[] = { LO, HI };
17746   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17747   Results.push_back(Pair);
17748   Results.push_back(Chain);
17749 }
17750
// LowerREADCYCLECOUNTER - Lowers Op through getReadTimeStampCounter() with the
// X86ISD::RDTSC_DAG opcode and returns the collected results (counter value
// and chain) merged into one node.
17751 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17752                                      SelectionDAG &DAG) {
17753   SmallVector<SDValue, 2> Results;
17754   SDLoc DL(Op);
17755   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17756                           Results);
17757   return DAG.getMergeValues(Results, DL);
17758 }
17759
17760
17761 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17762                                       SelectionDAG &DAG) {
17763   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17764
17765   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17766   if (!IntrData)
17767     return SDValue();
17768
17769   SDLoc dl(Op);
17770   switch(IntrData->Type) {
17771   default:
17772     llvm_unreachable("Unknown Intrinsic Type");
17773     break;
17774   case RDSEED:
17775   case RDRAND: {
17776     // Emit the node with the right value type.
17777     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17778     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17779
17780     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17781     // Otherwise return the value from Rand, which is always 0, casted to i32.
17782     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17783                       DAG.getConstant(1, Op->getValueType(1)),
17784                       DAG.getConstant(X86::COND_B, MVT::i32),
17785                       SDValue(Result.getNode(), 1) };
17786     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17787                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17788                                   Ops);
17789
17790     // Return { result, isValid, chain }.
17791     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17792                        SDValue(Result.getNode(), 2));
17793   }
17794   case GATHER: {
17795   //gather(v1, mask, index, base, scale);
17796     SDValue Chain = Op.getOperand(0);
17797     SDValue Src   = Op.getOperand(2);
17798     SDValue Base  = Op.getOperand(3);
17799     SDValue Index = Op.getOperand(4);
17800     SDValue Mask  = Op.getOperand(5);
17801     SDValue Scale = Op.getOperand(6);
17802     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17803                           Subtarget);
17804   }
17805   case SCATTER: {
17806   //scatter(base, mask, index, v1, scale);
17807     SDValue Chain = Op.getOperand(0);
17808     SDValue Base  = Op.getOperand(2);
17809     SDValue Mask  = Op.getOperand(3);
17810     SDValue Index = Op.getOperand(4);
17811     SDValue Src   = Op.getOperand(5);
17812     SDValue Scale = Op.getOperand(6);
17813     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17814   }
17815   case PREFETCH: {
17816     SDValue Hint = Op.getOperand(6);
17817     unsigned HintVal;
17818     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17819         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17820       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17821     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17822     SDValue Chain = Op.getOperand(0);
17823     SDValue Mask  = Op.getOperand(2);
17824     SDValue Index = Op.getOperand(3);
17825     SDValue Base  = Op.getOperand(4);
17826     SDValue Scale = Op.getOperand(5);
17827     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17828   }
17829   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17830   case RDTSC: {
17831     SmallVector<SDValue, 2> Results;
17832     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17833     return DAG.getMergeValues(Results, dl);
17834   }
17835   // Read Performance Monitoring Counters.
17836   case RDPMC: {
17837     SmallVector<SDValue, 2> Results;
17838     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17839     return DAG.getMergeValues(Results, dl);
17840   }
17841   // XTEST intrinsics.
17842   case XTEST: {
17843     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17844     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17845     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17846                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17847                                 InTrans);
17848     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17849     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17850                        Ret, SDValue(InTrans.getNode(), 1));
17851   }
17852   // ADC/ADCX/SBB
17853   case ADX: {
17854     SmallVector<SDValue, 2> Results;
17855     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17856     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17857     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17858                                 DAG.getConstant(-1, MVT::i8));
17859     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17860                               Op.getOperand(4), GenCF.getValue(1));
17861     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17862                                  Op.getOperand(5), MachinePointerInfo(),
17863                                  false, false, 0);
17864     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17865                                 DAG.getConstant(X86::COND_B, MVT::i8),
17866                                 Res.getValue(1));
17867     Results.push_back(SetCC);
17868     Results.push_back(Store);
17869     return DAG.getMergeValues(Results, dl);
17870   }
17871   case COMPRESS_TO_MEM: {
17872     SDLoc dl(Op);
17873     SDValue Mask = Op.getOperand(4);
17874     SDValue DataToCompress = Op.getOperand(3);
17875     SDValue Addr = Op.getOperand(2);
17876     SDValue Chain = Op.getOperand(0);
17877
17878     if (isAllOnes(Mask)) // return just a store
17879       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17880                           MachinePointerInfo(), false, false, 0);
17881
17882     EVT VT = DataToCompress.getValueType();
17883     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17884                                   VT.getVectorNumElements());
17885     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17886                                      Mask.getValueType().getSizeInBits());
17887     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17888                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17889                                 DAG.getIntPtrConstant(0));
17890
17891     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17892                                       DataToCompress, DAG.getUNDEF(VT));
17893     return DAG.getStore(Chain, dl, Compressed, Addr,
17894                         MachinePointerInfo(), false, false, 0);
17895   }
17896   case EXPAND_FROM_MEM: {
17897     SDLoc dl(Op);
17898     SDValue Mask = Op.getOperand(4);
17899     SDValue PathThru = Op.getOperand(3);
17900     SDValue Addr = Op.getOperand(2);
17901     SDValue Chain = Op.getOperand(0);
17902     EVT VT = Op.getValueType();
17903
17904     if (isAllOnes(Mask)) // return just a load
17905       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17906                          false, 0);
17907     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17908                                   VT.getVectorNumElements());
17909     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17910                                      Mask.getValueType().getSizeInBits());
17911     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17912                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17913                                 DAG.getIntPtrConstant(0));
17914
17915     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17916                                    false, false, false, 0);
17917
17918     SmallVector<SDValue, 2> Results;
17919     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17920                                   PathThru));
17921     Results.push_back(Chain);
17922     return DAG.getMergeValues(Results, dl);
17923   }
17924   }
17925 }
17926
// LowerRETURNADDR - Lowers a return-address query. The function's return
// address is first marked as taken; if
// verifyReturnAddressArgumentIsConstant() reports a problem, an empty SDValue
// is returned. Operand 0 of Op is the constant frame depth.
17927 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17928                                            SelectionDAG &DAG) const {
17929   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17930   MFI->setReturnAddressIsTaken(true);
17931
17932   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17933     return SDValue();
17934
17935   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17936   SDLoc dl(Op);
17937   EVT PtrVT = getPointerTy();
17938
        // For an outer frame, load from (frame address + slot size), where the
        // frame address is obtained by lowering the same Op with LowerFRAMEADDR.
17939   if (Depth > 0) {
17940     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17941     const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17942     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17943     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17944                        DAG.getNode(ISD::ADD, dl, PtrVT,
17945                                    FrameAddr, Offset),
17946                        MachinePointerInfo(), false, false, false, 0);
17947   }
17948
17949   // Just load the return address.
17950   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17951   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17952                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17953 }
17954
// LowerFRAMEADDR - Lowers a frame-address query. The function's frame address
// is marked as taken, the pointer-sized frame register is read, and that
// value is then loaded through once per level of the constant depth given in
// operand 0 of Op. Every load is chained to the entry node.
17955 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17956   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17957   MFI->setFrameAddressIsTaken(true);
17958
17959   EVT VT = Op.getValueType();
17960   SDLoc dl(Op);  // FIXME probably not meaningful
17961   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17962   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17963   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17964       DAG.getMachineFunction());
        // The register has to agree with the result type: RBP for i64, EBP for
        // i32.
17965   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17966           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17967          "Invalid Frame Register!");
17968   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17969   while (Depth--)
17970     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17971                             MachinePointerInfo(),
17972                             false, false, false, 0);
17973   return FrameAddr;
17974 }
17975
17976 // FIXME? Maybe this could be a TableGen attribute on some registers and
17977 // this table could be generated automatically from RegInfo.
// getRegisterByName - Maps the register names "esp" and "rsp" to X86::ESP and
// X86::RSP. Any other name ends in report_fatal_error. VT is not used.
17978 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17979                                               EVT VT) const {
17980   unsigned Reg = StringSwitch<unsigned>(RegName)
17981                        .Case("esp", X86::ESP)
17982                        .Case("rsp", X86::RSP)
17983                        .Default(0);
17984   if (Reg)
17985     return Reg;
17986   report_fatal_error("Invalid register name global variable");
17987 }
17988
// LowerFRAME_TO_ARGS_OFFSET - Returns a pointer-sized constant equal to twice
// the register slot size. Op itself is not inspected.
17989 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17990                                                      SelectionDAG &DAG) const {
17991   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
17992   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17993 }
17994
// LowerEH_RETURN - Lowers an EH return. Operands of Op: 0 = chain,
// 1 = offset, 2 = handler. The handler is stored at
// (frame register + slot size + offset); that address is then copied into
// RCX (64-bit pointers) or ECX and passed to an X86ISD::EH_RETURN node.
17995 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17996   SDValue Chain     = Op.getOperand(0);
17997   SDValue Offset    = Op.getOperand(1);
17998   SDValue Handler   = Op.getOperand(2);
17999   SDLoc dl      (Op);
18000
18001   EVT PtrVT = getPointerTy();
18002   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
18003   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
        // The frame register has to agree with the pointer type: RBP for i64,
        // EBP for i32.
18004   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18005           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18006          "Invalid Frame Register!");
18007   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18008   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18009
        // StoreAddr = Frame + slot size + Offset.
18010   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18011                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
18012   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18013   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18014                        false, false, 0);
18015   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18016
18017   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18018                      DAG.getRegister(StoreAddrReg, PtrVT));
18019 }
18020
// lowerEH_SJLJ_SETJMP - Replaces Op with an X86ISD::EH_SJLJ_SETJMP node that
// takes Op's operands 0 and 1 and produces an i32 value plus a chain.
18021 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18022                                                SelectionDAG &DAG) const {
18023   SDLoc DL(Op);
18024   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18025                      DAG.getVTList(MVT::i32, MVT::Other),
18026                      Op.getOperand(0), Op.getOperand(1));
18027 }
18028
// lowerEH_SJLJ_LONGJMP - Replaces Op with an X86ISD::EH_SJLJ_LONGJMP node that
// takes Op's operands 0 and 1 and produces only a chain (MVT::Other).
18029 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18030                                                 SelectionDAG &DAG) const {
18031   SDLoc DL(Op);
18032   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18033                      Op.getOperand(0), Op.getOperand(1));
18034 }
18035
// LowerADJUST_TRAMPOLINE - Returns operand 0 of Op unchanged; no new nodes are
// created and DAG is not used.
18036 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18037   return Op.getOperand(0);
18038 }
18039
18040 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18041                                                 SelectionDAG &DAG) const {
18042   SDValue Root = Op.getOperand(0);
18043   SDValue Trmp = Op.getOperand(1); // trampoline
18044   SDValue FPtr = Op.getOperand(2); // nested function
18045   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18046   SDLoc dl (Op);
18047
18048   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18049   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
18050
18051   if (Subtarget->is64Bit()) {
18052     SDValue OutChains[6];
18053
18054     // Large code-model.
18055     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18056     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18057
18058     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18059     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18060
18061     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18062
18063     // Load the pointer to the nested function into R11.
18064     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18065     SDValue Addr = Trmp;
18066     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18067                                 Addr, MachinePointerInfo(TrmpAddr),
18068                                 false, false, 0);
18069
18070     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18071                        DAG.getConstant(2, MVT::i64));
18072     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18073                                 MachinePointerInfo(TrmpAddr, 2),
18074                                 false, false, 2);
18075
18076     // Load the 'nest' parameter value into R10.
18077     // R10 is specified in X86CallingConv.td
18078     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18079     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18080                        DAG.getConstant(10, MVT::i64));
18081     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18082                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18083                                 false, false, 0);
18084
18085     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18086                        DAG.getConstant(12, MVT::i64));
18087     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18088                                 MachinePointerInfo(TrmpAddr, 12),
18089                                 false, false, 2);
18090
18091     // Jump to the nested function.
18092     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18093     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18094                        DAG.getConstant(20, MVT::i64));
18095     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
18096                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18097                                 false, false, 0);
18098
18099     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18100     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18101                        DAG.getConstant(22, MVT::i64));
18102     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
18103                                 MachinePointerInfo(TrmpAddr, 22),
18104                                 false, false, 0);
18105
18106     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18107   } else {
18108     const Function *Func =
18109       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18110     CallingConv::ID CC = Func->getCallingConv();
18111     unsigned NestReg;
18112
18113     switch (CC) {
18114     default:
18115       llvm_unreachable("Unsupported calling convention");
18116     case CallingConv::C:
18117     case CallingConv::X86_StdCall: {
18118       // Pass 'nest' parameter in ECX.
18119       // Must be kept in sync with X86CallingConv.td
18120       NestReg = X86::ECX;
18121
18122       // Check that ECX wasn't needed by an 'inreg' parameter.
18123       FunctionType *FTy = Func->getFunctionType();
18124       const AttributeSet &Attrs = Func->getAttributes();
18125
18126       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18127         unsigned InRegCount = 0;
18128         unsigned Idx = 1;
18129
18130         for (FunctionType::param_iterator I = FTy->param_begin(),
18131              E = FTy->param_end(); I != E; ++I, ++Idx)
18132           if (Attrs.hasAttribute(Idx, Attribute::InReg))
18133             // FIXME: should only count parameters that are lowered to integers.
18134             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
18135
18136         if (InRegCount > 2) {
18137           report_fatal_error("Nest register in use - reduce number of inreg"
18138                              " parameters!");
18139         }
18140       }
18141       break;
18142     }
18143     case CallingConv::X86_FastCall:
18144     case CallingConv::X86_ThisCall:
18145     case CallingConv::Fast:
18146       // Pass 'nest' parameter in EAX.
18147       // Must be kept in sync with X86CallingConv.td
18148       NestReg = X86::EAX;
18149       break;
18150     }
18151
18152     SDValue OutChains[4];
18153     SDValue Addr, Disp;
18154
18155     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18156                        DAG.getConstant(10, MVT::i32));
18157     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18158
18159     // This is storing the opcode for MOV32ri.
18160     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18161     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18162     OutChains[0] = DAG.getStore(Root, dl,
18163                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
18164                                 Trmp, MachinePointerInfo(TrmpAddr),
18165                                 false, false, 0);
18166
18167     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18168                        DAG.getConstant(1, MVT::i32));
18169     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18170                                 MachinePointerInfo(TrmpAddr, 1),
18171                                 false, false, 1);
18172
18173     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18174     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18175                        DAG.getConstant(5, MVT::i32));
18176     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18177                                 MachinePointerInfo(TrmpAddr, 5),
18178                                 false, false, 1);
18179
18180     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18181                        DAG.getConstant(6, MVT::i32));
18182     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18183                                 MachinePointerInfo(TrmpAddr, 6),
18184                                 false, false, 1);
18185
18186     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18187   }
18188 }
18189
18190 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18191                                             SelectionDAG &DAG) const {
18192   /*
18193    The rounding mode is in bits 11:10 of FPSR, and has the following
18194    settings:
18195      00 Round to nearest
18196      01 Round to -inf
18197      10 Round to +inf
18198      11 Round to 0
18199
18200   FLT_ROUNDS, on the other hand, expects the following:
18201     -1 Undefined
18202      0 Round to 0
18203      1 Round to nearest
18204      2 Round to +inf
18205      3 Round to -inf
18206
18207   To perform the conversion, we do:
18208     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18209   */
18210
18211   MachineFunction &MF = DAG.getMachineFunction();
18212   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
18213   unsigned StackAlignment = TFI.getStackAlignment();
18214   MVT VT = Op.getSimpleValueType();
18215   SDLoc DL(Op);
18216
18217   // Save FP Control Word to stack slot
18218   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18219   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18220
18221   MachineMemOperand *MMO =
18222    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18223                            MachineMemOperand::MOStore, 2, 2);
18224
18225   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18226   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18227                                           DAG.getVTList(MVT::Other),
18228                                           Ops, MVT::i16, MMO);
18229
18230   // Load FP Control Word from stack slot
18231   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18232                             MachinePointerInfo(), false, false, false, 0);
18233
18234   // Transform as necessary
18235   SDValue CWD1 =
18236     DAG.getNode(ISD::SRL, DL, MVT::i16,
18237                 DAG.getNode(ISD::AND, DL, MVT::i16,
18238                             CWD, DAG.getConstant(0x800, MVT::i16)),
18239                 DAG.getConstant(11, MVT::i8));
18240   SDValue CWD2 =
18241     DAG.getNode(ISD::SRL, DL, MVT::i16,
18242                 DAG.getNode(ISD::AND, DL, MVT::i16,
18243                             CWD, DAG.getConstant(0x400, MVT::i16)),
18244                 DAG.getConstant(9, MVT::i8));
18245
18246   SDValue RetVal =
18247     DAG.getNode(ISD::AND, DL, MVT::i16,
18248                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18249                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18250                             DAG.getConstant(1, MVT::i16)),
18251                 DAG.getConstant(3, MVT::i16));
18252
18253   return DAG.getNode((VT.getSizeInBits() < 16 ?
18254                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18255 }
18256
18257 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18258   MVT VT = Op.getSimpleValueType();
18259   EVT OpVT = VT;
18260   unsigned NumBits = VT.getSizeInBits();
18261   SDLoc dl(Op);
18262
18263   Op = Op.getOperand(0);
18264   if (VT == MVT::i8) {
18265     // Zero extend to i32 since there is not an i8 bsr.
18266     OpVT = MVT::i32;
18267     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18268   }
18269
18270   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18271   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18272   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18273
18274   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18275   SDValue Ops[] = {
18276     Op,
18277     DAG.getConstant(NumBits+NumBits-1, OpVT),
18278     DAG.getConstant(X86::COND_E, MVT::i8),
18279     Op.getValue(1)
18280   };
18281   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18282
18283   // Finally xor with NumBits-1.
18284   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18285
18286   if (VT == MVT::i8)
18287     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18288   return Op;
18289 }
18290
18291 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18292   MVT VT = Op.getSimpleValueType();
18293   EVT OpVT = VT;
18294   unsigned NumBits = VT.getSizeInBits();
18295   SDLoc dl(Op);
18296
18297   Op = Op.getOperand(0);
18298   if (VT == MVT::i8) {
18299     // Zero extend to i32 since there is not an i8 bsr.
18300     OpVT = MVT::i32;
18301     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18302   }
18303
18304   // Issue a bsr (scan bits in reverse).
18305   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18306   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18307
18308   // And xor with NumBits-1.
18309   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18310
18311   if (VT == MVT::i8)
18312     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18313   return Op;
18314 }
18315
18316 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18317   MVT VT = Op.getSimpleValueType();
18318   unsigned NumBits = VT.getSizeInBits();
18319   SDLoc dl(Op);
18320   Op = Op.getOperand(0);
18321
18322   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18323   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18324   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18325
18326   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18327   SDValue Ops[] = {
18328     Op,
18329     DAG.getConstant(NumBits, VT),
18330     DAG.getConstant(X86::COND_E, MVT::i8),
18331     Op.getValue(1)
18332   };
18333   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18334 }
18335
18336 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18337 // ones, and then concatenate the result back.
18338 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18339   MVT VT = Op.getSimpleValueType();
18340
18341   assert(VT.is256BitVector() && VT.isInteger() &&
18342          "Unsupported value type for operation");
18343
18344   unsigned NumElems = VT.getVectorNumElements();
18345   SDLoc dl(Op);
18346
18347   // Extract the LHS vectors
18348   SDValue LHS = Op.getOperand(0);
18349   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18350   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18351
18352   // Extract the RHS vectors
18353   SDValue RHS = Op.getOperand(1);
18354   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18355   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18356
18357   MVT EltVT = VT.getVectorElementType();
18358   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18359
18360   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18361                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18362                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18363 }
18364
18365 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18366   assert(Op.getSimpleValueType().is256BitVector() &&
18367          Op.getSimpleValueType().isInteger() &&
18368          "Only handle AVX 256-bit vector integer operation");
18369   return Lower256IntArith(Op, DAG);
18370 }
18371
18372 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18373   assert(Op.getSimpleValueType().is256BitVector() &&
18374          Op.getSimpleValueType().isInteger() &&
18375          "Only handle AVX 256-bit vector integer operation");
18376   return Lower256IntArith(Op, DAG);
18377 }
18378
18379 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18380                         SelectionDAG &DAG) {
18381   SDLoc dl(Op);
18382   MVT VT = Op.getSimpleValueType();
18383
18384   // Decompose 256-bit ops into smaller 128-bit ops.
18385   if (VT.is256BitVector() && !Subtarget->hasInt256())
18386     return Lower256IntArith(Op, DAG);
18387
18388   SDValue A = Op.getOperand(0);
18389   SDValue B = Op.getOperand(1);
18390
18391   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18392   if (VT == MVT::v4i32) {
18393     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18394            "Should not custom lower when pmuldq is available!");
18395
18396     // Extract the odd parts.
18397     static const int UnpackMask[] = { 1, -1, 3, -1 };
18398     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18399     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18400
18401     // Multiply the even parts.
18402     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18403     // Now multiply odd parts.
18404     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18405
18406     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18407     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18408
18409     // Merge the two vectors back together with a shuffle. This expands into 2
18410     // shuffles.
18411     static const int ShufMask[] = { 0, 4, 2, 6 };
18412     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18413   }
18414
18415   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18416          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18417
18418   //  Ahi = psrlqi(a, 32);
18419   //  Bhi = psrlqi(b, 32);
18420   //
18421   //  AloBlo = pmuludq(a, b);
18422   //  AloBhi = pmuludq(a, Bhi);
18423   //  AhiBlo = pmuludq(Ahi, b);
18424
18425   //  AloBhi = psllqi(AloBhi, 32);
18426   //  AhiBlo = psllqi(AhiBlo, 32);
18427   //  return AloBlo + AloBhi + AhiBlo;
18428
18429   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18430   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18431
18432   // Bit cast to 32-bit vectors for MULUDQ
18433   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18434                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18435   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18436   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18437   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18438   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18439
18440   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18441   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18442   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18443
18444   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18445   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18446
18447   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18448   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18449 }
18450
18451 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18452   assert(Subtarget->isTargetWin64() && "Unexpected target");
18453   EVT VT = Op.getValueType();
18454   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18455          "Unexpected return type for lowering");
18456
18457   RTLIB::Libcall LC;
18458   bool isSigned;
18459   switch (Op->getOpcode()) {
18460   default: llvm_unreachable("Unexpected request for libcall!");
18461   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18462   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18463   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18464   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18465   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18466   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18467   }
18468
18469   SDLoc dl(Op);
18470   SDValue InChain = DAG.getEntryNode();
18471
18472   TargetLowering::ArgListTy Args;
18473   TargetLowering::ArgListEntry Entry;
18474   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18475     EVT ArgVT = Op->getOperand(i).getValueType();
18476     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18477            "Unexpected argument type for lowering");
18478     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18479     Entry.Node = StackPtr;
18480     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18481                            false, false, 16);
18482     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18483     Entry.Ty = PointerType::get(ArgTy,0);
18484     Entry.isSExt = false;
18485     Entry.isZExt = false;
18486     Args.push_back(Entry);
18487   }
18488
18489   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18490                                          getPointerTy());
18491
18492   TargetLowering::CallLoweringInfo CLI(DAG);
18493   CLI.setDebugLoc(dl).setChain(InChain)
18494     .setCallee(getLibcallCallingConv(LC),
18495                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18496                Callee, std::move(Args), 0)
18497     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18498
18499   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18500   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18501 }
18502
18503 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18504                              SelectionDAG &DAG) {
18505   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18506   EVT VT = Op0.getValueType();
18507   SDLoc dl(Op);
18508
18509   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18510          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18511
18512   // PMULxD operations multiply each even value (starting at 0) of LHS with
18513   // the related value of RHS and produce a widen result.
18514   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18515   // => <2 x i64> <ae|cg>
18516   //
18517   // In other word, to have all the results, we need to perform two PMULxD:
18518   // 1. one with the even values.
18519   // 2. one with the odd values.
18520   // To achieve #2, with need to place the odd values at an even position.
18521   //
18522   // Place the odd value at an even position (basically, shift all values 1
18523   // step to the left):
18524   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18525   // <a|b|c|d> => <b|undef|d|undef>
18526   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18527   // <e|f|g|h> => <f|undef|h|undef>
18528   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18529
18530   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18531   // ints.
18532   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18533   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18534   unsigned Opcode =
18535       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18536   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18537   // => <2 x i64> <ae|cg>
18538   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18539                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18540   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18541   // => <2 x i64> <bf|dh>
18542   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18543                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18544
18545   // Shuffle it back into the right order.
18546   SDValue Highs, Lows;
18547   if (VT == MVT::v8i32) {
18548     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18549     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18550     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18551     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18552   } else {
18553     const int HighMask[] = {1, 5, 3, 7};
18554     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18555     const int LowMask[] = {0, 4, 2, 6};
18556     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18557   }
18558
18559   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18560   // unsigned multiply.
18561   if (IsSigned && !Subtarget->hasSSE41()) {
18562     SDValue ShAmt =
18563         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18564     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18565                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18566     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18567                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18568
18569     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18570     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18571   }
18572
18573   // The first result of MUL_LOHI is actually the low value, followed by the
18574   // high value.
18575   SDValue Ops[] = {Lows, Highs};
18576   return DAG.getMergeValues(Ops, dl);
18577 }
18578
18579 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18580                                          const X86Subtarget *Subtarget) {
18581   MVT VT = Op.getSimpleValueType();
18582   SDLoc dl(Op);
18583   SDValue R = Op.getOperand(0);
18584   SDValue Amt = Op.getOperand(1);
18585
18586   // Optimize shl/srl/sra with constant shift amount.
18587   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18588     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18589       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18590
18591       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18592           (Subtarget->hasInt256() &&
18593            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18594           (Subtarget->hasAVX512() &&
18595            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18596         if (Op.getOpcode() == ISD::SHL)
18597           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18598                                             DAG);
18599         if (Op.getOpcode() == ISD::SRL)
18600           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18601                                             DAG);
18602         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18603           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18604                                             DAG);
18605       }
18606
18607       if (VT == MVT::v16i8) {
18608         if (Op.getOpcode() == ISD::SHL) {
18609           // Make a large shift.
18610           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18611                                                    MVT::v8i16, R, ShiftAmt,
18612                                                    DAG);
18613           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18614           // Zero out the rightmost bits.
18615           SmallVector<SDValue, 16> V(16,
18616                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18617                                                      MVT::i8));
18618           return DAG.getNode(ISD::AND, dl, VT, SHL,
18619                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18620         }
18621         if (Op.getOpcode() == ISD::SRL) {
18622           // Make a large shift.
18623           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18624                                                    MVT::v8i16, R, ShiftAmt,
18625                                                    DAG);
18626           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18627           // Zero out the leftmost bits.
18628           SmallVector<SDValue, 16> V(16,
18629                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18630                                                      MVT::i8));
18631           return DAG.getNode(ISD::AND, dl, VT, SRL,
18632                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18633         }
18634         if (Op.getOpcode() == ISD::SRA) {
18635           if (ShiftAmt == 7) {
18636             // R s>> 7  ===  R s< 0
18637             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18638             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18639           }
18640
18641           // R s>> a === ((R u>> a) ^ m) - m
18642           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18643           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18644                                                          MVT::i8));
18645           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18646           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18647           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18648           return Res;
18649         }
18650         llvm_unreachable("Unknown shift opcode.");
18651       }
18652
18653       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18654         if (Op.getOpcode() == ISD::SHL) {
18655           // Make a large shift.
18656           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18657                                                    MVT::v16i16, R, ShiftAmt,
18658                                                    DAG);
18659           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18660           // Zero out the rightmost bits.
18661           SmallVector<SDValue, 32> V(32,
18662                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18663                                                      MVT::i8));
18664           return DAG.getNode(ISD::AND, dl, VT, SHL,
18665                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18666         }
18667         if (Op.getOpcode() == ISD::SRL) {
18668           // Make a large shift.
18669           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18670                                                    MVT::v16i16, R, ShiftAmt,
18671                                                    DAG);
18672           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18673           // Zero out the leftmost bits.
18674           SmallVector<SDValue, 32> V(32,
18675                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18676                                                      MVT::i8));
18677           return DAG.getNode(ISD::AND, dl, VT, SRL,
18678                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18679         }
18680         if (Op.getOpcode() == ISD::SRA) {
18681           if (ShiftAmt == 7) {
18682             // R s>> 7  ===  R s< 0
18683             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18684             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18685           }
18686
18687           // R s>> a === ((R u>> a) ^ m) - m
18688           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18689           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18690                                                          MVT::i8));
18691           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18692           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18693           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18694           return Res;
18695         }
18696         llvm_unreachable("Unknown shift opcode.");
18697       }
18698     }
18699   }
18700
18701   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18702   if (!Subtarget->is64Bit() &&
18703       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18704       Amt.getOpcode() == ISD::BITCAST &&
18705       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18706     Amt = Amt.getOperand(0);
18707     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18708                      VT.getVectorNumElements();
18709     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18710     uint64_t ShiftAmt = 0;
18711     for (unsigned i = 0; i != Ratio; ++i) {
18712       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18713       if (!C)
18714         return SDValue();
18715       // 6 == Log2(64)
18716       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18717     }
18718     // Check remaining shift amounts.
18719     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18720       uint64_t ShAmt = 0;
18721       for (unsigned j = 0; j != Ratio; ++j) {
18722         ConstantSDNode *C =
18723           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18724         if (!C)
18725           return SDValue();
18726         // 6 == Log2(64)
18727         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18728       }
18729       if (ShAmt != ShiftAmt)
18730         return SDValue();
18731     }
18732     switch (Op.getOpcode()) {
18733     default:
18734       llvm_unreachable("Unknown shift opcode!");
18735     case ISD::SHL:
18736       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18737                                         DAG);
18738     case ISD::SRL:
18739       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18740                                         DAG);
18741     case ISD::SRA:
18742       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18743                                         DAG);
18744     }
18745   }
18746
18747   return SDValue();
18748 }
18749
18750 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18751                                         const X86Subtarget* Subtarget) {
18752   MVT VT = Op.getSimpleValueType();
18753   SDLoc dl(Op);
18754   SDValue R = Op.getOperand(0);
18755   SDValue Amt = Op.getOperand(1);
18756
18757   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18758       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18759       (Subtarget->hasInt256() &&
18760        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18761         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18762        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18763     SDValue BaseShAmt;
18764     EVT EltVT = VT.getVectorElementType();
18765
18766     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18767       // Check if this build_vector node is doing a splat.
18768       // If so, then set BaseShAmt equal to the splat value.
18769       BaseShAmt = BV->getSplatValue();
18770       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18771         BaseShAmt = SDValue();
18772     } else {
18773       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18774         Amt = Amt.getOperand(0);
18775
18776       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18777       if (SVN && SVN->isSplat()) {
18778         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18779         SDValue InVec = Amt.getOperand(0);
18780         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18781           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18782                  "Unexpected shuffle index found!");
18783           BaseShAmt = InVec.getOperand(SplatIdx);
18784         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18785            if (ConstantSDNode *C =
18786                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18787              if (C->getZExtValue() == SplatIdx)
18788                BaseShAmt = InVec.getOperand(1);
18789            }
18790         }
18791
18792         if (!BaseShAmt)
18793           // Avoid introducing an extract element from a shuffle.
18794           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18795                                     DAG.getIntPtrConstant(SplatIdx));
18796       }
18797     }
18798
18799     if (BaseShAmt.getNode()) {
18800       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18801       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18802         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18803       else if (EltVT.bitsLT(MVT::i32))
18804         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18805
18806       switch (Op.getOpcode()) {
18807       default:
18808         llvm_unreachable("Unknown shift opcode!");
18809       case ISD::SHL:
18810         switch (VT.SimpleTy) {
18811         default: return SDValue();
18812         case MVT::v2i64:
18813         case MVT::v4i32:
18814         case MVT::v8i16:
18815         case MVT::v4i64:
18816         case MVT::v8i32:
18817         case MVT::v16i16:
18818         case MVT::v16i32:
18819         case MVT::v8i64:
18820           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18821         }
18822       case ISD::SRA:
18823         switch (VT.SimpleTy) {
18824         default: return SDValue();
18825         case MVT::v4i32:
18826         case MVT::v8i16:
18827         case MVT::v8i32:
18828         case MVT::v16i16:
18829         case MVT::v16i32:
18830         case MVT::v8i64:
18831           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18832         }
18833       case ISD::SRL:
18834         switch (VT.SimpleTy) {
18835         default: return SDValue();
18836         case MVT::v2i64:
18837         case MVT::v4i32:
18838         case MVT::v8i16:
18839         case MVT::v4i64:
18840         case MVT::v8i32:
18841         case MVT::v16i16:
18842         case MVT::v16i32:
18843         case MVT::v8i64:
18844           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18845         }
18846       }
18847     }
18848   }
18849
18850   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18851   if (!Subtarget->is64Bit() &&
18852       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18853       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18854       Amt.getOpcode() == ISD::BITCAST &&
18855       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18856     Amt = Amt.getOperand(0);
18857     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18858                      VT.getVectorNumElements();
18859     std::vector<SDValue> Vals(Ratio);
18860     for (unsigned i = 0; i != Ratio; ++i)
18861       Vals[i] = Amt.getOperand(i);
18862     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18863       for (unsigned j = 0; j != Ratio; ++j)
18864         if (Vals[j] != Amt.getOperand(i + j))
18865           return SDValue();
18866     }
18867     switch (Op.getOpcode()) {
18868     default:
18869       llvm_unreachable("Unknown shift opcode!");
18870     case ISD::SHL:
18871       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18872     case ISD::SRL:
18873       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18874     case ISD::SRA:
18875       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18876     }
18877   }
18878
18879   return SDValue();
18880 }
18881
18882 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18883                           SelectionDAG &DAG) {
18884   MVT VT = Op.getSimpleValueType();
18885   SDLoc dl(Op);
18886   SDValue R = Op.getOperand(0);
18887   SDValue Amt = Op.getOperand(1);
18888   SDValue V;
18889
18890   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18891   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18892
18893   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18894   if (V.getNode())
18895     return V;
18896
18897   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18898   if (V.getNode())
18899       return V;
18900
18901   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18902     return Op;
18903   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18904   if (Subtarget->hasInt256()) {
18905     if (Op.getOpcode() == ISD::SRL &&
18906         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18907          VT == MVT::v4i64 || VT == MVT::v8i32))
18908       return Op;
18909     if (Op.getOpcode() == ISD::SHL &&
18910         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18911          VT == MVT::v4i64 || VT == MVT::v8i32))
18912       return Op;
18913     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18914       return Op;
18915   }
18916
18917   // If possible, lower this packed shift into a vector multiply instead of
18918   // expanding it into a sequence of scalar shifts.
18919   // Do this only if the vector shift count is a constant build_vector.
18920   if (Op.getOpcode() == ISD::SHL &&
18921       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18922        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18923       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18924     SmallVector<SDValue, 8> Elts;
18925     EVT SVT = VT.getScalarType();
18926     unsigned SVTBits = SVT.getSizeInBits();
18927     const APInt &One = APInt(SVTBits, 1);
18928     unsigned NumElems = VT.getVectorNumElements();
18929
18930     for (unsigned i=0; i !=NumElems; ++i) {
18931       SDValue Op = Amt->getOperand(i);
18932       if (Op->getOpcode() == ISD::UNDEF) {
18933         Elts.push_back(Op);
18934         continue;
18935       }
18936
18937       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18938       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18939       uint64_t ShAmt = C.getZExtValue();
18940       if (ShAmt >= SVTBits) {
18941         Elts.push_back(DAG.getUNDEF(SVT));
18942         continue;
18943       }
18944       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18945     }
18946     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18947     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18948   }
18949
18950   // Lower SHL with variable shift amount.
18951   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18952     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18953
18954     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18955     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18956     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18957     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18958   }
18959
18960   // If possible, lower this shift as a sequence of two shifts by
18961   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18962   // Example:
18963   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18964   //
18965   // Could be rewritten as:
18966   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18967   //
18968   // The advantage is that the two shifts from the example would be
18969   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18970   // the vector shift into four scalar shifts plus four pairs of vector
18971   // insert/extract.
18972   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18973       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18974     unsigned TargetOpcode = X86ISD::MOVSS;
18975     bool CanBeSimplified;
18976     // The splat value for the first packed shift (the 'X' from the example).
18977     SDValue Amt1 = Amt->getOperand(0);
18978     // The splat value for the second packed shift (the 'Y' from the example).
18979     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18980                                         Amt->getOperand(2);
18981
18982     // See if it is possible to replace this node with a sequence of
18983     // two shifts followed by a MOVSS/MOVSD
18984     if (VT == MVT::v4i32) {
18985       // Check if it is legal to use a MOVSS.
18986       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18987                         Amt2 == Amt->getOperand(3);
18988       if (!CanBeSimplified) {
18989         // Otherwise, check if we can still simplify this node using a MOVSD.
18990         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18991                           Amt->getOperand(2) == Amt->getOperand(3);
18992         TargetOpcode = X86ISD::MOVSD;
18993         Amt2 = Amt->getOperand(2);
18994       }
18995     } else {
18996       // Do similar checks for the case where the machine value type
18997       // is MVT::v8i16.
18998       CanBeSimplified = Amt1 == Amt->getOperand(1);
18999       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
19000         CanBeSimplified = Amt2 == Amt->getOperand(i);
19001
19002       if (!CanBeSimplified) {
19003         TargetOpcode = X86ISD::MOVSD;
19004         CanBeSimplified = true;
19005         Amt2 = Amt->getOperand(4);
19006         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
19007           CanBeSimplified = Amt1 == Amt->getOperand(i);
19008         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
19009           CanBeSimplified = Amt2 == Amt->getOperand(j);
19010       }
19011     }
19012
19013     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
19014         isa<ConstantSDNode>(Amt2)) {
19015       // Replace this node with two shifts followed by a MOVSS/MOVSD.
19016       EVT CastVT = MVT::v4i32;
19017       SDValue Splat1 =
19018         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
19019       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
19020       SDValue Splat2 =
19021         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
19022       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
19023       if (TargetOpcode == X86ISD::MOVSD)
19024         CastVT = MVT::v2i64;
19025       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
19026       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
19027       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
19028                                             BitCast1, DAG);
19029       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
19030     }
19031   }
19032
19033   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
19034     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
19035
19036     // a = a << 5;
19037     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
19038     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
19039
19040     // Turn 'a' into a mask suitable for VSELECT
19041     SDValue VSelM = DAG.getConstant(0x80, VT);
19042     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19043     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19044
19045     SDValue CM1 = DAG.getConstant(0x0f, VT);
19046     SDValue CM2 = DAG.getConstant(0x3f, VT);
19047
19048     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
19049     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
19050     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
19051     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19052     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19053
19054     // a += a
19055     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19056     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19057     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19058
19059     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
19060     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
19061     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
19062     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
19063     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
19064
19065     // a += a
19066     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
19067     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
19068     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
19069
19070     // return VSELECT(r, r+r, a);
19071     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
19072                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
19073     return R;
19074   }
19075
19076   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
19077   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
19078   // solution better.
19079   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
19080     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
19081     unsigned ExtOpc =
19082         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
19083     R = DAG.getNode(ExtOpc, dl, NewVT, R);
19084     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
19085     return DAG.getNode(ISD::TRUNCATE, dl, VT,
19086                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
19087     }
19088
19089   // Decompose 256-bit shifts into smaller 128-bit shifts.
19090   if (VT.is256BitVector()) {
19091     unsigned NumElems = VT.getVectorNumElements();
19092     MVT EltVT = VT.getVectorElementType();
19093     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19094
19095     // Extract the two vectors
19096     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
19097     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
19098
19099     // Recreate the shift amount vectors
19100     SDValue Amt1, Amt2;
19101     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
19102       // Constant shift amount
19103       SmallVector<SDValue, 4> Amt1Csts;
19104       SmallVector<SDValue, 4> Amt2Csts;
19105       for (unsigned i = 0; i != NumElems/2; ++i)
19106         Amt1Csts.push_back(Amt->getOperand(i));
19107       for (unsigned i = NumElems/2; i != NumElems; ++i)
19108         Amt2Csts.push_back(Amt->getOperand(i));
19109
19110       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
19111       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
19112     } else {
19113       // Variable shift amount
19114       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
19115       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
19116     }
19117
19118     // Issue new vector shifts for the smaller types
19119     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
19120     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
19121
19122     // Concatenate the result back
19123     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
19124   }
19125
19126   return SDValue();
19127 }
19128
19129 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
19130   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
19131   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
19132   // looks for this combo and may remove the "setcc" instruction if the "setcc"
19133   // has only one use.
19134   SDNode *N = Op.getNode();
19135   SDValue LHS = N->getOperand(0);
19136   SDValue RHS = N->getOperand(1);
19137   unsigned BaseOp = 0;
19138   unsigned Cond = 0;
19139   SDLoc DL(Op);
19140   switch (Op.getOpcode()) {
19141   default: llvm_unreachable("Unknown ovf instruction!");
19142   case ISD::SADDO:
19143     // A subtract of one will be selected as a INC. Note that INC doesn't
19144     // set CF, so we can't do this for UADDO.
19145     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19146       if (C->isOne()) {
19147         BaseOp = X86ISD::INC;
19148         Cond = X86::COND_O;
19149         break;
19150       }
19151     BaseOp = X86ISD::ADD;
19152     Cond = X86::COND_O;
19153     break;
19154   case ISD::UADDO:
19155     BaseOp = X86ISD::ADD;
19156     Cond = X86::COND_B;
19157     break;
19158   case ISD::SSUBO:
19159     // A subtract of one will be selected as a DEC. Note that DEC doesn't
19160     // set CF, so we can't do this for USUBO.
19161     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
19162       if (C->isOne()) {
19163         BaseOp = X86ISD::DEC;
19164         Cond = X86::COND_O;
19165         break;
19166       }
19167     BaseOp = X86ISD::SUB;
19168     Cond = X86::COND_O;
19169     break;
19170   case ISD::USUBO:
19171     BaseOp = X86ISD::SUB;
19172     Cond = X86::COND_B;
19173     break;
19174   case ISD::SMULO:
19175     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19176     Cond = X86::COND_O;
19177     break;
19178   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19179     if (N->getValueType(0) == MVT::i8) {
19180       BaseOp = X86ISD::UMUL8;
19181       Cond = X86::COND_O;
19182       break;
19183     }
19184     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19185                                  MVT::i32);
19186     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19187
19188     SDValue SetCC =
19189       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19190                   DAG.getConstant(X86::COND_O, MVT::i32),
19191                   SDValue(Sum.getNode(), 2));
19192
19193     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19194   }
19195   }
19196
19197   // Also sets EFLAGS.
19198   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19199   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19200
19201   SDValue SetCC =
19202     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19203                 DAG.getConstant(Cond, MVT::i32),
19204                 SDValue(Sum.getNode(), 1));
19205
19206   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19207 }
19208
19209 // Sign extension of the low part of vector elements. This may be used either
19210 // when sign extend instructions are not available or if the vector element
19211 // sizes already match the sign-extended size. If the vector elements are in
19212 // their pre-extended size and sign extend instructions are available, that will
19213 // be handled by LowerSIGN_EXTEND.
19214 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19215                                                   SelectionDAG &DAG) const {
19216   SDLoc dl(Op);
19217   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19218   MVT VT = Op.getSimpleValueType();
19219
19220   if (!Subtarget->hasSSE2() || !VT.isVector())
19221     return SDValue();
19222
19223   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19224                       ExtraVT.getScalarType().getSizeInBits();
19225
19226   switch (VT.SimpleTy) {
19227     default: return SDValue();
19228     case MVT::v8i32:
19229     case MVT::v16i16:
19230       if (!Subtarget->hasFp256())
19231         return SDValue();
19232       if (!Subtarget->hasInt256()) {
19233         // needs to be split
19234         unsigned NumElems = VT.getVectorNumElements();
19235
19236         // Extract the LHS vectors
19237         SDValue LHS = Op.getOperand(0);
19238         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19239         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19240
19241         MVT EltVT = VT.getVectorElementType();
19242         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19243
19244         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19245         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19246         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19247                                    ExtraNumElems/2);
19248         SDValue Extra = DAG.getValueType(ExtraVT);
19249
19250         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19251         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19252
19253         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19254       }
19255       // fall through
19256     case MVT::v4i32:
19257     case MVT::v8i16: {
19258       SDValue Op0 = Op.getOperand(0);
19259
19260       // This is a sign extension of some low part of vector elements without
19261       // changing the size of the vector elements themselves:
19262       // Shift-Left + Shift-Right-Algebraic.
19263       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19264                                                BitsDiff, DAG);
19265       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19266                                         DAG);
19267     }
19268   }
19269 }
19270
19271 /// Returns true if the operand type is exactly twice the native width, and
19272 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19273 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19274 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19275 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19276   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19277
19278   if (OpWidth == 64)
19279     return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19280   else if (OpWidth == 128)
19281     return Subtarget->hasCmpxchg16b();
19282   else
19283     return false;
19284 }
19285
19286 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19287   return needsCmpXchgNb(SI->getValueOperand()->getType());
19288 }
19289
19290 // Note: this turns large loads into lock cmpxchg8b/16b.
19291 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19292 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19293   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19294   return needsCmpXchgNb(PTy->getElementType());
19295 }
19296
19297 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19298   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19299   const Type *MemType = AI->getType();
19300
19301   // If the operand is too big, we must see if cmpxchg8/16b is available
19302   // and default to library calls otherwise.
19303   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19304     return needsCmpXchgNb(MemType);
19305
19306   AtomicRMWInst::BinOp Op = AI->getOperation();
19307   switch (Op) {
19308   default:
19309     llvm_unreachable("Unknown atomic operation");
19310   case AtomicRMWInst::Xchg:
19311   case AtomicRMWInst::Add:
19312   case AtomicRMWInst::Sub:
19313     // It's better to use xadd, xsub or xchg for these in all cases.
19314     return false;
19315   case AtomicRMWInst::Or:
19316   case AtomicRMWInst::And:
19317   case AtomicRMWInst::Xor:
19318     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19319     // prefix to a normal instruction for these operations.
19320     return !AI->use_empty();
19321   case AtomicRMWInst::Nand:
19322   case AtomicRMWInst::Max:
19323   case AtomicRMWInst::Min:
19324   case AtomicRMWInst::UMax:
19325   case AtomicRMWInst::UMin:
19326     // These always require a non-trivial set of data operations on x86. We must
19327     // use a cmpxchg loop.
19328     return true;
19329   }
19330 }
19331
19332 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19333   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19334   // no-sse2). There isn't any reason to disable it if the target processor
19335   // supports it.
19336   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19337 }
19338
19339 LoadInst *
19340 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19341   unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
19342   const Type *MemType = AI->getType();
19343   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19344   // there is no benefit in turning such RMWs into loads, and it is actually
19345   // harmful as it introduces a mfence.
19346   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19347     return nullptr;
19348
19349   auto Builder = IRBuilder<>(AI);
19350   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19351   auto SynchScope = AI->getSynchScope();
19352   // We must restrict the ordering to avoid generating loads with Release or
19353   // ReleaseAcquire orderings.
19354   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19355   auto Ptr = AI->getPointerOperand();
19356
19357   // Before the load we need a fence. Here is an example lifted from
19358   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19359   // is required:
19360   // Thread 0:
19361   //   x.store(1, relaxed);
19362   //   r1 = y.fetch_add(0, release);
19363   // Thread 1:
19364   //   y.fetch_add(42, acquire);
19365   //   r2 = x.load(relaxed);
19366   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19367   // lowered to just a load without a fence. A mfence flushes the store buffer,
19368   // making the optimization clearly correct.
19369   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19370   // otherwise, we might be able to be more agressive on relaxed idempotent
19371   // rmw. In practice, they do not look useful, so we don't try to be
19372   // especially clever.
19373   if (SynchScope == SingleThread) {
19374     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19375     // the IR level, so we must wrap it in an intrinsic.
19376     return nullptr;
19377   } else if (hasMFENCE(*Subtarget)) {
19378     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19379             Intrinsic::x86_sse2_mfence);
19380     Builder.CreateCall(MFence);
19381   } else {
19382     // FIXME: it might make sense to use a locked operation here but on a
19383     // different cache-line to prevent cache-line bouncing. In practice it
19384     // is probably a small win, and x86 processors without mfence are rare
19385     // enough that we do not bother.
19386     return nullptr;
19387   }
19388
19389   // Finally we can emit the atomic load.
19390   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19391           AI->getType()->getPrimitiveSizeInBits());
19392   Loaded->setAtomic(Order, SynchScope);
19393   AI->replaceAllUsesWith(Loaded);
19394   AI->eraseFromParent();
19395   return Loaded;
19396 }
19397
19398 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19399                                  SelectionDAG &DAG) {
19400   SDLoc dl(Op);
19401   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19402     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19403   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19404     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19405
19406   // The only fence that needs an instruction is a sequentially-consistent
19407   // cross-thread fence.
19408   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19409     if (hasMFENCE(*Subtarget))
19410       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19411
19412     SDValue Chain = Op.getOperand(0);
19413     SDValue Zero = DAG.getConstant(0, MVT::i32);
19414     SDValue Ops[] = {
19415       DAG.getRegister(X86::ESP, MVT::i32), // Base
19416       DAG.getTargetConstant(1, MVT::i8),   // Scale
19417       DAG.getRegister(0, MVT::i32),        // Index
19418       DAG.getTargetConstant(0, MVT::i32),  // Disp
19419       DAG.getRegister(0, MVT::i32),        // Segment.
19420       Zero,
19421       Chain
19422     };
19423     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19424     return SDValue(Res, 0);
19425   }
19426
19427   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19428   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19429 }
19430
19431 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19432                              SelectionDAG &DAG) {
19433   MVT T = Op.getSimpleValueType();
19434   SDLoc DL(Op);
19435   unsigned Reg = 0;
19436   unsigned size = 0;
19437   switch(T.SimpleTy) {
19438   default: llvm_unreachable("Invalid value type!");
19439   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19440   case MVT::i16: Reg = X86::AX;  size = 2; break;
19441   case MVT::i32: Reg = X86::EAX; size = 4; break;
19442   case MVT::i64:
19443     assert(Subtarget->is64Bit() && "Node not type legal!");
19444     Reg = X86::RAX; size = 8;
19445     break;
19446   }
19447   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19448                                   Op.getOperand(2), SDValue());
19449   SDValue Ops[] = { cpIn.getValue(0),
19450                     Op.getOperand(1),
19451                     Op.getOperand(3),
19452                     DAG.getTargetConstant(size, MVT::i8),
19453                     cpIn.getValue(1) };
19454   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19455   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19456   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19457                                            Ops, T, MMO);
19458
19459   SDValue cpOut =
19460     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19461   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19462                                       MVT::i32, cpOut.getValue(2));
19463   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19464                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19465
19466   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19467   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19468   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19469   return SDValue();
19470 }
19471
19472 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19473                             SelectionDAG &DAG) {
19474   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19475   MVT DstVT = Op.getSimpleValueType();
19476
19477   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19478     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19479     if (DstVT != MVT::f64)
19480       // This conversion needs to be expanded.
19481       return SDValue();
19482
19483     SDValue InVec = Op->getOperand(0);
19484     SDLoc dl(Op);
19485     unsigned NumElts = SrcVT.getVectorNumElements();
19486     EVT SVT = SrcVT.getVectorElementType();
19487
19488     // Widen the vector in input in the case of MVT::v2i32.
19489     // Example: from MVT::v2i32 to MVT::v4i32.
19490     SmallVector<SDValue, 16> Elts;
19491     for (unsigned i = 0, e = NumElts; i != e; ++i)
19492       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19493                                  DAG.getIntPtrConstant(i)));
19494
19495     // Explicitly mark the extra elements as Undef.
19496     SDValue Undef = DAG.getUNDEF(SVT);
19497     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19498       Elts.push_back(Undef);
19499
19500     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19501     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19502     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19503     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19504                        DAG.getIntPtrConstant(0));
19505   }
19506
19507   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19508          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19509   assert((DstVT == MVT::i64 ||
19510           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19511          "Unexpected custom BITCAST");
19512   // i64 <=> MMX conversions are Legal.
19513   if (SrcVT==MVT::i64 && DstVT.isVector())
19514     return Op;
19515   if (DstVT==MVT::i64 && SrcVT.isVector())
19516     return Op;
19517   // MMX <=> MMX conversions are Legal.
19518   if (SrcVT.isVector() && DstVT.isVector())
19519     return Op;
19520   // All other conversions need to be expanded.
19521   return SDValue();
19522 }
19523
19524 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19525                           SelectionDAG &DAG) {
19526   SDNode *Node = Op.getNode();
19527   SDLoc dl(Node);
19528
19529   Op = Op.getOperand(0);
19530   EVT VT = Op.getValueType();
19531   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19532          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19533
19534   unsigned NumElts = VT.getVectorNumElements();
19535   EVT EltVT = VT.getVectorElementType();
19536   unsigned Len = EltVT.getSizeInBits();
19537
19538   // This is the vectorized version of the "best" algorithm from
19539   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19540   // with a minor tweak to use a series of adds + shifts instead of vector
19541   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19542   //
19543   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19544   //  v8i32 => Always profitable
19545   //
19546   // FIXME: There a couple of possible improvements:
19547   //
19548   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19549   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19550   //
19551   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19552          "CTPOP not implemented for this vector element type.");
19553
19554   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19555   // extra legalization.
19556   bool NeedsBitcast = EltVT == MVT::i32;
19557   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19558
19559   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19560   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19561   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19562
19563   // v = v - ((v >> 1) & 0x55555555...)
19564   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19565   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19566   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19567   if (NeedsBitcast)
19568     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19569
19570   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19571   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19572   if (NeedsBitcast)
19573     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19574
19575   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19576   if (VT != And.getValueType())
19577     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19578   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19579
19580   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19581   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19582   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19583   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19584   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19585
19586   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19587   if (NeedsBitcast) {
19588     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19589     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19590     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19591   }
19592
19593   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19594   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19595   if (VT != AndRHS.getValueType()) {
19596     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19597     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19598   }
19599   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19600
19601   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19602   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19603   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19604   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19605   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19606
19607   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19608   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19609   if (NeedsBitcast) {
19610     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19611     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19612   }
19613   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19614   if (VT != And.getValueType())
19615     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19616
19617   // The algorithm mentioned above uses:
19618   //    v = (v * 0x01010101...) >> (Len - 8)
19619   //
19620   // Change it to use vector adds + vector shifts which yield faster results on
19621   // Haswell than using vector integer multiplication.
19622   //
19623   // For i32 elements:
19624   //    v = v + (v >> 8)
19625   //    v = v + (v >> 16)
19626   //
19627   // For i64 elements:
19628   //    v = v + (v >> 8)
19629   //    v = v + (v >> 16)
19630   //    v = v + (v >> 32)
19631   //
19632   Add = And;
19633   SmallVector<SDValue, 8> Csts;
19634   for (unsigned i = 8; i <= Len/2; i *= 2) {
19635     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19636     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19637     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19638     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19639     Csts.clear();
19640   }
19641
19642   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19643   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19644   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19645   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19646   if (NeedsBitcast) {
19647     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19648     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19649   }
19650   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19651   if (VT != And.getValueType())
19652     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19653
19654   return And;
19655 }
19656
19657 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19658   SDNode *Node = Op.getNode();
19659   SDLoc dl(Node);
19660   EVT T = Node->getValueType(0);
19661   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19662                               DAG.getConstant(0, T), Node->getOperand(2));
19663   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19664                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19665                        Node->getOperand(0),
19666                        Node->getOperand(1), negOp,
19667                        cast<AtomicSDNode>(Node)->getMemOperand(),
19668                        cast<AtomicSDNode>(Node)->getOrdering(),
19669                        cast<AtomicSDNode>(Node)->getSynchScope());
19670 }
19671
19672 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19673   SDNode *Node = Op.getNode();
19674   SDLoc dl(Node);
19675   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19676
19677   // Convert seq_cst store -> xchg
19678   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19679   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19680   //        (The only way to get a 16-byte store is cmpxchg16b)
19681   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19682   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19683       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19684     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19685                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19686                                  Node->getOperand(0),
19687                                  Node->getOperand(1), Node->getOperand(2),
19688                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19689                                  cast<AtomicSDNode>(Node)->getOrdering(),
19690                                  cast<AtomicSDNode>(Node)->getSynchScope());
19691     return Swap.getValue(1);
19692   }
19693   // Other atomic stores have a simple pattern.
19694   return Op;
19695 }
19696
19697 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19698   EVT VT = Op.getNode()->getSimpleValueType(0);
19699
19700   // Let legalize expand this if it isn't a legal type yet.
19701   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19702     return SDValue();
19703
19704   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19705
19706   unsigned Opc;
19707   bool ExtraOp = false;
19708   switch (Op.getOpcode()) {
19709   default: llvm_unreachable("Invalid code");
19710   case ISD::ADDC: Opc = X86ISD::ADD; break;
19711   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19712   case ISD::SUBC: Opc = X86ISD::SUB; break;
19713   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19714   }
19715
19716   if (!ExtraOp)
19717     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19718                        Op.getOperand(1));
19719   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19720                      Op.getOperand(1), Op.getOperand(2));
19721 }
19722
19723 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19724                             SelectionDAG &DAG) {
19725   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19726
19727   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19728   // which returns the values as { float, float } (in XMM0) or
19729   // { double, double } (which is returned in XMM0, XMM1).
19730   SDLoc dl(Op);
19731   SDValue Arg = Op.getOperand(0);
19732   EVT ArgVT = Arg.getValueType();
19733   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19734
19735   TargetLowering::ArgListTy Args;
19736   TargetLowering::ArgListEntry Entry;
19737
19738   Entry.Node = Arg;
19739   Entry.Ty = ArgTy;
19740   Entry.isSExt = false;
19741   Entry.isZExt = false;
19742   Args.push_back(Entry);
19743
19744   bool isF64 = ArgVT == MVT::f64;
19745   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19746   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19747   // the results are returned via SRet in memory.
19748   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19749   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19750   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19751
19752   Type *RetTy = isF64
19753     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19754     : (Type*)VectorType::get(ArgTy, 4);
19755
19756   TargetLowering::CallLoweringInfo CLI(DAG);
19757   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19758     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19759
19760   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19761
19762   if (isF64)
19763     // Returned in xmm0 and xmm1.
19764     return CallResult.first;
19765
19766   // Returned in bits 0:31 and 32:64 xmm0.
19767   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19768                                CallResult.first, DAG.getIntPtrConstant(0));
19769   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19770                                CallResult.first, DAG.getIntPtrConstant(1));
19771   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19772   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19773 }
19774
19775 /// LowerOperation - Provide custom lowering hooks for some operations.
19776 ///
19777 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19778   switch (Op.getOpcode()) {
19779   default: llvm_unreachable("Should not custom lower this!");
19780   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19781   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19782   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19783     return LowerCMP_SWAP(Op, Subtarget, DAG);
19784   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19785   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19786   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19787   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19788   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19789   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19790   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19791   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19792   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19793   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19794   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19795   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19796   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19797   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19798   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19799   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19800   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19801   case ISD::SHL_PARTS:
19802   case ISD::SRA_PARTS:
19803   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19804   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19805   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19806   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19807   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19808   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19809   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19810   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19811   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19812   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19813   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19814   case ISD::FABS:
19815   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19816   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19817   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19818   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19819   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19820   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19821   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19822   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19823   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19824   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19825   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19826   case ISD::INTRINSIC_VOID:
19827   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19828   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19829   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19830   case ISD::FRAME_TO_ARGS_OFFSET:
19831                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19832   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19833   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19834   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19835   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19836   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19837   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19838   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19839   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19840   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19841   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19842   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19843   case ISD::UMUL_LOHI:
19844   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19845   case ISD::SRA:
19846   case ISD::SRL:
19847   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19848   case ISD::SADDO:
19849   case ISD::UADDO:
19850   case ISD::SSUBO:
19851   case ISD::USUBO:
19852   case ISD::SMULO:
19853   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19854   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19855   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19856   case ISD::ADDC:
19857   case ISD::ADDE:
19858   case ISD::SUBC:
19859   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19860   case ISD::ADD:                return LowerADD(Op, DAG);
19861   case ISD::SUB:                return LowerSUB(Op, DAG);
19862   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19863   }
19864 }
19865
19866 /// ReplaceNodeResults - Replace a node with an illegal result type
19867 /// with a new node built out of custom code.
19868 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19869                                            SmallVectorImpl<SDValue>&Results,
19870                                            SelectionDAG &DAG) const {
19871   SDLoc dl(N);
19872   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19873   switch (N->getOpcode()) {
19874   default:
19875     llvm_unreachable("Do not know how to custom type legalize this operation!");
19876   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19877   case X86ISD::FMINC:
19878   case X86ISD::FMIN:
19879   case X86ISD::FMAXC:
19880   case X86ISD::FMAX: {
19881     EVT VT = N->getValueType(0);
19882     if (VT != MVT::v2f32)
19883       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19884     SDValue UNDEF = DAG.getUNDEF(VT);
19885     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19886                               N->getOperand(0), UNDEF);
19887     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19888                               N->getOperand(1), UNDEF);
19889     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19890     return;
19891   }
19892   case ISD::SIGN_EXTEND_INREG:
19893   case ISD::ADDC:
19894   case ISD::ADDE:
19895   case ISD::SUBC:
19896   case ISD::SUBE:
19897     // We don't want to expand or promote these.
19898     return;
19899   case ISD::SDIV:
19900   case ISD::UDIV:
19901   case ISD::SREM:
19902   case ISD::UREM:
19903   case ISD::SDIVREM:
19904   case ISD::UDIVREM: {
19905     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19906     Results.push_back(V);
19907     return;
19908   }
19909   case ISD::FP_TO_SINT:
19910   case ISD::FP_TO_UINT: {
19911     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19912
19913     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19914       return;
19915
19916     std::pair<SDValue,SDValue> Vals =
19917         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19918     SDValue FIST = Vals.first, StackSlot = Vals.second;
19919     if (FIST.getNode()) {
19920       EVT VT = N->getValueType(0);
19921       // Return a load from the stack slot.
19922       if (StackSlot.getNode())
19923         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19924                                       MachinePointerInfo(),
19925                                       false, false, false, 0));
19926       else
19927         Results.push_back(FIST);
19928     }
19929     return;
19930   }
19931   case ISD::UINT_TO_FP: {
19932     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19933     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19934         N->getValueType(0) != MVT::v2f32)
19935       return;
19936     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19937                                  N->getOperand(0));
19938     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19939                                      MVT::f64);
19940     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19941     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19942                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19943     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19944     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19945     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19946     return;
19947   }
19948   case ISD::FP_ROUND: {
19949     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19950         return;
19951     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19952     Results.push_back(V);
19953     return;
19954   }
19955   case ISD::INTRINSIC_W_CHAIN: {
19956     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19957     switch (IntNo) {
19958     default : llvm_unreachable("Do not know how to custom type "
19959                                "legalize this intrinsic operation!");
19960     case Intrinsic::x86_rdtsc:
19961       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19962                                      Results);
19963     case Intrinsic::x86_rdtscp:
19964       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19965                                      Results);
19966     case Intrinsic::x86_rdpmc:
19967       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19968     }
19969   }
19970   case ISD::READCYCLECOUNTER: {
19971     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19972                                    Results);
19973   }
19974   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19975     EVT T = N->getValueType(0);
19976     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19977     bool Regs64bit = T == MVT::i128;
19978     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19979     SDValue cpInL, cpInH;
19980     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19981                         DAG.getConstant(0, HalfT));
19982     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19983                         DAG.getConstant(1, HalfT));
19984     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19985                              Regs64bit ? X86::RAX : X86::EAX,
19986                              cpInL, SDValue());
19987     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19988                              Regs64bit ? X86::RDX : X86::EDX,
19989                              cpInH, cpInL.getValue(1));
19990     SDValue swapInL, swapInH;
19991     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19992                           DAG.getConstant(0, HalfT));
19993     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19994                           DAG.getConstant(1, HalfT));
19995     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19996                                Regs64bit ? X86::RBX : X86::EBX,
19997                                swapInL, cpInH.getValue(1));
19998     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19999                                Regs64bit ? X86::RCX : X86::ECX,
20000                                swapInH, swapInL.getValue(1));
20001     SDValue Ops[] = { swapInH.getValue(0),
20002                       N->getOperand(1),
20003                       swapInH.getValue(1) };
20004     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20005     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
20006     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
20007                                   X86ISD::LCMPXCHG8_DAG;
20008     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
20009     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
20010                                         Regs64bit ? X86::RAX : X86::EAX,
20011                                         HalfT, Result.getValue(1));
20012     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
20013                                         Regs64bit ? X86::RDX : X86::EDX,
20014                                         HalfT, cpOutL.getValue(2));
20015     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
20016
20017     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
20018                                         MVT::i32, cpOutH.getValue(2));
20019     SDValue Success =
20020         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20021                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
20022     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
20023
20024     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
20025     Results.push_back(Success);
20026     Results.push_back(EFLAGS.getValue(1));
20027     return;
20028   }
20029   case ISD::ATOMIC_SWAP:
20030   case ISD::ATOMIC_LOAD_ADD:
20031   case ISD::ATOMIC_LOAD_SUB:
20032   case ISD::ATOMIC_LOAD_AND:
20033   case ISD::ATOMIC_LOAD_OR:
20034   case ISD::ATOMIC_LOAD_XOR:
20035   case ISD::ATOMIC_LOAD_NAND:
20036   case ISD::ATOMIC_LOAD_MIN:
20037   case ISD::ATOMIC_LOAD_MAX:
20038   case ISD::ATOMIC_LOAD_UMIN:
20039   case ISD::ATOMIC_LOAD_UMAX:
20040   case ISD::ATOMIC_LOAD: {
20041     // Delegate to generic TypeLegalization. Situations we can really handle
20042     // should have already been dealt with by AtomicExpandPass.cpp.
20043     break;
20044   }
20045   case ISD::BITCAST: {
20046     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
20047     EVT DstVT = N->getValueType(0);
20048     EVT SrcVT = N->getOperand(0)->getValueType(0);
20049
20050     if (SrcVT != MVT::f64 ||
20051         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
20052       return;
20053
20054     unsigned NumElts = DstVT.getVectorNumElements();
20055     EVT SVT = DstVT.getVectorElementType();
20056     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20057     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
20058                                    MVT::v2f64, N->getOperand(0));
20059     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
20060
20061     if (ExperimentalVectorWideningLegalization) {
20062       // If we are legalizing vectors by widening, we already have the desired
20063       // legal vector type, just return it.
20064       Results.push_back(ToVecInt);
20065       return;
20066     }
20067
20068     SmallVector<SDValue, 8> Elts;
20069     for (unsigned i = 0, e = NumElts; i != e; ++i)
20070       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
20071                                    ToVecInt, DAG.getIntPtrConstant(i)));
20072
20073     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
20074   }
20075   }
20076 }
20077
20078 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
20079   switch (Opcode) {
20080   default: return nullptr;
20081   case X86ISD::BSF:                return "X86ISD::BSF";
20082   case X86ISD::BSR:                return "X86ISD::BSR";
20083   case X86ISD::SHLD:               return "X86ISD::SHLD";
20084   case X86ISD::SHRD:               return "X86ISD::SHRD";
20085   case X86ISD::FAND:               return "X86ISD::FAND";
20086   case X86ISD::FANDN:              return "X86ISD::FANDN";
20087   case X86ISD::FOR:                return "X86ISD::FOR";
20088   case X86ISD::FXOR:               return "X86ISD::FXOR";
20089   case X86ISD::FSRL:               return "X86ISD::FSRL";
20090   case X86ISD::FILD:               return "X86ISD::FILD";
20091   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
20092   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
20093   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
20094   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
20095   case X86ISD::FLD:                return "X86ISD::FLD";
20096   case X86ISD::FST:                return "X86ISD::FST";
20097   case X86ISD::CALL:               return "X86ISD::CALL";
20098   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
20099   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
20100   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
20101   case X86ISD::BT:                 return "X86ISD::BT";
20102   case X86ISD::CMP:                return "X86ISD::CMP";
20103   case X86ISD::COMI:               return "X86ISD::COMI";
20104   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
20105   case X86ISD::CMPM:               return "X86ISD::CMPM";
20106   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
20107   case X86ISD::SETCC:              return "X86ISD::SETCC";
20108   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
20109   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
20110   case X86ISD::CMOV:               return "X86ISD::CMOV";
20111   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
20112   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
20113   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
20114   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
20115   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
20116   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
20117   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
20118   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
20119   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
20120   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
20121   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
20122   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
20123   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
20124   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
20125   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
20126   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
20127   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
20128   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
20129   case X86ISD::HADD:               return "X86ISD::HADD";
20130   case X86ISD::HSUB:               return "X86ISD::HSUB";
20131   case X86ISD::FHADD:              return "X86ISD::FHADD";
20132   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
20133   case X86ISD::UMAX:               return "X86ISD::UMAX";
20134   case X86ISD::UMIN:               return "X86ISD::UMIN";
20135   case X86ISD::SMAX:               return "X86ISD::SMAX";
20136   case X86ISD::SMIN:               return "X86ISD::SMIN";
20137   case X86ISD::FMAX:               return "X86ISD::FMAX";
20138   case X86ISD::FMIN:               return "X86ISD::FMIN";
20139   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
20140   case X86ISD::FMINC:              return "X86ISD::FMINC";
20141   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
20142   case X86ISD::FRCP:               return "X86ISD::FRCP";
20143   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
20144   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
20145   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
20146   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
20147   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
20148   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
20149   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
20150   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
20151   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
20152   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
20153   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
20154   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
20155   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
20156   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
20157   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
20158   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
20159   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20160   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20161   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20162   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20163   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20164   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20165   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20166   case X86ISD::VSHL:               return "X86ISD::VSHL";
20167   case X86ISD::VSRL:               return "X86ISD::VSRL";
20168   case X86ISD::VSRA:               return "X86ISD::VSRA";
20169   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20170   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20171   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20172   case X86ISD::CMPP:               return "X86ISD::CMPP";
20173   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20174   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20175   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20176   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20177   case X86ISD::ADD:                return "X86ISD::ADD";
20178   case X86ISD::SUB:                return "X86ISD::SUB";
20179   case X86ISD::ADC:                return "X86ISD::ADC";
20180   case X86ISD::SBB:                return "X86ISD::SBB";
20181   case X86ISD::SMUL:               return "X86ISD::SMUL";
20182   case X86ISD::UMUL:               return "X86ISD::UMUL";
20183   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20184   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20185   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20186   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20187   case X86ISD::INC:                return "X86ISD::INC";
20188   case X86ISD::DEC:                return "X86ISD::DEC";
20189   case X86ISD::OR:                 return "X86ISD::OR";
20190   case X86ISD::XOR:                return "X86ISD::XOR";
20191   case X86ISD::AND:                return "X86ISD::AND";
20192   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20193   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20194   case X86ISD::PTEST:              return "X86ISD::PTEST";
20195   case X86ISD::TESTP:              return "X86ISD::TESTP";
20196   case X86ISD::TESTM:              return "X86ISD::TESTM";
20197   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20198   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20199   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20200   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20201   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20202   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20203   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20204   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20205   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20206   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20207   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20208   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20209   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20210   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20211   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20212   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20213   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20214   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20215   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20216   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20217   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20218   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20219   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20220   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20221   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20222   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20223   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20224   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20225   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20226   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20227   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20228   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20229   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20230   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20231   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20232   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20233   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20234   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20235   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20236   case X86ISD::SAHF:               return "X86ISD::SAHF";
20237   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20238   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20239   case X86ISD::FMADD:              return "X86ISD::FMADD";
20240   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20241   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20242   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20243   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20244   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20245   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20246   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20247   case X86ISD::XTEST:              return "X86ISD::XTEST";
20248   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20249   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20250   case X86ISD::SELECT:             return "X86ISD::SELECT";
20251   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
20252   case X86ISD::RCP28:              return "X86ISD::RCP28";
20253   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
20254   }
20255 }
20256
20257 // isLegalAddressingMode - Return true if the addressing mode represented
20258 // by AM is legal for this target, for a load/store of the specified type.
20259 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20260                                               Type *Ty) const {
20261   // X86 supports extremely general addressing modes.
20262   CodeModel::Model M = getTargetMachine().getCodeModel();
20263   Reloc::Model R = getTargetMachine().getRelocationModel();
20264
20265   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20266   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20267     return false;
20268
20269   if (AM.BaseGV) {
20270     unsigned GVFlags =
20271       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20272
20273     // If a reference to this global requires an extra load, we can't fold it.
20274     if (isGlobalStubReference(GVFlags))
20275       return false;
20276
20277     // If BaseGV requires a register for the PIC base, we cannot also have a
20278     // BaseReg specified.
20279     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20280       return false;
20281
20282     // If lower 4G is not available, then we must use rip-relative addressing.
20283     if ((M != CodeModel::Small || R != Reloc::Static) &&
20284         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20285       return false;
20286   }
20287
20288   switch (AM.Scale) {
20289   case 0:
20290   case 1:
20291   case 2:
20292   case 4:
20293   case 8:
20294     // These scales always work.
20295     break;
20296   case 3:
20297   case 5:
20298   case 9:
20299     // These scales are formed with basereg+scalereg.  Only accept if there is
20300     // no basereg yet.
20301     if (AM.HasBaseReg)
20302       return false;
20303     break;
20304   default:  // Other stuff never works.
20305     return false;
20306   }
20307
20308   return true;
20309 }
20310
20311 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20312   unsigned Bits = Ty->getScalarSizeInBits();
20313
20314   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20315   // particularly cheaper than those without.
20316   if (Bits == 8)
20317     return false;
20318
20319   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20320   // variable shifts just as cheap as scalar ones.
20321   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20322     return false;
20323
20324   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20325   // fully general vector.
20326   return true;
20327 }
20328
20329 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20330   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20331     return false;
20332   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20333   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20334   return NumBits1 > NumBits2;
20335 }
20336
20337 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20338   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20339     return false;
20340
20341   if (!isTypeLegal(EVT::getEVT(Ty1)))
20342     return false;
20343
20344   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20345
20346   // Assuming the caller doesn't have a zeroext or signext return parameter,
20347   // truncation all the way down to i1 is valid.
20348   return true;
20349 }
20350
20351 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20352   return isInt<32>(Imm);
20353 }
20354
20355 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20356   // Can also use sub to handle negated immediates.
20357   return isInt<32>(Imm);
20358 }
20359
20360 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20361   if (!VT1.isInteger() || !VT2.isInteger())
20362     return false;
20363   unsigned NumBits1 = VT1.getSizeInBits();
20364   unsigned NumBits2 = VT2.getSizeInBits();
20365   return NumBits1 > NumBits2;
20366 }
20367
20368 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20369   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20370   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20371 }
20372
20373 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20374   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20375   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20376 }
20377
20378 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20379   EVT VT1 = Val.getValueType();
20380   if (isZExtFree(VT1, VT2))
20381     return true;
20382
20383   if (Val.getOpcode() != ISD::LOAD)
20384     return false;
20385
20386   if (!VT1.isSimple() || !VT1.isInteger() ||
20387       !VT2.isSimple() || !VT2.isInteger())
20388     return false;
20389
20390   switch (VT1.getSimpleVT().SimpleTy) {
20391   default: break;
20392   case MVT::i8:
20393   case MVT::i16:
20394   case MVT::i32:
20395     // X86 has 8, 16, and 32-bit zero-extending loads.
20396     return true;
20397   }
20398
20399   return false;
20400 }
20401
20402 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
20403
20404 bool
20405 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20406   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20407     return false;
20408
20409   VT = VT.getScalarType();
20410
20411   if (!VT.isSimple())
20412     return false;
20413
20414   switch (VT.getSimpleVT().SimpleTy) {
20415   case MVT::f32:
20416   case MVT::f64:
20417     return true;
20418   default:
20419     break;
20420   }
20421
20422   return false;
20423 }
20424
20425 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20426   // i16 instructions are longer (0x66 prefix) and potentially slower.
20427   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20428 }
20429
20430 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20431 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20432 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20433 /// are assumed to be legal.
20434 bool
20435 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20436                                       EVT VT) const {
20437   if (!VT.isSimple())
20438     return false;
20439
20440   MVT SVT = VT.getSimpleVT();
20441
20442   // Very little shuffling can be done for 64-bit vectors right now.
20443   if (VT.getSizeInBits() == 64)
20444     return false;
20445
20446   // This is an experimental legality test that is tailored to match the
20447   // legality test of the experimental lowering more closely. They are gated
20448   // separately to ease testing of performance differences.
20449   if (ExperimentalVectorShuffleLegality)
20450     // We only care that the types being shuffled are legal. The lowering can
20451     // handle any possible shuffle mask that results.
20452     return isTypeLegal(SVT);
20453
20454   // If this is a single-input shuffle with no 128 bit lane crossings we can
20455   // lower it into pshufb.
20456   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20457       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20458     bool isLegal = true;
20459     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20460       if (M[I] >= (int)SVT.getVectorNumElements() ||
20461           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20462         isLegal = false;
20463         break;
20464       }
20465     }
20466     if (isLegal)
20467       return true;
20468   }
20469
20470   // FIXME: blends, shifts.
20471   return (SVT.getVectorNumElements() == 2 ||
20472           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20473           isMOVLMask(M, SVT) ||
20474           isCommutedMOVLMask(M, SVT) ||
20475           isMOVHLPSMask(M, SVT) ||
20476           isSHUFPMask(M, SVT) ||
20477           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20478           isPSHUFDMask(M, SVT) ||
20479           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20480           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20481           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20482           isPALIGNRMask(M, SVT, Subtarget) ||
20483           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20484           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20485           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20486           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20487           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20488           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20489 }
20490
20491 bool
20492 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20493                                           EVT VT) const {
20494   if (!VT.isSimple())
20495     return false;
20496
20497   MVT SVT = VT.getSimpleVT();
20498
20499   // This is an experimental legality test that is tailored to match the
20500   // legality test of the experimental lowering more closely. They are gated
20501   // separately to ease testing of performance differences.
20502   if (ExperimentalVectorShuffleLegality)
20503     // The new vector shuffle lowering is very good at managing zero-inputs.
20504     return isShuffleMaskLegal(Mask, VT);
20505
20506   unsigned NumElts = SVT.getVectorNumElements();
20507   // FIXME: This collection of masks seems suspect.
20508   if (NumElts == 2)
20509     return true;
20510   if (NumElts == 4 && SVT.is128BitVector()) {
20511     return (isMOVLMask(Mask, SVT)  ||
20512             isCommutedMOVLMask(Mask, SVT, true) ||
20513             isSHUFPMask(Mask, SVT) ||
20514             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20515             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20516                         Subtarget->hasInt256()));
20517   }
20518   return false;
20519 }
20520
20521 //===----------------------------------------------------------------------===//
20522 //                           X86 Scheduler Hooks
20523 //===----------------------------------------------------------------------===//
20524
20525 /// Utility function to emit xbegin specifying the start of an RTM region.
20526 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20527                                      const TargetInstrInfo *TII) {
20528   DebugLoc DL = MI->getDebugLoc();
20529
20530   const BasicBlock *BB = MBB->getBasicBlock();
20531   MachineFunction::iterator I = MBB;
20532   ++I;
20533
20534   // For the v = xbegin(), we generate
20535   //
20536   // thisMBB:
20537   //  xbegin sinkMBB
20538   //
20539   // mainMBB:
20540   //  eax = -1
20541   //
20542   // sinkMBB:
20543   //  v = eax
20544
20545   MachineBasicBlock *thisMBB = MBB;
20546   MachineFunction *MF = MBB->getParent();
20547   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20548   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20549   MF->insert(I, mainMBB);
20550   MF->insert(I, sinkMBB);
20551
20552   // Transfer the remainder of BB and its successor edges to sinkMBB.
20553   sinkMBB->splice(sinkMBB->begin(), MBB,
20554                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20555   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20556
20557   // thisMBB:
20558   //  xbegin sinkMBB
20559   //  # fallthrough to mainMBB
20560   //  # abortion to sinkMBB
20561   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20562   thisMBB->addSuccessor(mainMBB);
20563   thisMBB->addSuccessor(sinkMBB);
20564
20565   // mainMBB:
20566   //  EAX = -1
20567   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20568   mainMBB->addSuccessor(sinkMBB);
20569
20570   // sinkMBB:
20571   // EAX is live into the sinkMBB
20572   sinkMBB->addLiveIn(X86::EAX);
20573   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20574           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20575     .addReg(X86::EAX);
20576
20577   MI->eraseFromParent();
20578   return sinkMBB;
20579 }
20580
20581 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20582 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20583 // in the .td file.
20584 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20585                                        const TargetInstrInfo *TII) {
20586   unsigned Opc;
20587   switch (MI->getOpcode()) {
20588   default: llvm_unreachable("illegal opcode!");
20589   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20590   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20591   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20592   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20593   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20594   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20595   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20596   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20597   }
20598
20599   DebugLoc dl = MI->getDebugLoc();
20600   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20601
20602   unsigned NumArgs = MI->getNumOperands();
20603   for (unsigned i = 1; i < NumArgs; ++i) {
20604     MachineOperand &Op = MI->getOperand(i);
20605     if (!(Op.isReg() && Op.isImplicit()))
20606       MIB.addOperand(Op);
20607   }
20608   if (MI->hasOneMemOperand())
20609     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20610
20611   BuildMI(*BB, MI, dl,
20612     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20613     .addReg(X86::XMM0);
20614
20615   MI->eraseFromParent();
20616   return BB;
20617 }
20618
20619 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20620 // defs in an instruction pattern
20621 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20622                                        const TargetInstrInfo *TII) {
20623   unsigned Opc;
20624   switch (MI->getOpcode()) {
20625   default: llvm_unreachable("illegal opcode!");
20626   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20627   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20628   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20629   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20630   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20631   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20632   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20633   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20634   }
20635
20636   DebugLoc dl = MI->getDebugLoc();
20637   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20638
20639   unsigned NumArgs = MI->getNumOperands(); // remove the results
20640   for (unsigned i = 1; i < NumArgs; ++i) {
20641     MachineOperand &Op = MI->getOperand(i);
20642     if (!(Op.isReg() && Op.isImplicit()))
20643       MIB.addOperand(Op);
20644   }
20645   if (MI->hasOneMemOperand())
20646     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20647
20648   BuildMI(*BB, MI, dl,
20649     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20650     .addReg(X86::ECX);
20651
20652   MI->eraseFromParent();
20653   return BB;
20654 }
20655
20656 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20657                                       const X86Subtarget *Subtarget) {
20658   DebugLoc dl = MI->getDebugLoc();
20659   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20660   // Address into RAX/EAX, other two args into ECX, EDX.
20661   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20662   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20663   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20664   for (int i = 0; i < X86::AddrNumOperands; ++i)
20665     MIB.addOperand(MI->getOperand(i));
20666
20667   unsigned ValOps = X86::AddrNumOperands;
20668   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20669     .addReg(MI->getOperand(ValOps).getReg());
20670   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20671     .addReg(MI->getOperand(ValOps+1).getReg());
20672
20673   // The instruction doesn't actually take any operands though.
20674   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20675
20676   MI->eraseFromParent(); // The pseudo is gone now.
20677   return BB;
20678 }
20679
20680 MachineBasicBlock *
20681 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
20682                                                  MachineBasicBlock *MBB) const {
20683   // Emit va_arg instruction on X86-64.
20684
20685   // Operands to this pseudo-instruction:
20686   // 0  ) Output        : destination address (reg)
20687   // 1-5) Input         : va_list address (addr, i64mem)
20688   // 6  ) ArgSize       : Size (in bytes) of vararg type
20689   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20690   // 8  ) Align         : Alignment of type
20691   // 9  ) EFLAGS (implicit-def)
20692
20693   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20694   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20695
20696   unsigned DestReg = MI->getOperand(0).getReg();
20697   MachineOperand &Base = MI->getOperand(1);
20698   MachineOperand &Scale = MI->getOperand(2);
20699   MachineOperand &Index = MI->getOperand(3);
20700   MachineOperand &Disp = MI->getOperand(4);
20701   MachineOperand &Segment = MI->getOperand(5);
20702   unsigned ArgSize = MI->getOperand(6).getImm();
20703   unsigned ArgMode = MI->getOperand(7).getImm();
20704   unsigned Align = MI->getOperand(8).getImm();
20705
20706   // Memory Reference
20707   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20708   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20709   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20710
20711   // Machine Information
20712   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20713   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20714   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20715   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20716   DebugLoc DL = MI->getDebugLoc();
20717
20718   // struct va_list {
20719   //   i32   gp_offset
20720   //   i32   fp_offset
20721   //   i64   overflow_area (address)
20722   //   i64   reg_save_area (address)
20723   // }
20724   // sizeof(va_list) = 24
20725   // alignment(va_list) = 8
20726
20727   unsigned TotalNumIntRegs = 6;
20728   unsigned TotalNumXMMRegs = 8;
20729   bool UseGPOffset = (ArgMode == 1);
20730   bool UseFPOffset = (ArgMode == 2);
20731   unsigned MaxOffset = TotalNumIntRegs * 8 +
20732                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20733
20734   /* Align ArgSize to a multiple of 8 */
20735   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20736   bool NeedsAlign = (Align > 8);
20737
20738   MachineBasicBlock *thisMBB = MBB;
20739   MachineBasicBlock *overflowMBB;
20740   MachineBasicBlock *offsetMBB;
20741   MachineBasicBlock *endMBB;
20742
20743   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20744   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20745   unsigned OffsetReg = 0;
20746
20747   if (!UseGPOffset && !UseFPOffset) {
20748     // If we only pull from the overflow region, we don't create a branch.
20749     // We don't need to alter control flow.
20750     OffsetDestReg = 0; // unused
20751     OverflowDestReg = DestReg;
20752
20753     offsetMBB = nullptr;
20754     overflowMBB = thisMBB;
20755     endMBB = thisMBB;
20756   } else {
20757     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20758     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20759     // If not, pull from overflow_area. (branch to overflowMBB)
20760     //
20761     //       thisMBB
20762     //         |     .
20763     //         |        .
20764     //     offsetMBB   overflowMBB
20765     //         |        .
20766     //         |     .
20767     //        endMBB
20768
20769     // Registers for the PHI in endMBB
20770     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20771     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20772
20773     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20774     MachineFunction *MF = MBB->getParent();
20775     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20776     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20777     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20778
20779     MachineFunction::iterator MBBIter = MBB;
20780     ++MBBIter;
20781
20782     // Insert the new basic blocks
20783     MF->insert(MBBIter, offsetMBB);
20784     MF->insert(MBBIter, overflowMBB);
20785     MF->insert(MBBIter, endMBB);
20786
20787     // Transfer the remainder of MBB and its successor edges to endMBB.
20788     endMBB->splice(endMBB->begin(), thisMBB,
20789                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20790     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20791
20792     // Make offsetMBB and overflowMBB successors of thisMBB
20793     thisMBB->addSuccessor(offsetMBB);
20794     thisMBB->addSuccessor(overflowMBB);
20795
20796     // endMBB is a successor of both offsetMBB and overflowMBB
20797     offsetMBB->addSuccessor(endMBB);
20798     overflowMBB->addSuccessor(endMBB);
20799
20800     // Load the offset value into a register
20801     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20802     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20803       .addOperand(Base)
20804       .addOperand(Scale)
20805       .addOperand(Index)
20806       .addDisp(Disp, UseFPOffset ? 4 : 0)
20807       .addOperand(Segment)
20808       .setMemRefs(MMOBegin, MMOEnd);
20809
20810     // Check if there is enough room left to pull this argument.
20811     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20812       .addReg(OffsetReg)
20813       .addImm(MaxOffset + 8 - ArgSizeA8);
20814
20815     // Branch to "overflowMBB" if offset >= max
20816     // Fall through to "offsetMBB" otherwise
20817     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20818       .addMBB(overflowMBB);
20819   }
20820
20821   // In offsetMBB, emit code to use the reg_save_area.
20822   if (offsetMBB) {
20823     assert(OffsetReg != 0);
20824
20825     // Read the reg_save_area address.
20826     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20827     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20828       .addOperand(Base)
20829       .addOperand(Scale)
20830       .addOperand(Index)
20831       .addDisp(Disp, 16)
20832       .addOperand(Segment)
20833       .setMemRefs(MMOBegin, MMOEnd);
20834
20835     // Zero-extend the offset
20836     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20837       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20838         .addImm(0)
20839         .addReg(OffsetReg)
20840         .addImm(X86::sub_32bit);
20841
20842     // Add the offset to the reg_save_area to get the final address.
20843     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20844       .addReg(OffsetReg64)
20845       .addReg(RegSaveReg);
20846
20847     // Compute the offset for the next argument
20848     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20849     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20850       .addReg(OffsetReg)
20851       .addImm(UseFPOffset ? 16 : 8);
20852
20853     // Store it back into the va_list.
20854     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20855       .addOperand(Base)
20856       .addOperand(Scale)
20857       .addOperand(Index)
20858       .addDisp(Disp, UseFPOffset ? 4 : 0)
20859       .addOperand(Segment)
20860       .addReg(NextOffsetReg)
20861       .setMemRefs(MMOBegin, MMOEnd);
20862
20863     // Jump to endMBB
20864     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20865       .addMBB(endMBB);
20866   }
20867
20868   //
20869   // Emit code to use overflow area
20870   //
20871
20872   // Load the overflow_area address into a register.
20873   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20874   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20875     .addOperand(Base)
20876     .addOperand(Scale)
20877     .addOperand(Index)
20878     .addDisp(Disp, 8)
20879     .addOperand(Segment)
20880     .setMemRefs(MMOBegin, MMOEnd);
20881
20882   // If we need to align it, do so. Otherwise, just copy the address
20883   // to OverflowDestReg.
20884   if (NeedsAlign) {
20885     // Align the overflow address
20886     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20887     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20888
20889     // aligned_addr = (addr + (align-1)) & ~(align-1)
20890     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20891       .addReg(OverflowAddrReg)
20892       .addImm(Align-1);
20893
20894     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20895       .addReg(TmpReg)
20896       .addImm(~(uint64_t)(Align-1));
20897   } else {
20898     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20899       .addReg(OverflowAddrReg);
20900   }
20901
20902   // Compute the next overflow address after this argument.
20903   // (the overflow address should be kept 8-byte aligned)
20904   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20905   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20906     .addReg(OverflowDestReg)
20907     .addImm(ArgSizeA8);
20908
20909   // Store the new overflow address.
20910   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20911     .addOperand(Base)
20912     .addOperand(Scale)
20913     .addOperand(Index)
20914     .addDisp(Disp, 8)
20915     .addOperand(Segment)
20916     .addReg(NextAddrReg)
20917     .setMemRefs(MMOBegin, MMOEnd);
20918
20919   // If we branched, emit the PHI to the front of endMBB.
20920   if (offsetMBB) {
20921     BuildMI(*endMBB, endMBB->begin(), DL,
20922             TII->get(X86::PHI), DestReg)
20923       .addReg(OffsetDestReg).addMBB(offsetMBB)
20924       .addReg(OverflowDestReg).addMBB(overflowMBB);
20925   }
20926
20927   // Erase the pseudo instruction
20928   MI->eraseFromParent();
20929
20930   return endMBB;
20931 }
20932
20933 MachineBasicBlock *
20934 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20935                                                  MachineInstr *MI,
20936                                                  MachineBasicBlock *MBB) const {
20937   // Emit code to save XMM registers to the stack. The ABI says that the
20938   // number of registers to save is given in %al, so it's theoretically
20939   // possible to do an indirect jump trick to avoid saving all of them,
20940   // however this code takes a simpler approach and just executes all
20941   // of the stores if %al is non-zero. It's less code, and it's probably
20942   // easier on the hardware branch predictor, and stores aren't all that
20943   // expensive anyway.
20944
20945   // Create the new basic blocks. One block contains all the XMM stores,
20946   // and one block is the final destination regardless of whether any
20947   // stores were performed.
20948   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20949   MachineFunction *F = MBB->getParent();
20950   MachineFunction::iterator MBBIter = MBB;
20951   ++MBBIter;
20952   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20953   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20954   F->insert(MBBIter, XMMSaveMBB);
20955   F->insert(MBBIter, EndMBB);
20956
20957   // Transfer the remainder of MBB and its successor edges to EndMBB.
20958   EndMBB->splice(EndMBB->begin(), MBB,
20959                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20960   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20961
20962   // The original block will now fall through to the XMM save block.
20963   MBB->addSuccessor(XMMSaveMBB);
20964   // The XMMSaveMBB will fall through to the end block.
20965   XMMSaveMBB->addSuccessor(EndMBB);
20966
20967   // Now add the instructions.
20968   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20969   DebugLoc DL = MI->getDebugLoc();
20970
20971   unsigned CountReg = MI->getOperand(0).getReg();
20972   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20973   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20974
20975   if (!Subtarget->isTargetWin64()) {
20976     // If %al is 0, branch around the XMM save block.
20977     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20978     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20979     MBB->addSuccessor(EndMBB);
20980   }
20981
20982   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20983   // that was just emitted, but clearly shouldn't be "saved".
20984   assert((MI->getNumOperands() <= 3 ||
20985           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20986           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20987          && "Expected last argument to be EFLAGS");
20988   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20989   // In the XMM save block, save all the XMM argument registers.
20990   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20991     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20992     MachineMemOperand *MMO =
20993       F->getMachineMemOperand(
20994           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20995         MachineMemOperand::MOStore,
20996         /*Size=*/16, /*Align=*/16);
20997     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20998       .addFrameIndex(RegSaveFrameIndex)
20999       .addImm(/*Scale=*/1)
21000       .addReg(/*IndexReg=*/0)
21001       .addImm(/*Disp=*/Offset)
21002       .addReg(/*Segment=*/0)
21003       .addReg(MI->getOperand(i).getReg())
21004       .addMemOperand(MMO);
21005   }
21006
21007   MI->eraseFromParent();   // The pseudo instruction is gone now.
21008
21009   return EndMBB;
21010 }
21011
21012 // The EFLAGS operand of SelectItr might be missing a kill marker
21013 // because there were multiple uses of EFLAGS, and ISel didn't know
21014 // which to mark. Figure out whether SelectItr should have had a
21015 // kill marker, and set it if it should. Returns the correct kill
21016 // marker value.
21017 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
21018                                      MachineBasicBlock* BB,
21019                                      const TargetRegisterInfo* TRI) {
21020   // Scan forward through BB for a use/def of EFLAGS.
21021   MachineBasicBlock::iterator miI(std::next(SelectItr));
21022   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
21023     const MachineInstr& mi = *miI;
21024     if (mi.readsRegister(X86::EFLAGS))
21025       return false;
21026     if (mi.definesRegister(X86::EFLAGS))
21027       break; // Should have kill-flag - update below.
21028   }
21029
21030   // If we hit the end of the block, check whether EFLAGS is live into a
21031   // successor.
21032   if (miI == BB->end()) {
21033     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
21034                                           sEnd = BB->succ_end();
21035          sItr != sEnd; ++sItr) {
21036       MachineBasicBlock* succ = *sItr;
21037       if (succ->isLiveIn(X86::EFLAGS))
21038         return false;
21039     }
21040   }
21041
21042   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
21043   // out. SelectMI should have a kill flag on EFLAGS.
21044   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
21045   return true;
21046 }
21047
21048 MachineBasicBlock *
21049 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
21050                                      MachineBasicBlock *BB) const {
21051   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21052   DebugLoc DL = MI->getDebugLoc();
21053
21054   // To "insert" a SELECT_CC instruction, we actually have to insert the
21055   // diamond control-flow pattern.  The incoming instruction knows the
21056   // destination vreg to set, the condition code register to branch on, the
21057   // true/false values to select between, and a branch opcode to use.
21058   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21059   MachineFunction::iterator It = BB;
21060   ++It;
21061
21062   //  thisMBB:
21063   //  ...
21064   //   TrueVal = ...
21065   //   cmpTY ccX, r1, r2
21066   //   bCC copy1MBB
21067   //   fallthrough --> copy0MBB
21068   MachineBasicBlock *thisMBB = BB;
21069   MachineFunction *F = BB->getParent();
21070   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
21071   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
21072   F->insert(It, copy0MBB);
21073   F->insert(It, sinkMBB);
21074
21075   // If the EFLAGS register isn't dead in the terminator, then claim that it's
21076   // live into the sink and copy blocks.
21077   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
21078   if (!MI->killsRegister(X86::EFLAGS) &&
21079       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
21080     copy0MBB->addLiveIn(X86::EFLAGS);
21081     sinkMBB->addLiveIn(X86::EFLAGS);
21082   }
21083
21084   // Transfer the remainder of BB and its successor edges to sinkMBB.
21085   sinkMBB->splice(sinkMBB->begin(), BB,
21086                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
21087   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
21088
21089   // Add the true and fallthrough blocks as its successors.
21090   BB->addSuccessor(copy0MBB);
21091   BB->addSuccessor(sinkMBB);
21092
21093   // Create the conditional branch instruction.
21094   unsigned Opc =
21095     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
21096   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
21097
21098   //  copy0MBB:
21099   //   %FalseValue = ...
21100   //   # fallthrough to sinkMBB
21101   copy0MBB->addSuccessor(sinkMBB);
21102
21103   //  sinkMBB:
21104   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
21105   //  ...
21106   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21107           TII->get(X86::PHI), MI->getOperand(0).getReg())
21108     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
21109     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
21110
21111   MI->eraseFromParent();   // The pseudo instruction is gone now.
21112   return sinkMBB;
21113 }
21114
// Expands the alloca pseudo instruction MI for functions that use split
// (segmented) stacks; the function asserts MF->shouldSplitStack().
// MI operand 0 is the register that receives the allocated address and
// operand 1 is the register holding the requested size.
//
// The new stack pointer candidate (SP - size) is compared against the value
// stored at TlsReg:TlsOffset. Depending on the outcome the allocation is done
// either by lowering the stack pointer (bumpMBB) or by calling
// __morestack_allocate_stack_space (mallocMBB); both paths meet in
// continueMBB, where a PHI produces the result. Returns continueMBB, which
// also receives every instruction that followed MI in BB.
21115 MachineBasicBlock *
21116 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
21117                                         MachineBasicBlock *BB) const {
21118   MachineFunction *MF = BB->getParent();
21119   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21120   DebugLoc DL = MI->getDebugLoc();
21121   const BasicBlock *LLVM_BB = BB->getBasicBlock();
21122
21123   assert(MF->shouldSplitStack());
21124
21125   const bool Is64Bit = Subtarget->is64Bit();
21126   const bool IsLP64 = Subtarget->isTarget64BitLP64();
21127
        // Segment register and displacement of the memory word the new stack
        // pointer is compared against: FS on 64-bit targets, GS otherwise;
        // offset 0x70 for LP64, 0x40 for other 64-bit targets, 0x30 for 32-bit.
21128   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
21129   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
21130
21131   // BB:
21132   //  ... [Till the alloca]
21133   // If stacklet is not large enough, jump to mallocMBB
21134   //
21135   // bumpMBB:
21136   //  Allocate by subtracting from RSP
21137   //  Jump to continueMBB
21138   //
21139   // mallocMBB:
21140   //  Allocate by call to runtime
21141   //
21142   // continueMBB:
21143   //  ...
21144   //  [rest of original BB]
21145   //
21146
21147   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21148   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21149   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
21150
21151   MachineRegisterInfo &MRI = MF->getRegInfo();
21152   const TargetRegisterClass *AddrRegClass =
21153     getRegClassFor(getPointerTy());
21154
        // Four fresh pointer-class virtual registers, the size register taken
        // from MI, and the physical stack pointer (RSP for LP64 and NaCl64,
        // ESP otherwise).
21155   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21156     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
21157     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
21158     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
21159     sizeVReg = MI->getOperand(1).getReg(),
21160     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
21161
21162   MachineFunction::iterator MBBIter = BB;
21163   ++MBBIter;
21164
        // Layout after BB: bumpMBB, mallocMBB, continueMBB.
21165   MF->insert(MBBIter, bumpMBB);
21166   MF->insert(MBBIter, mallocMBB);
21167   MF->insert(MBBIter, continueMBB);
21168
        // Move everything after MI, and BB's successor edges, to continueMBB.
21169   continueMBB->splice(continueMBB->begin(), BB,
21170                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21171   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21172
21173   // Add code to the main basic block to check if the stack limit has been hit,
21174   // and if so, jump to mallocMBB otherwise to bumpMBB.
21175   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21176   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21177     .addReg(tmpSPVReg).addReg(sizeVReg);
        // Compare the memory word at TlsReg:TlsOffset with SPLimitVReg
        // (base = none, scale = 1, index = none, disp = TlsOffset).
21178   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21179     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21180     .addReg(SPLimitVReg);
21181   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21182
21183   // bumpMBB simply decreases the stack pointer, since we know the current
21184   // stacklet has enough space.
21185   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21186     .addReg(SPLimitVReg);
21187   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21188     .addReg(SPLimitVReg);
21189   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21190
21191   // Calls into a routine in libgcc to allocate more space from the heap.
21192   const uint32_t *RegMask =
21193       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21194   if (IsLP64) {
          // LP64: size in RDI, result defined in RAX.
21195     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21196       .addReg(sizeVReg);
21197     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21198       .addExternalSymbol("__morestack_allocate_stack_space")
21199       .addRegMask(RegMask)
21200       .addReg(X86::RDI, RegState::Implicit)
21201       .addReg(X86::RAX, RegState::ImplicitDefine);
21202   } else if (Is64Bit) {
          // 64-bit but not LP64: size in EDI, result defined in EAX.
21203     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21204       .addReg(sizeVReg);
21205     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21206       .addExternalSymbol("__morestack_allocate_stack_space")
21207       .addRegMask(RegMask)
21208       .addReg(X86::EDI, RegState::Implicit)
21209       .addReg(X86::EAX, RegState::ImplicitDefine);
21210   } else {
          // 32-bit: the size is pushed on the stack. The stack pointer is
          // first lowered by 12 so that, together with the 4-byte push, 16
          // bytes are used; they are released after the call below.
21211     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21212       .addImm(12);
21213     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21214     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21215       .addExternalSymbol("__morestack_allocate_stack_space")
21216       .addRegMask(RegMask)
21217       .addReg(X86::EAX, RegState::ImplicitDefine);
21218   }
21219
        // Undo the 12 + 4 bytes of stack adjustment made for the 32-bit call.
21220   if (!Is64Bit)
21221     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21222       .addImm(16);
21223
        // Capture the call's return register as the malloc-path result.
21224   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21225     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21226   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21227
21228   // Set up the CFG correctly.
21229   BB->addSuccessor(bumpMBB);
21230   BB->addSuccessor(mallocMBB);
21231   mallocMBB->addSuccessor(continueMBB);
21232   bumpMBB->addSuccessor(continueMBB);
21233
21234   // Take care of the PHI nodes.
21235   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21236           MI->getOperand(0).getReg())
21237     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21238     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21239
21240   // Delete the original pseudo instruction.
21241   MI->eraseFromParent();
21242
21243   // And we're done.
21244   return continueMBB;
21245 }
21246
21247 MachineBasicBlock *
21248 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21249                                         MachineBasicBlock *BB) const {
21250   DebugLoc DL = MI->getDebugLoc();
21251
21252   assert(!Subtarget->isTargetMachO());
21253
21254   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21255
21256   MI->eraseFromParent();   // The pseudo instruction is gone now.
21257   return BB;
21258 }
21259
21260 MachineBasicBlock *
21261 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21262                                       MachineBasicBlock *BB) const {
21263   // This is pretty easy.  We're taking the value that we received from
21264   // our load from the relocation, sticking it in either RDI (x86-64)
21265   // or EAX and doing an indirect call.  The return value will then
21266   // be in the normal return register.
21267   MachineFunction *F = BB->getParent();
21268   const X86InstrInfo *TII = Subtarget->getInstrInfo();
21269   DebugLoc DL = MI->getDebugLoc();
21270
21271   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21272   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21273
21274   // Get a register mask for the lowered call.
21275   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21276   // proper register mask.
21277   const uint32_t *RegMask =
21278       Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
21279   if (Subtarget->is64Bit()) {
21280     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21281                                       TII->get(X86::MOV64rm), X86::RDI)
21282     .addReg(X86::RIP)
21283     .addImm(0).addReg(0)
21284     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21285                       MI->getOperand(3).getTargetFlags())
21286     .addReg(0);
21287     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21288     addDirectMem(MIB, X86::RDI);
21289     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21290   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21291     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21292                                       TII->get(X86::MOV32rm), X86::EAX)
21293     .addReg(0)
21294     .addImm(0).addReg(0)
21295     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21296                       MI->getOperand(3).getTargetFlags())
21297     .addReg(0);
21298     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21299     addDirectMem(MIB, X86::EAX);
21300     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21301   } else {
21302     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21303                                       TII->get(X86::MOV32rm), X86::EAX)
21304     .addReg(TII->getGlobalBaseReg(F))
21305     .addImm(0).addReg(0)
21306     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21307                       MI->getOperand(3).getTargetFlags())
21308     .addReg(0);
21309     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21310     addDirectMem(MIB, X86::EAX);
21311     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21312   }
21313
21314   MI->eraseFromParent(); // The pseudo instruction is gone now.
21315   return BB;
21316 }
21317
// Expands the SjLj setjmp pseudo instruction MI ("v = setjmp(buf)") in MBB.
// MI operand 0 is the result register (its register class must hold i32);
// the X86::AddrNumOperands operands that follow it address the buffer.
//
// The address of restoreMBB is stored into the buffer at an offset of one
// pointer store size, an EH_SjLj_Setup instruction referencing restoreMBB is
// emitted, and the result is a PHI of 0 (defined in mainMBB) and 1 (defined
// in restoreMBB). Returns sinkMBB, which also receives every instruction
// that followed MI in MBB.
21318 MachineBasicBlock *
21319 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21320                                     MachineBasicBlock *MBB) const {
21321   DebugLoc DL = MI->getDebugLoc();
21322   MachineFunction *MF = MBB->getParent();
21323   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21324   MachineRegisterInfo &MRI = MF->getRegInfo();
21325
21326   const BasicBlock *BB = MBB->getBasicBlock();
21327   MachineFunction::iterator I = MBB;
21328   ++I;
21329
21330   // Memory Reference
21331   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21332   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21333
21334   unsigned DstReg;
21335   unsigned MemOpndSlot = 0;
21336
21337   unsigned CurOp = 0;
21338
        // Operand 0 is the destination. Two more registers of the same class
        // carry the value produced on each of the two paths into the PHI.
21339   DstReg = MI->getOperand(CurOp++).getReg();
21340   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21341   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21342   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21343   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21344
        // The buffer address operands start right after the destination.
21345   MemOpndSlot = CurOp;
21346
21347   MVT PVT = getPointerTy();
21348   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21349          "Invalid Pointer Size!");
21350
21351   // For v = setjmp(buf), we generate
21352   //
21353   // thisMBB:
21354   //  buf[LabelOffset] = restoreMBB
21355   //  SjLjSetup restoreMBB
21356   //
21357   // mainMBB:
21358   //  v_main = 0
21359   //
21360   // sinkMBB:
21361   //  v = phi(main, restore)
21362   //
21363   // restoreMBB:
21364   //  if base pointer being used, load it from frame
21365   //  v_restore = 1
21366
21367   MachineBasicBlock *thisMBB = MBB;
21368   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21369   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21370   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
        // mainMBB and sinkMBB go right after MBB; restoreMBB is appended at
        // the end of the function.
21371   MF->insert(I, mainMBB);
21372   MF->insert(I, sinkMBB);
21373   MF->push_back(restoreMBB);
21374
21375   MachineInstrBuilder MIB;
21376
21377   // Transfer the remainder of BB and its successor edges to sinkMBB.
21378   sinkMBB->splice(sinkMBB->begin(), MBB,
21379                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21380   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21381
21382   // thisMBB:
21383   unsigned PtrStoreOpc = 0;
21384   unsigned LabelReg = 0;
21385   const int64_t LabelOffset = 1 * PVT.getStoreSize();
        // The label is stored as an immediate only with the small code model
        // and a static or dynamic-no-PIC relocation model; otherwise its
        // address is first materialized in a register with LEA.
21386   Reloc::Model RM = MF->getTarget().getRelocationModel();
21387   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21388                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21389
21390   // Prepare IP either in reg or imm.
21391   if (!UseImmLabel) {
21392     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21393     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21394     LabelReg = MRI.createVirtualRegister(PtrRC);
21395     if (Subtarget->is64Bit()) {
            // RIP-relative address of restoreMBB.
21396       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21397               .addReg(X86::RIP)
21398               .addImm(0)
21399               .addReg(0)
21400               .addMBB(restoreMBB)
21401               .addReg(0);
21402     } else {
            // Address of restoreMBB relative to the global base register.
21403       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21404       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21405               .addReg(XII->getGlobalBaseReg(MF))
21406               .addImm(0)
21407               .addReg(0)
21408               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21409               .addReg(0);
21410     }
21411   } else
21412     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21413   // Store IP
21414   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
        // Reuse MI's buffer address operands, adding LabelOffset to the
        // displacement only.
21415   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21416     if (i == X86::AddrDisp)
21417       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21418     else
21419       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21420   }
21421   if (!UseImmLabel)
21422     MIB.addReg(LabelReg);
21423   else
21424     MIB.addMBB(restoreMBB);
21425   MIB.setMemRefs(MMOBegin, MMOEnd);
21426   // Setup
21427   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21428           .addMBB(restoreMBB);
21429
        // The setup instruction carries the "no registers preserved" mask.
21430   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21431   MIB.addRegMask(RegInfo->getNoPreservedMask());
21432   thisMBB->addSuccessor(mainMBB);
21433   thisMBB->addSuccessor(restoreMBB);
21434
21435   // mainMBB:
21436   //  EAX = 0
21437   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21438   mainMBB->addSuccessor(sinkMBB);
21439
21440   // sinkMBB:
21441   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21442           TII->get(X86::PHI), DstReg)
21443     .addReg(mainDstReg).addMBB(mainMBB)
21444     .addReg(restoreDstReg).addMBB(restoreMBB);
21445
21446   // restoreMBB:
        // If the function has a base pointer, flag that in the function info
        // and reload the base register from the frame at the offset the
        // function info reports.
21447   if (RegInfo->hasBasePointer(*MF)) {
21448     const bool Uses64BitFramePtr =
21449         Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
21450     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21451     X86FI->setRestoreBasePointer(MF);
21452     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21453     unsigned BasePtr = RegInfo->getBaseRegister();
21454     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21455     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21456                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21457       .setMIFlag(MachineInstr::FrameSetup);
21458   }
        // The restore path yields 1 and jumps to sinkMBB explicitly.
21459   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21460   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21461   restoreMBB->addSuccessor(sinkMBB);
21462
21463   MI->eraseFromParent();
21464   return sinkMBB;
21465 }
21466
21467 MachineBasicBlock *
21468 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21469                                      MachineBasicBlock *MBB) const {
21470   DebugLoc DL = MI->getDebugLoc();
21471   MachineFunction *MF = MBB->getParent();
21472   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21473   MachineRegisterInfo &MRI = MF->getRegInfo();
21474
21475   // Memory Reference
21476   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21477   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21478
21479   MVT PVT = getPointerTy();
21480   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21481          "Invalid Pointer Size!");
21482
21483   const TargetRegisterClass *RC =
21484     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21485   unsigned Tmp = MRI.createVirtualRegister(RC);
21486   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21487   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
21488   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21489   unsigned SP = RegInfo->getStackRegister();
21490
21491   MachineInstrBuilder MIB;
21492
21493   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21494   const int64_t SPOffset = 2 * PVT.getStoreSize();
21495
21496   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21497   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21498
21499   // Reload FP
21500   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21501   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21502     MIB.addOperand(MI->getOperand(i));
21503   MIB.setMemRefs(MMOBegin, MMOEnd);
21504   // Reload IP
21505   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21506   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21507     if (i == X86::AddrDisp)
21508       MIB.addDisp(MI->getOperand(i), LabelOffset);
21509     else
21510       MIB.addOperand(MI->getOperand(i));
21511   }
21512   MIB.setMemRefs(MMOBegin, MMOEnd);
21513   // Reload SP
21514   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21515   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21516     if (i == X86::AddrDisp)
21517       MIB.addDisp(MI->getOperand(i), SPOffset);
21518     else
21519       MIB.addOperand(MI->getOperand(i));
21520   }
21521   MIB.setMemRefs(MMOBegin, MMOEnd);
21522   // Jump
21523   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21524
21525   MI->eraseFromParent();
21526   return MBB;
21527 }
21528
21529 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21530 // accumulator loops. Writing back to the accumulator allows the coalescer
21531 // to remove extra copies in the loop.
21532 MachineBasicBlock *
21533 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21534                                  MachineBasicBlock *MBB) const {
21535   MachineOperand &AddendOp = MI->getOperand(3);
21536
21537   // Bail out early if the addend isn't a register - we can't switch these.
21538   if (!AddendOp.isReg())
21539     return MBB;
21540
21541   MachineFunction &MF = *MBB->getParent();
21542   MachineRegisterInfo &MRI = MF.getRegInfo();
21543
21544   // Check whether the addend is defined by a PHI:
21545   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21546   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21547   if (!AddendDef.isPHI())
21548     return MBB;
21549
21550   // Look for the following pattern:
21551   // loop:
21552   //   %addend = phi [%entry, 0], [%loop, %result]
21553   //   ...
21554   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21555
21556   // Replace with:
21557   //   loop:
21558   //   %addend = phi [%entry, 0], [%loop, %result]
21559   //   ...
21560   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21561
21562   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21563     assert(AddendDef.getOperand(i).isReg());
21564     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21565     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21566     if (&PHISrcInst == MI) {
21567       // Found a matching instruction.
21568       unsigned NewFMAOpc = 0;
21569       switch (MI->getOpcode()) {
21570         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21571         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21572         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21573         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21574         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21575         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21576         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21577         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21578         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21579         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21580         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21581         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21582         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21583         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21584         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21585         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21586         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21587         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21588         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21589         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21590
21591         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21592         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21593         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21594         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21595         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21596         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21597         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21598         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21599         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21600         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21601         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21602         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21603         default: llvm_unreachable("Unrecognized FMA variant.");
21604       }
21605
21606       const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
21607       MachineInstrBuilder MIB =
21608         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21609         .addOperand(MI->getOperand(0))
21610         .addOperand(MI->getOperand(3))
21611         .addOperand(MI->getOperand(2))
21612         .addOperand(MI->getOperand(1));
21613       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21614       MI->eraseFromParent();
21615     }
21616   }
21617
21618   return MBB;
21619 }
21620
21621 MachineBasicBlock *
21622 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21623                                                MachineBasicBlock *BB) const {
21624   switch (MI->getOpcode()) {
21625   default: llvm_unreachable("Unexpected instr type to insert");
21626   case X86::TAILJMPd64:
21627   case X86::TAILJMPr64:
21628   case X86::TAILJMPm64:
21629   case X86::TAILJMPd64_REX:
21630   case X86::TAILJMPr64_REX:
21631   case X86::TAILJMPm64_REX:
21632     llvm_unreachable("TAILJMP64 would not be touched here.");
21633   case X86::TCRETURNdi64:
21634   case X86::TCRETURNri64:
21635   case X86::TCRETURNmi64:
21636     return BB;
21637   case X86::WIN_ALLOCA:
21638     return EmitLoweredWinAlloca(MI, BB);
21639   case X86::SEG_ALLOCA_32:
21640   case X86::SEG_ALLOCA_64:
21641     return EmitLoweredSegAlloca(MI, BB);
21642   case X86::TLSCall_32:
21643   case X86::TLSCall_64:
21644     return EmitLoweredTLSCall(MI, BB);
21645   case X86::CMOV_GR8:
21646   case X86::CMOV_FR32:
21647   case X86::CMOV_FR64:
21648   case X86::CMOV_V4F32:
21649   case X86::CMOV_V2F64:
21650   case X86::CMOV_V2I64:
21651   case X86::CMOV_V8F32:
21652   case X86::CMOV_V4F64:
21653   case X86::CMOV_V4I64:
21654   case X86::CMOV_V16F32:
21655   case X86::CMOV_V8F64:
21656   case X86::CMOV_V8I64:
21657   case X86::CMOV_GR16:
21658   case X86::CMOV_GR32:
21659   case X86::CMOV_RFP32:
21660   case X86::CMOV_RFP64:
21661   case X86::CMOV_RFP80:
21662     return EmitLoweredSelect(MI, BB);
21663
21664   case X86::FP32_TO_INT16_IN_MEM:
21665   case X86::FP32_TO_INT32_IN_MEM:
21666   case X86::FP32_TO_INT64_IN_MEM:
21667   case X86::FP64_TO_INT16_IN_MEM:
21668   case X86::FP64_TO_INT32_IN_MEM:
21669   case X86::FP64_TO_INT64_IN_MEM:
21670   case X86::FP80_TO_INT16_IN_MEM:
21671   case X86::FP80_TO_INT32_IN_MEM:
21672   case X86::FP80_TO_INT64_IN_MEM: {
21673     MachineFunction *F = BB->getParent();
21674     const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21675     DebugLoc DL = MI->getDebugLoc();
21676
21677     // Change the floating point control register to use "round towards zero"
21678     // mode when truncating to an integer value.
21679     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21680     addFrameReference(BuildMI(*BB, MI, DL,
21681                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21682
21683     // Load the old value of the high byte of the control word...
21684     unsigned OldCW =
21685       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21686     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21687                       CWFrameIdx);
21688
21689     // Set the high part to be round to zero...
21690     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21691       .addImm(0xC7F);
21692
21693     // Reload the modified control word now...
21694     addFrameReference(BuildMI(*BB, MI, DL,
21695                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21696
21697     // Restore the memory image of control word to original value
21698     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21699       .addReg(OldCW);
21700
21701     // Get the X86 opcode to use.
21702     unsigned Opc;
21703     switch (MI->getOpcode()) {
21704     default: llvm_unreachable("illegal opcode!");
21705     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21706     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21707     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21708     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21709     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21710     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21711     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21712     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21713     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21714     }
21715
21716     X86AddressMode AM;
21717     MachineOperand &Op = MI->getOperand(0);
21718     if (Op.isReg()) {
21719       AM.BaseType = X86AddressMode::RegBase;
21720       AM.Base.Reg = Op.getReg();
21721     } else {
21722       AM.BaseType = X86AddressMode::FrameIndexBase;
21723       AM.Base.FrameIndex = Op.getIndex();
21724     }
21725     Op = MI->getOperand(1);
21726     if (Op.isImm())
21727       AM.Scale = Op.getImm();
21728     Op = MI->getOperand(2);
21729     if (Op.isImm())
21730       AM.IndexReg = Op.getImm();
21731     Op = MI->getOperand(3);
21732     if (Op.isGlobal()) {
21733       AM.GV = Op.getGlobal();
21734     } else {
21735       AM.Disp = Op.getImm();
21736     }
21737     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21738                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21739
21740     // Reload the original control word now.
21741     addFrameReference(BuildMI(*BB, MI, DL,
21742                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21743
21744     MI->eraseFromParent();   // The pseudo instruction is gone now.
21745     return BB;
21746   }
21747     // String/text processing lowering.
21748   case X86::PCMPISTRM128REG:
21749   case X86::VPCMPISTRM128REG:
21750   case X86::PCMPISTRM128MEM:
21751   case X86::VPCMPISTRM128MEM:
21752   case X86::PCMPESTRM128REG:
21753   case X86::VPCMPESTRM128REG:
21754   case X86::PCMPESTRM128MEM:
21755   case X86::VPCMPESTRM128MEM:
21756     assert(Subtarget->hasSSE42() &&
21757            "Target must have SSE4.2 or AVX features enabled");
21758     return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
21759
21760   // String/text processing lowering.
21761   case X86::PCMPISTRIREG:
21762   case X86::VPCMPISTRIREG:
21763   case X86::PCMPISTRIMEM:
21764   case X86::VPCMPISTRIMEM:
21765   case X86::PCMPESTRIREG:
21766   case X86::VPCMPESTRIREG:
21767   case X86::PCMPESTRIMEM:
21768   case X86::VPCMPESTRIMEM:
21769     assert(Subtarget->hasSSE42() &&
21770            "Target must have SSE4.2 or AVX features enabled");
21771     return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
21772
21773   // Thread synchronization.
21774   case X86::MONITOR:
21775     return EmitMonitor(MI, BB, Subtarget);
21776
21777   // xbegin
21778   case X86::XBEGIN:
21779     return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
21780
21781   case X86::VASTART_SAVE_XMM_REGS:
21782     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21783
21784   case X86::VAARG_64:
21785     return EmitVAARG64WithCustomInserter(MI, BB);
21786
21787   case X86::EH_SjLj_SetJmp32:
21788   case X86::EH_SjLj_SetJmp64:
21789     return emitEHSjLjSetJmp(MI, BB);
21790
21791   case X86::EH_SjLj_LongJmp32:
21792   case X86::EH_SjLj_LongJmp64:
21793     return emitEHSjLjLongJmp(MI, BB);
21794
21795   case TargetOpcode::STATEPOINT:
21796     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21797     // this point in the process.  We diverge later.
21798     return emitPatchPoint(MI, BB);
21799
21800   case TargetOpcode::STACKMAP:
21801   case TargetOpcode::PATCHPOINT:
21802     return emitPatchPoint(MI, BB);
21803
21804   case X86::VFMADDPDr213r:
21805   case X86::VFMADDPSr213r:
21806   case X86::VFMADDSDr213r:
21807   case X86::VFMADDSSr213r:
21808   case X86::VFMSUBPDr213r:
21809   case X86::VFMSUBPSr213r:
21810   case X86::VFMSUBSDr213r:
21811   case X86::VFMSUBSSr213r:
21812   case X86::VFNMADDPDr213r:
21813   case X86::VFNMADDPSr213r:
21814   case X86::VFNMADDSDr213r:
21815   case X86::VFNMADDSSr213r:
21816   case X86::VFNMSUBPDr213r:
21817   case X86::VFNMSUBPSr213r:
21818   case X86::VFNMSUBSDr213r:
21819   case X86::VFNMSUBSSr213r:
21820   case X86::VFMADDSUBPDr213r:
21821   case X86::VFMADDSUBPSr213r:
21822   case X86::VFMSUBADDPDr213r:
21823   case X86::VFMSUBADDPSr213r:
21824   case X86::VFMADDPDr213rY:
21825   case X86::VFMADDPSr213rY:
21826   case X86::VFMSUBPDr213rY:
21827   case X86::VFMSUBPSr213rY:
21828   case X86::VFNMADDPDr213rY:
21829   case X86::VFNMADDPSr213rY:
21830   case X86::VFNMSUBPDr213rY:
21831   case X86::VFNMSUBPSr213rY:
21832   case X86::VFMADDSUBPDr213rY:
21833   case X86::VFMADDSUBPSr213rY:
21834   case X86::VFMSUBADDPDr213rY:
21835   case X86::VFMSUBADDPSr213rY:
21836     return emitFMA3Instr(MI, BB);
21837   }
21838 }
21839
21840 //===----------------------------------------------------------------------===//
21841 //                           X86 Optimization Hooks
21842 //===----------------------------------------------------------------------===//
21843
21844 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21845                                                       APInt &KnownZero,
21846                                                       APInt &KnownOne,
21847                                                       const SelectionDAG &DAG,
21848                                                       unsigned Depth) const {
21849   unsigned BitWidth = KnownZero.getBitWidth();
21850   unsigned Opc = Op.getOpcode();
21851   assert((Opc >= ISD::BUILTIN_OP_END ||
21852           Opc == ISD::INTRINSIC_WO_CHAIN ||
21853           Opc == ISD::INTRINSIC_W_CHAIN ||
21854           Opc == ISD::INTRINSIC_VOID) &&
21855          "Should use MaskedValueIsZero if you don't know whether Op"
21856          " is a target node!");
21857
21858   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21859   switch (Opc) {
21860   default: break;
21861   case X86ISD::ADD:
21862   case X86ISD::SUB:
21863   case X86ISD::ADC:
21864   case X86ISD::SBB:
21865   case X86ISD::SMUL:
21866   case X86ISD::UMUL:
21867   case X86ISD::INC:
21868   case X86ISD::DEC:
21869   case X86ISD::OR:
21870   case X86ISD::XOR:
21871   case X86ISD::AND:
21872     // These nodes' second result is a boolean.
21873     if (Op.getResNo() == 0)
21874       break;
21875     // Fallthrough
21876   case X86ISD::SETCC:
21877     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21878     break;
21879   case ISD::INTRINSIC_WO_CHAIN: {
21880     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21881     unsigned NumLoBits = 0;
21882     switch (IntId) {
21883     default: break;
21884     case Intrinsic::x86_sse_movmsk_ps:
21885     case Intrinsic::x86_avx_movmsk_ps_256:
21886     case Intrinsic::x86_sse2_movmsk_pd:
21887     case Intrinsic::x86_avx_movmsk_pd_256:
21888     case Intrinsic::x86_mmx_pmovmskb:
21889     case Intrinsic::x86_sse2_pmovmskb_128:
21890     case Intrinsic::x86_avx2_pmovmskb: {
21891       // High bits of movmskp{s|d}, pmovmskb are known zero.
21892       switch (IntId) {
21893         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21894         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21895         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21896         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21897         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21898         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21899         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21900         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21901       }
21902       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21903       break;
21904     }
21905     }
21906     break;
21907   }
21908   }
21909 }
21910
21911 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21912   SDValue Op,
21913   const SelectionDAG &,
21914   unsigned Depth) const {
21915   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21916   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21917     return Op.getValueType().getScalarType().getSizeInBits();
21918
21919   // Fallback case.
21920   return 1;
21921 }
21922
21923 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21924 /// node is a GlobalAddress + offset.
21925 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21926                                        const GlobalValue* &GA,
21927                                        int64_t &Offset) const {
21928   if (N->getOpcode() == X86ISD::Wrapper) {
21929     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21930       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21931       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21932       return true;
21933     }
21934   }
21935   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21936 }
21937
21938 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21939 /// same as extracting the high 128-bit part of 256-bit vector and then
21940 /// inserting the result into the low part of a new 256-bit vector
21941 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21942   EVT VT = SVOp->getValueType(0);
21943   unsigned NumElems = VT.getVectorNumElements();
21944
21945   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21946   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21947     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21948         SVOp->getMaskElt(j) >= 0)
21949       return false;
21950
21951   return true;
21952 }
21953
21954 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21955 /// same as extracting the low 128-bit part of 256-bit vector and then
21956 /// inserting the result into the high part of a new 256-bit vector
21957 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21958   EVT VT = SVOp->getValueType(0);
21959   unsigned NumElems = VT.getVectorNumElements();
21960
21961   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21962   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21963     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21964         SVOp->getMaskElt(j) >= 0)
21965       return false;
21966
21967   return true;
21968 }
21969
21970 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21971 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21972                                         TargetLowering::DAGCombinerInfo &DCI,
21973                                         const X86Subtarget* Subtarget) {
21974   SDLoc dl(N);
21975   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21976   SDValue V1 = SVOp->getOperand(0);
21977   SDValue V2 = SVOp->getOperand(1);
21978   EVT VT = SVOp->getValueType(0);
21979   unsigned NumElems = VT.getVectorNumElements();
21980
21981   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21982       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21983     //
21984     //                   0,0,0,...
21985     //                      |
21986     //    V      UNDEF    BUILD_VECTOR    UNDEF
21987     //     \      /           \           /
21988     //  CONCAT_VECTOR         CONCAT_VECTOR
21989     //         \                  /
21990     //          \                /
21991     //          RESULT: V + zero extended
21992     //
21993     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21994         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21995         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21996       return SDValue();
21997
21998     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21999       return SDValue();
22000
22001     // To match the shuffle mask, the first half of the mask should
22002     // be exactly the first vector, and all the rest a splat with the
22003     // first element of the second one.
22004     for (unsigned i = 0; i != NumElems/2; ++i)
22005       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
22006           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
22007         return SDValue();
22008
22009     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
22010     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
22011       if (Ld->hasNUsesOfValue(1, 0)) {
22012         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
22013         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
22014         SDValue ResNode =
22015           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
22016                                   Ld->getMemoryVT(),
22017                                   Ld->getPointerInfo(),
22018                                   Ld->getAlignment(),
22019                                   false/*isVolatile*/, true/*ReadMem*/,
22020                                   false/*WriteMem*/);
22021
22022         // Make sure the newly-created LOAD is in the same position as Ld in
22023         // terms of dependency. We create a TokenFactor for Ld and ResNode,
22024         // and update uses of Ld's output chain to use the TokenFactor.
22025         if (Ld->hasAnyUseOfValue(1)) {
22026           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
22027                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
22028           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
22029           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
22030                                  SDValue(ResNode.getNode(), 1));
22031         }
22032
22033         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
22034       }
22035     }
22036
22037     // Emit a zeroed vector and insert the desired subvector on its
22038     // first half.
22039     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22040     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
22041     return DCI.CombineTo(N, InsV);
22042   }
22043
22044   //===--------------------------------------------------------------------===//
22045   // Combine some shuffles into subvector extracts and inserts:
22046   //
22047
22048   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
22049   if (isShuffleHigh128VectorInsertLow(SVOp)) {
22050     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
22051     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
22052     return DCI.CombineTo(N, InsV);
22053   }
22054
22055   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
22056   if (isShuffleLow128VectorInsertHigh(SVOp)) {
22057     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
22058     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
22059     return DCI.CombineTo(N, InsV);
22060   }
22061
22062   return SDValue();
22063 }
22064
22065 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
22066 /// possible.
22067 ///
22068 /// This is the leaf of the recursive combinine below. When we have found some
22069 /// chain of single-use x86 shuffle instructions and accumulated the combined
22070 /// shuffle mask represented by them, this will try to pattern match that mask
22071 /// into either a single instruction if there is a special purpose instruction
22072 /// for this operation, or into a PSHUFB instruction which is a fully general
22073 /// instruction but should only be used to replace chains over a certain depth.
22074 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
22075                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
22076                                    TargetLowering::DAGCombinerInfo &DCI,
22077                                    const X86Subtarget *Subtarget) {
22078   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
22079
22080   // Find the operand that enters the chain. Note that multiple uses are OK
22081   // here, we're not going to remove the operand we find.
22082   SDValue Input = Op.getOperand(0);
22083   while (Input.getOpcode() == ISD::BITCAST)
22084     Input = Input.getOperand(0);
22085
22086   MVT VT = Input.getSimpleValueType();
22087   MVT RootVT = Root.getSimpleValueType();
22088   SDLoc DL(Root);
22089
22090   // Just remove no-op shuffle masks.
22091   if (Mask.size() == 1) {
22092     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
22093                   /*AddTo*/ true);
22094     return true;
22095   }
22096
22097   // Use the float domain if the operand type is a floating point type.
22098   bool FloatDomain = VT.isFloatingPoint();
22099
22100   // For floating point shuffles, we don't have free copies in the shuffle
22101   // instructions or the ability to load as part of the instruction, so
22102   // canonicalize their shuffles to UNPCK or MOV variants.
22103   //
22104   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
22105   // vectors because it can have a load folded into it that UNPCK cannot. This
22106   // doesn't preclude something switching to the shorter encoding post-RA.
22107   if (FloatDomain) {
22108     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
22109       bool Lo = Mask.equals(0, 0);
22110       unsigned Shuffle;
22111       MVT ShuffleVT;
22112       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
22113       // is no slower than UNPCKLPD but has the option to fold the input operand
22114       // into even an unaligned memory load.
22115       if (Lo && Subtarget->hasSSE3()) {
22116         Shuffle = X86ISD::MOVDDUP;
22117         ShuffleVT = MVT::v2f64;
22118       } else {
22119         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
22120         // than the UNPCK variants.
22121         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
22122         ShuffleVT = MVT::v4f32;
22123       }
22124       if (Depth == 1 && Root->getOpcode() == Shuffle)
22125         return false; // Nothing to do!
22126       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22127       DCI.AddToWorklist(Op.getNode());
22128       if (Shuffle == X86ISD::MOVDDUP)
22129         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22130       else
22131         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22132       DCI.AddToWorklist(Op.getNode());
22133       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22134                     /*AddTo*/ true);
22135       return true;
22136     }
22137     if (Subtarget->hasSSE3() &&
22138         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22139       bool Lo = Mask.equals(0, 0, 2, 2);
22140       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22141       MVT ShuffleVT = MVT::v4f32;
22142       if (Depth == 1 && Root->getOpcode() == Shuffle)
22143         return false; // Nothing to do!
22144       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22145       DCI.AddToWorklist(Op.getNode());
22146       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22147       DCI.AddToWorklist(Op.getNode());
22148       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22149                     /*AddTo*/ true);
22150       return true;
22151     }
22152     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22153       bool Lo = Mask.equals(0, 0, 1, 1);
22154       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22155       MVT ShuffleVT = MVT::v4f32;
22156       if (Depth == 1 && Root->getOpcode() == Shuffle)
22157         return false; // Nothing to do!
22158       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22159       DCI.AddToWorklist(Op.getNode());
22160       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22161       DCI.AddToWorklist(Op.getNode());
22162       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22163                     /*AddTo*/ true);
22164       return true;
22165     }
22166   }
22167
22168   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22169   // variants as none of these have single-instruction variants that are
22170   // superior to the UNPCK formulation.
22171   if (!FloatDomain &&
22172       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22173        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22174        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22175        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22176                    15))) {
22177     bool Lo = Mask[0] == 0;
22178     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22179     if (Depth == 1 && Root->getOpcode() == Shuffle)
22180       return false; // Nothing to do!
22181     MVT ShuffleVT;
22182     switch (Mask.size()) {
22183     case 8:
22184       ShuffleVT = MVT::v8i16;
22185       break;
22186     case 16:
22187       ShuffleVT = MVT::v16i8;
22188       break;
22189     default:
22190       llvm_unreachable("Impossible mask size!");
22191     };
22192     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22193     DCI.AddToWorklist(Op.getNode());
22194     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22195     DCI.AddToWorklist(Op.getNode());
22196     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22197                   /*AddTo*/ true);
22198     return true;
22199   }
22200
22201   // Don't try to re-form single instruction chains under any circumstances now
22202   // that we've done encoding canonicalization for them.
22203   if (Depth < 2)
22204     return false;
22205
22206   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22207   // can replace them with a single PSHUFB instruction profitably. Intel's
22208   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22209   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22210   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22211     SmallVector<SDValue, 16> PSHUFBMask;
22212     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22213     int Ratio = 16 / Mask.size();
22214     for (unsigned i = 0; i < 16; ++i) {
22215       if (Mask[i / Ratio] == SM_SentinelUndef) {
22216         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22217         continue;
22218       }
22219       int M = Mask[i / Ratio] != SM_SentinelZero
22220                   ? Ratio * Mask[i / Ratio] + i % Ratio
22221                   : 255;
22222       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22223     }
22224     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22225     DCI.AddToWorklist(Op.getNode());
22226     SDValue PSHUFBMaskOp =
22227         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22228     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22229     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22230     DCI.AddToWorklist(Op.getNode());
22231     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22232                   /*AddTo*/ true);
22233     return true;
22234   }
22235
22236   // Failed to find any combines.
22237   return false;
22238 }
22239
22240 /// \brief Fully generic combining of x86 shuffle instructions.
22241 ///
22242 /// This should be the last combine run over the x86 shuffle instructions. Once
22243 /// they have been fully optimized, this will recursively consider all chains
22244 /// of single-use shuffle instructions, build a generic model of the cumulative
22245 /// shuffle operation, and check for simpler instructions which implement this
22246 /// operation. We use this primarily for two purposes:
22247 ///
22248 /// 1) Collapse generic shuffles to specialized single instructions when
22249 ///    equivalent. In most cases, this is just an encoding size win, but
22250 ///    sometimes we will collapse multiple generic shuffles into a single
22251 ///    special-purpose shuffle.
22252 /// 2) Look for sequences of shuffle instructions with 3 or more total
22253 ///    instructions, and replace them with the slightly more expensive SSSE3
22254 ///    PSHUFB instruction if available. We do this as the last combining step
22255 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22256 ///    a suitable short sequence of other instructions. The PSHUFB will either
22257 ///    use a register or have to read from memory and so is slightly (but only
22258 ///    slightly) more expensive than the other shuffle instructions.
22259 ///
22260 /// Because this is inherently a quadratic operation (for each shuffle in
22261 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22262 /// This should never be an issue in practice as the shuffle lowering doesn't
22263 /// produce sequences of more than 8 instructions.
22264 ///
22265 /// FIXME: We will currently miss some cases where the redundant shuffling
22266 /// would simplify under the threshold for PSHUFB formation because of
22267 /// combine-ordering. To fix this, we should do the redundant instruction
22268 /// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          int Depth, bool HasPSHUFB,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget *Subtarget) {
  // Op is the node currently being inspected, Root is the node the walk
  // started from, and RootMask is the mask accumulated so far between Op and
  // Root (negative entries denote zero or undef lanes). The result is true
  // when either a recursive call or combineX86ShuffleChain reports success.

  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;

  // Directly rip through bitcasts to find the underlying operand.
  while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
    Op = Op.getOperand(0);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.
  // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
  // version should be added.
  if (VT.getSizeInBits() != 128)
    return false;

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  if (!isTargetShuffle(Op.getOpcode()))
    return false;
  SmallVector<int, 16> OpMask;
  bool IsUnary;
  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
  // We only can combine unary shuffles which we can decode the mask for.
  if (!HaveMask || !IsUnary)
    return false;

  assert(VT.getVectorNumElements() == OpMask.size() &&
         "Different mask size from vector size!");
  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");
  // The two masks may have different element counts. RootRatio is the number
  // of merged-mask entries covered by one RootMask entry, OpRatio the number
  // covered by one OpMask entry; each is clamped to at least 1, so at most one
  // of them exceeds 1.
  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
  assert(((RootRatio == 1 && OpRatio == 1) ||
          (RootRatio == 1) != (OpRatio == 1)) &&
         "Must not have a ratio for both incoming and op masks!");

  // The merged mask has as many entries as the larger of the two masks.
  SmallVector<int, 16> Mask;
  Mask.reserve(std::max(OpMask.size(), RootMask.size()));

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
  for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask.push_back(RootMask[RootIdx]);
      continue;
    }

    // Scale the root mask entry to the merged mask's granularity, then find
    // which OpMask entry covers that position.
    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask.push_back(OpMask[OpIdx]);
      continue;
    }

    // Ok, we have non-zero lanes, map them through.
    Mask.push_back(OpMask[OpIdx] * OpRatio +
                   RootMaskedIdx % OpRatio);
  }

  // See if we can recurse into the operand to combine more things.
  switch (Op.getOpcode()) {
    case X86ISD::PSHUFB:
      HasPSHUFB = true;
      // Intentional fallthrough: after recording that a PSHUFB was seen, it
      // is recursed through exactly like the other unary PSHUF* nodes.
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
      if (Op.getOperand(0).hasOneUse() &&
          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                        HasPSHUFB, DAG, DCI, Subtarget))
        return true;
      break;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
      // We can't check for single use, we have to check that this shuffle is the only user.
      if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
                                        HasPSHUFB, DAG, DCI, Subtarget))
          return true;
      break;
  }

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 16> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
    // Reset the moved-from scratch vector before the next widening attempt.
    WidenedMask.clear();
  }

  // The recursion did not combine anything (or was not attempted); try to
  // combine the chain that ends at this operation.
  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
                                Subtarget);
}
22387
22388 /// \brief Get the PSHUF-style mask from PSHUF node.
22389 ///
22390 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22391 /// PSHUF-style masks that can be reused with such instructions.
22392 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22393   SmallVector<int, 4> Mask;
22394   bool IsUnary;
22395   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22396   (void)HaveMask;
22397   assert(HaveMask);
22398
22399   switch (N.getOpcode()) {
22400   case X86ISD::PSHUFD:
22401     return Mask;
22402   case X86ISD::PSHUFLW:
22403     Mask.resize(4);
22404     return Mask;
22405   case X86ISD::PSHUFHW:
22406     Mask.erase(Mask.begin(), Mask.begin() + 4);
22407     for (int &M : Mask)
22408       M -= 4;
22409     return Mask;
22410   default:
22411     llvm_unreachable("No valid shuffle instruction found!");
22412   }
22413 }
22414
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
///
/// \param N    The PSHUFD node at the bottom of the chain (asserted).
/// \param Mask The 4-element mask of \p N. It is overwritten in place with
///             the merged mask once a combinable shuffle has been found.
/// \param DAG  Used to create the replacement nodes.
/// \param DCI  Not referenced by this function's body.
/// \returns The top of a freshly built chain that replaces \p N, or an empty
///          SDValue when nothing could be combined.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      // Remember the skipped shuffle so it can be re-created on top of the
      // merged node.
      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      // Remember the skipped shuffle so it can be re-created on top of the
      // merged node.
      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      // UNPCKL looks for a PSHUFLW, UNPCKH for a PSHUFHW.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      // Only a unary unpack (both operands identical) whose input has no
      // other user can be looked through.
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      // Inner walk: step over bitcasts and half-shuffles of the other kind
      // (recording the latter) until the wanted half-shuffle is reached.
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          // Fallthrough!
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        // Reached only via the 'break' above: the wanted shuffle was found.
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  // Re-create the found shuffle on its original input with the merged mask.
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DAG));

  // Rebuild the chain around this new shuffle.
  // Entries are popped in reverse push order, i.e. the node nearest to the
  // merged shuffle is rebuilt first.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    // Insert a bitcast when the recorded node consumed a different type.
    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // Recorded unpacks were unary, so both operands are the new value.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      // Reuse the recorded shuffle's own immediate operand.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
22547
22548 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22549 ///
22550 /// We walk up the chain, skipping shuffles of the other half and looking
22551 /// through shuffles which switch halves trying to find a shuffle of the same
22552 /// pair of dwords.
22553 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22554                                         SelectionDAG &DAG,
22555                                         TargetLowering::DAGCombinerInfo &DCI) {
22556   assert(
22557       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22558       "Called with something other than an x86 128-bit half shuffle!");
22559   SDLoc DL(N);
22560   unsigned CombineOpcode = N.getOpcode();
22561
22562   // Walk up a single-use chain looking for a combinable shuffle.
22563   SDValue V = N.getOperand(0);
22564   for (; V.hasOneUse(); V = V.getOperand(0)) {
22565     switch (V.getOpcode()) {
22566     default:
22567       return false; // Nothing combined!
22568
22569     case ISD::BITCAST:
22570       // Skip bitcasts as we always know the type for the target specific
22571       // instructions.
22572       continue;
22573
22574     case X86ISD::PSHUFLW:
22575     case X86ISD::PSHUFHW:
22576       if (V.getOpcode() == CombineOpcode)
22577         break;
22578
22579       // Other-half shuffles are no-ops.
22580       continue;
22581     }
22582     // Break out of the loop if we break out of the switch.
22583     break;
22584   }
22585
22586   if (!V.hasOneUse())
22587     // We fell out of the loop without finding a viable combining instruction.
22588     return false;
22589
22590   // Combine away the bottom node as its shuffle will be accumulated into
22591   // a preceding shuffle.
22592   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22593
22594   // Record the old value.
22595   SDValue Old = V;
22596
22597   // Merge this node's mask and our incoming mask (adjusted to account for all
22598   // the pshufd instructions encountered).
22599   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22600   for (int &M : Mask)
22601     M = VMask[M];
22602   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22603                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22604
22605   // Check that the shuffles didn't cancel each other out. If not, we need to
22606   // combine to the new one.
22607   if (Old != V)
22608     // Replace the combinable shuffle with the combined one, updating all users
22609     // so that we re-evaluate the chain here.
22610     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22611
22612   return true;
22613 }
22614
22615 /// \brief Try to combine x86 target specific shuffles.
22616 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22617                                            TargetLowering::DAGCombinerInfo &DCI,
22618                                            const X86Subtarget *Subtarget) {
22619   SDLoc DL(N);
22620   MVT VT = N.getSimpleValueType();
22621   SmallVector<int, 4> Mask;
22622
22623   switch (N.getOpcode()) {
22624   case X86ISD::PSHUFD:
22625   case X86ISD::PSHUFLW:
22626   case X86ISD::PSHUFHW:
22627     Mask = getPSHUFShuffleMask(N);
22628     assert(Mask.size() == 4);
22629     break;
22630   default:
22631     return SDValue();
22632   }
22633
22634   // Nuke no-op shuffles that show up after combining.
22635   if (isNoopShuffleMask(Mask))
22636     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22637
22638   // Look for simplifications involving one or two shuffle instructions.
22639   SDValue V = N.getOperand(0);
22640   switch (N.getOpcode()) {
22641   default:
22642     break;
22643   case X86ISD::PSHUFLW:
22644   case X86ISD::PSHUFHW:
22645     assert(VT == MVT::v8i16);
22646     (void)VT;
22647
22648     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22649       return SDValue(); // We combined away this shuffle, so we're done.
22650
22651     // See if this reduces to a PSHUFD which is no more expensive and can
22652     // combine with more operations. Note that it has to at least flip the
22653     // dwords as otherwise it would have been removed as a no-op.
22654     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22655       int DMask[] = {0, 1, 2, 3};
22656       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22657       DMask[DOffset + 0] = DOffset + 1;
22658       DMask[DOffset + 1] = DOffset + 0;
22659       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22660       DCI.AddToWorklist(V.getNode());
22661       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22662                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22663       DCI.AddToWorklist(V.getNode());
22664       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22665     }
22666
22667     // Look for shuffle patterns which can be implemented as a single unpack.
22668     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22669     // only works when we have a PSHUFD followed by two half-shuffles.
22670     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22671         (V.getOpcode() == X86ISD::PSHUFLW ||
22672          V.getOpcode() == X86ISD::PSHUFHW) &&
22673         V.getOpcode() != N.getOpcode() &&
22674         V.hasOneUse()) {
22675       SDValue D = V.getOperand(0);
22676       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22677         D = D.getOperand(0);
22678       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22679         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22680         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22681         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22682         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22683         int WordMask[8];
22684         for (int i = 0; i < 4; ++i) {
22685           WordMask[i + NOffset] = Mask[i] + NOffset;
22686           WordMask[i + VOffset] = VMask[i] + VOffset;
22687         }
22688         // Map the word mask through the DWord mask.
22689         int MappedMask[8];
22690         for (int i = 0; i < 8; ++i)
22691           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22692         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22693         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22694         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22695                        std::begin(UnpackLoMask)) ||
22696             std::equal(std::begin(MappedMask), std::end(MappedMask),
22697                        std::begin(UnpackHiMask))) {
22698           // We can replace all three shuffles with an unpack.
22699           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22700           DCI.AddToWorklist(V.getNode());
22701           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22702                                                 : X86ISD::UNPCKH,
22703                              DL, MVT::v8i16, V, V);
22704         }
22705       }
22706     }
22707
22708     break;
22709
22710   case X86ISD::PSHUFD:
22711     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22712       return NewN;
22713
22714     break;
22715   }
22716
22717   return SDValue();
22718 }
22719
22720 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22721 ///
22722 /// We combine this directly on the abstract vector shuffle nodes so it is
22723 /// easier to generically match. We also insert dummy vector shuffle nodes for
22724 /// the operands which explicitly discard the lanes which are unused by this
22725 /// operation to try to flow through the rest of the combiner the fact that
22726 /// they're unused.
22727 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22728   SDLoc DL(N);
22729   EVT VT = N->getValueType(0);
22730
22731   // We only handle target-independent shuffles.
22732   // FIXME: It would be easy and harmless to use the target shuffle mask
22733   // extraction tool to support more.
22734   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22735     return SDValue();
22736
22737   auto *SVN = cast<ShuffleVectorSDNode>(N);
22738   ArrayRef<int> Mask = SVN->getMask();
22739   SDValue V1 = N->getOperand(0);
22740   SDValue V2 = N->getOperand(1);
22741
22742   // We require the first shuffle operand to be the SUB node, and the second to
22743   // be the ADD node.
22744   // FIXME: We should support the commuted patterns.
22745   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22746     return SDValue();
22747
22748   // If there are other uses of these operations we can't fold them.
22749   if (!V1->hasOneUse() || !V2->hasOneUse())
22750     return SDValue();
22751
22752   // Ensure that both operations have the same operands. Note that we can
22753   // commute the FADD operands.
22754   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22755   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22756       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22757     return SDValue();
22758
22759   // We're looking for blends between FADD and FSUB nodes. We insist on these
22760   // nodes being lined up in a specific expected pattern.
22761   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22762         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22763         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22764     return SDValue();
22765
22766   // Only specific types are legal at this point, assert so we notice if and
22767   // when these change.
22768   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22769           VT == MVT::v4f64) &&
22770          "Unknown vector type encountered!");
22771
22772   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22773 }
22774
22775 /// PerformShuffleCombine - Performs several different shuffle combines.
22776 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22777                                      TargetLowering::DAGCombinerInfo &DCI,
22778                                      const X86Subtarget *Subtarget) {
22779   SDLoc dl(N);
22780   SDValue N0 = N->getOperand(0);
22781   SDValue N1 = N->getOperand(1);
22782   EVT VT = N->getValueType(0);
22783
22784   // Don't create instructions with illegal types after legalize types has run.
22785   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22786   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22787     return SDValue();
22788
22789   // If we have legalized the vector types, look for blends of FADD and FSUB
22790   // nodes that we can fuse into an ADDSUB node.
22791   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22792     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22793       return AddSub;
22794
22795   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22796   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22797       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22798     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22799
22800   // During Type Legalization, when promoting illegal vector types,
22801   // the backend might introduce new shuffle dag nodes and bitcasts.
22802   //
22803   // This code performs the following transformation:
22804   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22805   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22806   //
22807   // We do this only if both the bitcast and the BINOP dag nodes have
22808   // one use. Also, perform this transformation only if the new binary
22809   // operation is legal. This is to avoid introducing dag nodes that
22810   // potentially need to be further expanded (or custom lowered) into a
22811   // less optimal sequence of dag nodes.
22812   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22813       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22814       N0.getOpcode() == ISD::BITCAST) {
22815     SDValue BC0 = N0.getOperand(0);
22816     EVT SVT = BC0.getValueType();
22817     unsigned Opcode = BC0.getOpcode();
22818     unsigned NumElts = VT.getVectorNumElements();
22819
22820     if (BC0.hasOneUse() && SVT.isVector() &&
22821         SVT.getVectorNumElements() * 2 == NumElts &&
22822         TLI.isOperationLegal(Opcode, VT)) {
22823       bool CanFold = false;
22824       switch (Opcode) {
22825       default : break;
22826       case ISD::ADD :
22827       case ISD::FADD :
22828       case ISD::SUB :
22829       case ISD::FSUB :
22830       case ISD::MUL :
22831       case ISD::FMUL :
22832         CanFold = true;
22833       }
22834
22835       unsigned SVTNumElts = SVT.getVectorNumElements();
22836       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22837       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22838         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22839       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22840         CanFold = SVOp->getMaskElt(i) < 0;
22841
22842       if (CanFold) {
22843         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22844         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22845         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22846         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22847       }
22848     }
22849   }
22850
22851   // Only handle 128 wide vector from here on.
22852   if (!VT.is128BitVector())
22853     return SDValue();
22854
22855   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22856   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22857   // consecutive, non-overlapping, and in the right order.
22858   SmallVector<SDValue, 16> Elts;
22859   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22860     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22861
22862   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22863   if (LD.getNode())
22864     return LD;
22865
22866   if (isTargetShuffle(N->getOpcode())) {
22867     SDValue Shuffle =
22868         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22869     if (Shuffle.getNode())
22870       return Shuffle;
22871
22872     // Try recursively combining arbitrary sequences of x86 shuffle
22873     // instructions into higher-order shuffles. We do this after combining
22874     // specific PSHUF instruction sequences into their minimal form so that we
22875     // can evaluate how many specialized shuffle instructions are involved in
22876     // a particular chain.
22877     SmallVector<int, 1> NonceMask; // Just a placeholder.
22878     NonceMask.push_back(0);
22879     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22880                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22881                                       DCI, Subtarget))
22882       return SDValue(); // This routine will use CombineTo to replace N.
22883   }
22884
22885   return SDValue();
22886 }
22887
22888 /// PerformTruncateCombine - Converts truncate operation to
22889 /// a sequence of vector shuffle operations.
22890 /// It is possible when we truncate 256-bit vector to 128-bit vector
22891 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22892                                       TargetLowering::DAGCombinerInfo &DCI,
22893                                       const X86Subtarget *Subtarget)  {
22894   return SDValue();
22895 }
22896
22897 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22898 /// specific shuffle of a load can be folded into a single element load.
22899 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22900 /// shuffles have been custom lowered so we need to handle those here.
22901 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22902                                          TargetLowering::DAGCombinerInfo &DCI) {
22903   if (DCI.isBeforeLegalizeOps())
22904     return SDValue();
22905
22906   SDValue InVec = N->getOperand(0);
22907   SDValue EltNo = N->getOperand(1);
22908
22909   if (!isa<ConstantSDNode>(EltNo))
22910     return SDValue();
22911
22912   EVT OriginalVT = InVec.getValueType();
22913
22914   if (InVec.getOpcode() == ISD::BITCAST) {
22915     // Don't duplicate a load with other uses.
22916     if (!InVec.hasOneUse())
22917       return SDValue();
22918     EVT BCVT = InVec.getOperand(0).getValueType();
22919     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22920       return SDValue();
22921     InVec = InVec.getOperand(0);
22922   }
22923
22924   EVT CurrentVT = InVec.getValueType();
22925
22926   if (!isTargetShuffle(InVec.getOpcode()))
22927     return SDValue();
22928
22929   // Don't duplicate a load with other uses.
22930   if (!InVec.hasOneUse())
22931     return SDValue();
22932
22933   SmallVector<int, 16> ShuffleMask;
22934   bool UnaryShuffle;
22935   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22936                             ShuffleMask, UnaryShuffle))
22937     return SDValue();
22938
22939   // Select the input vector, guarding against out of range extract vector.
22940   unsigned NumElems = CurrentVT.getVectorNumElements();
22941   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22942   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22943   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22944                                          : InVec.getOperand(1);
22945
22946   // If inputs to shuffle are the same for both ops, then allow 2 uses
22947   unsigned AllowedUses = InVec.getNumOperands() > 1 &&
22948                          InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22949
22950   if (LdNode.getOpcode() == ISD::BITCAST) {
22951     // Don't duplicate a load with other uses.
22952     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22953       return SDValue();
22954
22955     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22956     LdNode = LdNode.getOperand(0);
22957   }
22958
22959   if (!ISD::isNormalLoad(LdNode.getNode()))
22960     return SDValue();
22961
22962   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22963
22964   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22965     return SDValue();
22966
22967   EVT EltVT = N->getValueType(0);
22968   // If there's a bitcast before the shuffle, check if the load type and
22969   // alignment is valid.
22970   unsigned Align = LN0->getAlignment();
22971   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22972   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22973       EltVT.getTypeForEVT(*DAG.getContext()));
22974
22975   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22976     return SDValue();
22977
22978   // All checks match so transform back to vector_shuffle so that DAG combiner
22979   // can finish the job
22980   SDLoc dl(N);
22981
22982   // Create shuffle node taking into account the case that its a unary shuffle
22983   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22984                                    : InVec.getOperand(1);
22985   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22986                                  InVec.getOperand(0), Shuffle,
22987                                  &ShuffleMask[0]);
22988   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22989   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22990                      EltNo);
22991 }
22992
22993 /// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
22994 /// special and don't usually play with other vector types, it's better to
22995 /// handle them early to be sure we emit efficient code by avoiding
22996 /// store-load conversions.
22997 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
22998   if (N->getValueType(0) != MVT::x86mmx ||
22999       N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
23000       N->getOperand(0)->getValueType(0) != MVT::v2i32)
23001     return SDValue();
23002
23003   SDValue V = N->getOperand(0);
23004   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
23005   if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
23006     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
23007                        N->getValueType(0), V.getOperand(0));
23008
23009   return SDValue();
23010 }
23011
23012 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
23013 /// generation and convert it from being a bunch of shuffles and extracts
23014 /// into a somewhat faster sequence. For i686, the best sequence is apparently
23015 /// storing the value and loading scalars back, while for x64 we should
23016 /// use 64-bit extracts and shifts.
23017 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
23018                                          TargetLowering::DAGCombinerInfo &DCI) {
23019   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
23020   if (NewOp.getNode())
23021     return NewOp;
23022
23023   SDValue InputVector = N->getOperand(0);
23024
23025   // Detect mmx to i32 conversion through a v2i32 elt extract.
23026   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
23027       N->getValueType(0) == MVT::i32 &&
23028       InputVector.getValueType() == MVT::v2i32) {
23029
23030     // The bitcast source is a direct mmx result.
23031     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
23032     if (MMXSrc.getValueType() == MVT::x86mmx)
23033       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23034                          N->getValueType(0),
23035                          InputVector.getNode()->getOperand(0));
23036
23037     // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
23038     SDValue MMXSrcOp = MMXSrc.getOperand(0);
23039     if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
23040         MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
23041         MMXSrcOp.getOpcode() == ISD::BITCAST &&
23042         MMXSrcOp.getValueType() == MVT::v1i64 &&
23043         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
23044       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
23045                          N->getValueType(0),
23046                          MMXSrcOp.getOperand(0));
23047   }
23048
23049   // Only operate on vectors of 4 elements, where the alternative shuffling
23050   // gets to be more expensive.
23051   if (InputVector.getValueType() != MVT::v4i32)
23052     return SDValue();
23053
23054   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
23055   // single use which is a sign-extend or zero-extend, and all elements are
23056   // used.
23057   SmallVector<SDNode *, 4> Uses;
23058   unsigned ExtractedElements = 0;
23059   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
23060        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
23061     if (UI.getUse().getResNo() != InputVector.getResNo())
23062       return SDValue();
23063
23064     SDNode *Extract = *UI;
23065     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23066       return SDValue();
23067
23068     if (Extract->getValueType(0) != MVT::i32)
23069       return SDValue();
23070     if (!Extract->hasOneUse())
23071       return SDValue();
23072     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
23073         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
23074       return SDValue();
23075     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
23076       return SDValue();
23077
23078     // Record which element was extracted.
23079     ExtractedElements |=
23080       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
23081
23082     Uses.push_back(Extract);
23083   }
23084
23085   // If not all the elements were used, this may not be worthwhile.
23086   if (ExtractedElements != 15)
23087     return SDValue();
23088
23089   // Ok, we've now decided to do the transformation.
23090   // If 64-bit shifts are legal, use the extract-shift sequence,
23091   // otherwise bounce the vector off the cache.
23092   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23093   SDValue Vals[4];
23094   SDLoc dl(InputVector);
23095
23096   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
23097     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
23098     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
23099     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23100       DAG.getConstant(0, VecIdxTy));
23101     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
23102       DAG.getConstant(1, VecIdxTy));
23103
23104     SDValue ShAmt = DAG.getConstant(32,
23105       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
23106     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
23107     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23108       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
23109     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
23110     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
23111       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
23112   } else {
23113     // Store the value to a temporary stack slot.
23114     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
23115     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
23116       MachinePointerInfo(), false, false, 0);
23117
23118     EVT ElementType = InputVector.getValueType().getVectorElementType();
23119     unsigned EltSize = ElementType.getSizeInBits() / 8;
23120
23121     // Replace each use (extract) with a load of the appropriate element.
23122     for (unsigned i = 0; i < 4; ++i) {
23123       uint64_t Offset = EltSize * i;
23124       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
23125
23126       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
23127                                        StackPtr, OffsetVal);
23128
23129       // Load the scalar.
23130       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
23131                             ScalarAddr, MachinePointerInfo(),
23132                             false, false, false, 0);
23133
23134     }
23135   }
23136
23137   // Replace the extracts
23138   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
23139     UE = Uses.end(); UI != UE; ++UI) {
23140     SDNode *Extract = *UI;
23141
23142     SDValue Idx = Extract->getOperand(1);
23143     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
23144     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
23145   }
23146
23147   // The replacement was made in place; don't return anything.
23148   return SDValue();
23149 }
23150
23151 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
23152 static std::pair<unsigned, bool>
23153 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
23154                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
23155   if (!VT.isVector())
23156     return std::make_pair(0, false);
23157
23158   bool NeedSplit = false;
23159   switch (VT.getSimpleVT().SimpleTy) {
23160   default: return std::make_pair(0, false);
23161   case MVT::v4i64:
23162   case MVT::v2i64:
23163     if (!Subtarget->hasVLX())
23164       return std::make_pair(0, false);
23165     break;
23166   case MVT::v64i8:
23167   case MVT::v32i16:
23168     if (!Subtarget->hasBWI())
23169       return std::make_pair(0, false);
23170     break;
23171   case MVT::v16i32:
23172   case MVT::v8i64:
23173     if (!Subtarget->hasAVX512())
23174       return std::make_pair(0, false);
23175     break;
23176   case MVT::v32i8:
23177   case MVT::v16i16:
23178   case MVT::v8i32:
23179     if (!Subtarget->hasAVX2())
23180       NeedSplit = true;
23181     if (!Subtarget->hasAVX())
23182       return std::make_pair(0, false);
23183     break;
23184   case MVT::v16i8:
23185   case MVT::v8i16:
23186   case MVT::v4i32:
23187     if (!Subtarget->hasSSE2())
23188       return std::make_pair(0, false);
23189   }
23190
23191   // SSE2 has only a small subset of the operations.
23192   bool hasUnsigned = Subtarget->hasSSE41() ||
23193                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23194   bool hasSigned = Subtarget->hasSSE41() ||
23195                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23196
23197   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23198
23199   unsigned Opc = 0;
23200   // Check for x CC y ? x : y.
23201   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23202       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23203     switch (CC) {
23204     default: break;
23205     case ISD::SETULT:
23206     case ISD::SETULE:
23207       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23208     case ISD::SETUGT:
23209     case ISD::SETUGE:
23210       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23211     case ISD::SETLT:
23212     case ISD::SETLE:
23213       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23214     case ISD::SETGT:
23215     case ISD::SETGE:
23216       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23217     }
23218   // Check for x CC y ? y : x -- a min/max with reversed arms.
23219   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23220              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23221     switch (CC) {
23222     default: break;
23223     case ISD::SETULT:
23224     case ISD::SETULE:
23225       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23226     case ISD::SETUGT:
23227     case ISD::SETUGE:
23228       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23229     case ISD::SETLT:
23230     case ISD::SETLE:
23231       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23232     case ISD::SETGT:
23233     case ISD::SETGE:
23234       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23235     }
23236   }
23237
23238   return std::make_pair(Opc, NeedSplit);
23239 }
23240
23241 static SDValue
23242 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23243                                       const X86Subtarget *Subtarget) {
23244   SDLoc dl(N);
23245   SDValue Cond = N->getOperand(0);
23246   SDValue LHS = N->getOperand(1);
23247   SDValue RHS = N->getOperand(2);
23248
23249   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23250     SDValue CondSrc = Cond->getOperand(0);
23251     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23252       Cond = CondSrc->getOperand(0);
23253   }
23254
23255   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23256     return SDValue();
23257
23258   // A vselect where all conditions and data are constants can be optimized into
23259   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23260   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23261       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23262     return SDValue();
23263
23264   unsigned MaskValue = 0;
23265   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23266     return SDValue();
23267
23268   MVT VT = N->getSimpleValueType(0);
23269   unsigned NumElems = VT.getVectorNumElements();
23270   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23271   for (unsigned i = 0; i < NumElems; ++i) {
23272     // Be sure we emit undef where we can.
23273     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23274       ShuffleMask[i] = -1;
23275     else
23276       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23277   }
23278
23279   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23280   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23281     return SDValue();
23282   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23283 }
23284
23285 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23286 /// nodes.
23287 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23288                                     TargetLowering::DAGCombinerInfo &DCI,
23289                                     const X86Subtarget *Subtarget) {
23290   SDLoc DL(N);
23291   SDValue Cond = N->getOperand(0);
23292   // Get the LHS/RHS of the select.
23293   SDValue LHS = N->getOperand(1);
23294   SDValue RHS = N->getOperand(2);
23295   EVT VT = LHS.getValueType();
23296   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23297
23298   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23299   // instructions match the semantics of the common C idiom x<y?x:y but not
23300   // x<=y?x:y, because of how they handle negative zero (which can be
23301   // ignored in unsafe-math mode).
23302   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23303   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23304       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23305       (Subtarget->hasSSE2() ||
23306        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23307     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23308
23309     unsigned Opcode = 0;
23310     // Check for x CC y ? x : y.
23311     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23312         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23313       switch (CC) {
23314       default: break;
23315       case ISD::SETULT:
23316         // Converting this to a min would handle NaNs incorrectly, and swapping
23317         // the operands would cause it to handle comparisons between positive
23318         // and negative zero incorrectly.
23319         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23320           if (!DAG.getTarget().Options.UnsafeFPMath &&
23321               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23322             break;
23323           std::swap(LHS, RHS);
23324         }
23325         Opcode = X86ISD::FMIN;
23326         break;
23327       case ISD::SETOLE:
23328         // Converting this to a min would handle comparisons between positive
23329         // and negative zero incorrectly.
23330         if (!DAG.getTarget().Options.UnsafeFPMath &&
23331             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23332           break;
23333         Opcode = X86ISD::FMIN;
23334         break;
23335       case ISD::SETULE:
23336         // Converting this to a min would handle both negative zeros and NaNs
23337         // incorrectly, but we can swap the operands to fix both.
23338         std::swap(LHS, RHS);
23339       case ISD::SETOLT:
23340       case ISD::SETLT:
23341       case ISD::SETLE:
23342         Opcode = X86ISD::FMIN;
23343         break;
23344
23345       case ISD::SETOGE:
23346         // Converting this to a max would handle comparisons between positive
23347         // and negative zero incorrectly.
23348         if (!DAG.getTarget().Options.UnsafeFPMath &&
23349             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23350           break;
23351         Opcode = X86ISD::FMAX;
23352         break;
23353       case ISD::SETUGT:
23354         // Converting this to a max would handle NaNs incorrectly, and swapping
23355         // the operands would cause it to handle comparisons between positive
23356         // and negative zero incorrectly.
23357         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23358           if (!DAG.getTarget().Options.UnsafeFPMath &&
23359               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23360             break;
23361           std::swap(LHS, RHS);
23362         }
23363         Opcode = X86ISD::FMAX;
23364         break;
23365       case ISD::SETUGE:
23366         // Converting this to a max would handle both negative zeros and NaNs
23367         // incorrectly, but we can swap the operands to fix both.
23368         std::swap(LHS, RHS);
23369       case ISD::SETOGT:
23370       case ISD::SETGT:
23371       case ISD::SETGE:
23372         Opcode = X86ISD::FMAX;
23373         break;
23374       }
23375     // Check for x CC y ? y : x -- a min/max with reversed arms.
23376     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23377                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23378       switch (CC) {
23379       default: break;
23380       case ISD::SETOGE:
23381         // Converting this to a min would handle comparisons between positive
23382         // and negative zero incorrectly, and swapping the operands would
23383         // cause it to handle NaNs incorrectly.
23384         if (!DAG.getTarget().Options.UnsafeFPMath &&
23385             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23386           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23387             break;
23388           std::swap(LHS, RHS);
23389         }
23390         Opcode = X86ISD::FMIN;
23391         break;
23392       case ISD::SETUGT:
23393         // Converting this to a min would handle NaNs incorrectly.
23394         if (!DAG.getTarget().Options.UnsafeFPMath &&
23395             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23396           break;
23397         Opcode = X86ISD::FMIN;
23398         break;
23399       case ISD::SETUGE:
23400         // Converting this to a min would handle both negative zeros and NaNs
23401         // incorrectly, but we can swap the operands to fix both.
23402         std::swap(LHS, RHS);
23403       case ISD::SETOGT:
23404       case ISD::SETGT:
23405       case ISD::SETGE:
23406         Opcode = X86ISD::FMIN;
23407         break;
23408
23409       case ISD::SETULT:
23410         // Converting this to a max would handle NaNs incorrectly.
23411         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23412           break;
23413         Opcode = X86ISD::FMAX;
23414         break;
23415       case ISD::SETOLE:
23416         // Converting this to a max would handle comparisons between positive
23417         // and negative zero incorrectly, and swapping the operands would
23418         // cause it to handle NaNs incorrectly.
23419         if (!DAG.getTarget().Options.UnsafeFPMath &&
23420             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23421           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23422             break;
23423           std::swap(LHS, RHS);
23424         }
23425         Opcode = X86ISD::FMAX;
23426         break;
23427       case ISD::SETULE:
23428         // Converting this to a max would handle both negative zeros and NaNs
23429         // incorrectly, but we can swap the operands to fix both.
23430         std::swap(LHS, RHS);
23431       case ISD::SETOLT:
23432       case ISD::SETLT:
23433       case ISD::SETLE:
23434         Opcode = X86ISD::FMAX;
23435         break;
23436       }
23437     }
23438
23439     if (Opcode)
23440       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23441   }
23442
23443   EVT CondVT = Cond.getValueType();
23444   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23445       CondVT.getVectorElementType() == MVT::i1) {
23446     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23447     // lowering on KNL. In this case we convert it to
23448     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23449     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23450     // Since SKX these selects have a proper lowering.
23451     EVT OpVT = LHS.getValueType();
23452     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23453         (OpVT.getVectorElementType() == MVT::i8 ||
23454          OpVT.getVectorElementType() == MVT::i16) &&
23455         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23456       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23457       DCI.AddToWorklist(Cond.getNode());
23458       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23459     }
23460   }
23461   // If this is a select between two integer constants, try to do some
23462   // optimizations.
23463   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23464     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23465       // Don't do this for crazy integer types.
23466       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23467         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23468         // so that TrueC (the true value) is larger than FalseC.
23469         bool NeedsCondInvert = false;
23470
23471         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23472             // Efficiently invertible.
23473             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23474              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23475               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23476           NeedsCondInvert = true;
23477           std::swap(TrueC, FalseC);
23478         }
23479
23480         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23481         if (FalseC->getAPIntValue() == 0 &&
23482             TrueC->getAPIntValue().isPowerOf2()) {
23483           if (NeedsCondInvert) // Invert the condition if needed.
23484             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23485                                DAG.getConstant(1, Cond.getValueType()));
23486
23487           // Zero extend the condition if needed.
23488           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23489
23490           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23491           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23492                              DAG.getConstant(ShAmt, MVT::i8));
23493         }
23494
23495         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
23496         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23497           if (NeedsCondInvert) // Invert the condition if needed.
23498             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23499                                DAG.getConstant(1, Cond.getValueType()));
23500
23501           // Zero extend the condition if needed.
23502           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23503                              FalseC->getValueType(0), Cond);
23504           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23505                              SDValue(FalseC, 0));
23506         }
23507
23508         // Optimize cases that will turn into an LEA instruction.  This requires
23509         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23510         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23511           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23512           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23513
23514           bool isFastMultiplier = false;
23515           if (Diff < 10) {
23516             switch ((unsigned char)Diff) {
23517               default: break;
23518               case 1:  // result = add base, cond
23519               case 2:  // result = lea base(    , cond*2)
23520               case 3:  // result = lea base(cond, cond*2)
23521               case 4:  // result = lea base(    , cond*4)
23522               case 5:  // result = lea base(cond, cond*4)
23523               case 8:  // result = lea base(    , cond*8)
23524               case 9:  // result = lea base(cond, cond*8)
23525                 isFastMultiplier = true;
23526                 break;
23527             }
23528           }
23529
23530           if (isFastMultiplier) {
23531             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23532             if (NeedsCondInvert) // Invert the condition if needed.
23533               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23534                                  DAG.getConstant(1, Cond.getValueType()));
23535
23536             // Zero extend the condition if needed.
23537             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23538                                Cond);
23539             // Scale the condition by the difference.
23540             if (Diff != 1)
23541               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23542                                  DAG.getConstant(Diff, Cond.getValueType()));
23543
23544             // Add the base if non-zero.
23545             if (FalseC->getAPIntValue() != 0)
23546               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23547                                  SDValue(FalseC, 0));
23548             return Cond;
23549           }
23550         }
23551       }
23552   }
23553
23554   // Canonicalize max and min:
23555   // (x > y) ? x : y -> (x >= y) ? x : y
23556   // (x < y) ? x : y -> (x <= y) ? x : y
23557   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23558   // the need for an extra compare
23559   // against zero. e.g.
23560   // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
23561   // subl   %esi, %edi
23562   // testl  %edi, %edi
23563   // movl   $0, %eax
23564   // cmovgl %edi, %eax
23565   // =>
23566   // xorl   %eax, %eax
23567   // subl   %esi, $edi
23568   // cmovsl %eax, %edi
23569   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23570       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23571       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23572     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23573     switch (CC) {
23574     default: break;
23575     case ISD::SETLT:
23576     case ISD::SETGT: {
23577       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23578       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23579                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23580       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23581     }
23582     }
23583   }
23584
23585   // Early exit check
23586   if (!TLI.isTypeLegal(VT))
23587     return SDValue();
23588
23589   // Match VSELECTs into subs with unsigned saturation.
23590   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23591       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23592       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23593        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23594     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23595
23596     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23597     // left side invert the predicate to simplify logic below.
23598     SDValue Other;
23599     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23600       Other = RHS;
23601       CC = ISD::getSetCCInverse(CC, true);
23602     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23603       Other = LHS;
23604     }
23605
23606     if (Other.getNode() && Other->getNumOperands() == 2 &&
23607         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23608       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23609       SDValue CondRHS = Cond->getOperand(1);
23610
23611       // Look for a general sub with unsigned saturation first.
23612       // x >= y ? x-y : 0 --> subus x, y
23613       // x >  y ? x-y : 0 --> subus x, y
23614       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23615           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23616         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23617
23618       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23619         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23620           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23621             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23622               // If the RHS is a constant we have to reverse the const
23623               // canonicalization.
23624               // x > C-1 ? x+-C : 0 --> subus x, C
23625               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23626                   CondRHSConst->getAPIntValue() ==
23627                       (-OpRHSConst->getAPIntValue() - 1))
23628                 return DAG.getNode(
23629                     X86ISD::SUBUS, DL, VT, OpLHS,
23630                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23631
23632           // Another special case: If C was a sign bit, the sub has been
23633           // canonicalized into a xor.
23634           // FIXME: Would it be better to use computeKnownBits to determine
23635           //        whether it's safe to decanonicalize the xor?
23636           // x s< 0 ? x^C : 0 --> subus x, C
23637           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23638               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23639               OpRHSConst->getAPIntValue().isSignBit())
23640             // Note that we have to rebuild the RHS constant here to ensure we
23641             // don't rely on particular values of undef lanes.
23642             return DAG.getNode(
23643                 X86ISD::SUBUS, DL, VT, OpLHS,
23644                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23645         }
23646     }
23647   }
23648
23649   // Try to match a min/max vector operation.
23650   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23651     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23652     unsigned Opc = ret.first;
23653     bool NeedSplit = ret.second;
23654
23655     if (Opc && NeedSplit) {
23656       unsigned NumElems = VT.getVectorNumElements();
23657       // Extract the LHS vectors
23658       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23659       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23660
23661       // Extract the RHS vectors
23662       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23663       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23664
23665       // Create min/max for each subvector
23666       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23667       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23668
23669       // Merge the result
23670       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23671     } else if (Opc)
23672       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23673   }
23674
23675   // Simplify vector selection if condition value type matches vselect
23676   // operand type
23677   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23678     assert(Cond.getValueType().isVector() &&
23679            "vector select expects a vector selector!");
23680
23681     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23682     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23683
23684     // Try invert the condition if true value is not all 1s and false value
23685     // is not all 0s.
23686     if (!TValIsAllOnes && !FValIsAllZeros &&
23687         // Check if the selector will be produced by CMPP*/PCMP*
23688         Cond.getOpcode() == ISD::SETCC &&
23689         // Check if SETCC has already been promoted
23690         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23691       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23692       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23693
23694       if (TValIsAllZeros || FValIsAllOnes) {
23695         SDValue CC = Cond.getOperand(2);
23696         ISD::CondCode NewCC =
23697           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23698                                Cond.getOperand(0).getValueType().isInteger());
23699         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23700         std::swap(LHS, RHS);
23701         TValIsAllOnes = FValIsAllOnes;
23702         FValIsAllZeros = TValIsAllZeros;
23703       }
23704     }
23705
23706     if (TValIsAllOnes || FValIsAllZeros) {
23707       SDValue Ret;
23708
23709       if (TValIsAllOnes && FValIsAllZeros)
23710         Ret = Cond;
23711       else if (TValIsAllOnes)
23712         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23713                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23714       else if (FValIsAllZeros)
23715         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23716                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23717
23718       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23719     }
23720   }
23721
23722   // If we know that this node is legal then we know that it is going to be
23723   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23724   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23725   // to simplify previous instructions.
23726   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23727       !DCI.isBeforeLegalize() &&
23728       // We explicitly check against v8i16 and v16i16 because, although
23729       // they're marked as Custom, they might only be legal when Cond is a
23730       // build_vector of constants. This will be taken care in a later
23731       // condition.
23732       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23733        VT != MVT::v8i16) &&
23734       // Don't optimize vector of constants. Those are handled by
23735       // the generic code and all the bits must be properly set for
23736       // the generic optimizer.
23737       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23738     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23739
23740     // Don't optimize vector selects that map to mask-registers.
23741     if (BitWidth == 1)
23742       return SDValue();
23743
23744     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
23745     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23746
23747     APInt KnownZero, KnownOne;
23748     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23749                                           DCI.isBeforeLegalizeOps());
23750     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23751         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23752                                  TLO)) {
23753       // If we changed the computation somewhere in the DAG, this change
23754       // will affect all users of Cond.
23755       // Make sure it is fine and update all the nodes so that we do not
23756       // use the generic VSELECT anymore. Otherwise, we may perform
23757       // wrong optimizations as we messed up with the actual expectation
23758       // for the vector boolean values.
23759       if (Cond != TLO.Old) {
23760         // Check all uses of that condition operand to check whether it will be
23761         // consumed by non-BLEND instructions, which may depend on all bits are
23762         // set properly.
23763         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23764              I != E; ++I)
23765           if (I->getOpcode() != ISD::VSELECT)
23766             // TODO: Add other opcodes eventually lowered into BLEND.
23767             return SDValue();
23768
23769         // Update all the users of the condition, before committing the change,
23770         // so that the VSELECT optimizations that expect the correct vector
23771         // boolean value will not be triggered.
23772         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23773              I != E; ++I)
23774           DAG.ReplaceAllUsesOfValueWith(
23775               SDValue(*I, 0),
23776               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23777                           Cond, I->getOperand(1), I->getOperand(2)));
23778         DCI.CommitTargetLoweringOpt(TLO);
23779         return SDValue();
23780       }
23781       // At this point, only Cond is changed. Change the condition
23782       // just for N to keep the opportunity to optimize all other
23783       // users their own way.
23784       DAG.ReplaceAllUsesOfValueWith(
23785           SDValue(N, 0),
23786           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23787                       TLO.New, N->getOperand(1), N->getOperand(2)));
23788       return SDValue();
23789     }
23790   }
23791
23792   // We should generate an X86ISD::BLENDI from a vselect if its argument
23793   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23794   // constants. This specific pattern gets generated when we split a
23795   // selector for a 512 bit vector in a machine without AVX512 (but with
23796   // 256-bit vectors), during legalization:
23797   //
23798   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23799   //
23800   // Iff we find this pattern and the build_vectors are built from
23801   // constants, we translate the vselect into a shuffle_vector that we
23802   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23803   if ((N->getOpcode() == ISD::VSELECT ||
23804        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23805       !DCI.isBeforeLegalize()) {
23806     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23807     if (Shuffle.getNode())
23808       return Shuffle;
23809   }
23810
23811   return SDValue();
23812 }
23813
23814 // Check whether a boolean test is testing a boolean value generated by
23815 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23816 // code.
23817 //
23818 // Simplify the following patterns:
23819 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23820 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23821 // to (Op EFLAGS Cond)
23822 //
23823 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23824 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23825 // to (Op EFLAGS !Cond)
23826 //
23827 // where Op could be BRCOND or CMOV.
23828 //
23829 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23830   // Quit if not CMP and SUB with its value result used.
23831   if (Cmp.getOpcode() != X86ISD::CMP &&
23832       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23833       return SDValue();
23834
23835   // Quit if not used as a boolean value.
23836   if (CC != X86::COND_E && CC != X86::COND_NE)
23837     return SDValue();
23838
23839   // Check CMP operands. One of them should be 0 or 1 and the other should be
23840   // an SetCC or extended from it.
23841   SDValue Op1 = Cmp.getOperand(0);
23842   SDValue Op2 = Cmp.getOperand(1);
23843
23844   SDValue SetCC;
23845   const ConstantSDNode* C = nullptr;
23846   bool needOppositeCond = (CC == X86::COND_E);
23847   bool checkAgainstTrue = false; // Is it a comparison against 1?
23848
23849   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23850     SetCC = Op2;
23851   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23852     SetCC = Op1;
23853   else // Quit if all operands are not constants.
23854     return SDValue();
23855
23856   if (C->getZExtValue() == 1) {
23857     needOppositeCond = !needOppositeCond;
23858     checkAgainstTrue = true;
23859   } else if (C->getZExtValue() != 0)
23860     // Quit if the constant is neither 0 or 1.
23861     return SDValue();
23862
23863   bool truncatedToBoolWithAnd = false;
23864   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23865   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23866          SetCC.getOpcode() == ISD::TRUNCATE ||
23867          SetCC.getOpcode() == ISD::AND) {
23868     if (SetCC.getOpcode() == ISD::AND) {
23869       int OpIdx = -1;
23870       ConstantSDNode *CS;
23871       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23872           CS->getZExtValue() == 1)
23873         OpIdx = 1;
23874       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23875           CS->getZExtValue() == 1)
23876         OpIdx = 0;
23877       if (OpIdx == -1)
23878         break;
23879       SetCC = SetCC.getOperand(OpIdx);
23880       truncatedToBoolWithAnd = true;
23881     } else
23882       SetCC = SetCC.getOperand(0);
23883   }
23884
23885   switch (SetCC.getOpcode()) {
23886   case X86ISD::SETCC_CARRY:
23887     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23888     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23889     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23890     // truncated to i1 using 'and'.
23891     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23892       break;
23893     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23894            "Invalid use of SETCC_CARRY!");
23895     // FALL THROUGH
23896   case X86ISD::SETCC:
23897     // Set the condition code or opposite one if necessary.
23898     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23899     if (needOppositeCond)
23900       CC = X86::GetOppositeBranchCondition(CC);
23901     return SetCC.getOperand(1);
23902   case X86ISD::CMOV: {
23903     // Check whether false/true value has canonical one, i.e. 0 or 1.
23904     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23905     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23906     // Quit if true value is not a constant.
23907     if (!TVal)
23908       return SDValue();
23909     // Quit if false value is not a constant.
23910     if (!FVal) {
23911       SDValue Op = SetCC.getOperand(0);
23912       // Skip 'zext' or 'trunc' node.
23913       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23914           Op.getOpcode() == ISD::TRUNCATE)
23915         Op = Op.getOperand(0);
23916       // A special case for rdrand/rdseed, where 0 is set if false cond is
23917       // found.
23918       if ((Op.getOpcode() != X86ISD::RDRAND &&
23919            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23920         return SDValue();
23921     }
23922     // Quit if false value is not the constant 0 or 1.
23923     bool FValIsFalse = true;
23924     if (FVal && FVal->getZExtValue() != 0) {
23925       if (FVal->getZExtValue() != 1)
23926         return SDValue();
23927       // If FVal is 1, opposite cond is needed.
23928       needOppositeCond = !needOppositeCond;
23929       FValIsFalse = false;
23930     }
23931     // Quit if TVal is not the constant opposite of FVal.
23932     if (FValIsFalse && TVal->getZExtValue() != 1)
23933       return SDValue();
23934     if (!FValIsFalse && TVal->getZExtValue() != 0)
23935       return SDValue();
23936     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23937     if (needOppositeCond)
23938       CC = X86::GetOppositeBranchCondition(CC);
23939     return SetCC.getOperand(3);
23940   }
23941   }
23942
23943   return SDValue();
23944 }
23945
23946 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23947 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23948                                   TargetLowering::DAGCombinerInfo &DCI,
23949                                   const X86Subtarget *Subtarget) {
23950   SDLoc DL(N);
23951
23952   // If the flag operand isn't dead, don't touch this CMOV.
23953   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23954     return SDValue();
23955
23956   SDValue FalseOp = N->getOperand(0);
23957   SDValue TrueOp = N->getOperand(1);
23958   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23959   SDValue Cond = N->getOperand(3);
23960
23961   if (CC == X86::COND_E || CC == X86::COND_NE) {
23962     switch (Cond.getOpcode()) {
23963     default: break;
23964     case X86ISD::BSR:
23965     case X86ISD::BSF:
23966       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23967       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23968         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23969     }
23970   }
23971
23972   SDValue Flags;
23973
23974   Flags = checkBoolTestSetCCCombine(Cond, CC);
23975   if (Flags.getNode() &&
23976       // Extra check as FCMOV only supports a subset of X86 cond.
23977       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23978     SDValue Ops[] = { FalseOp, TrueOp,
23979                       DAG.getConstant(CC, MVT::i8), Flags };
23980     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23981   }
23982
23983   // If this is a select between two integer constants, try to do some
23984   // optimizations.  Note that the operands are ordered the opposite of SELECT
23985   // operands.
23986   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23987     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23988       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23989       // larger than FalseC (the false value).
23990       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23991         CC = X86::GetOppositeBranchCondition(CC);
23992         std::swap(TrueC, FalseC);
23993         std::swap(TrueOp, FalseOp);
23994       }
23995
23996       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23997       // This is efficient for any integer data type (including i8/i16) and
23998       // shift amount.
23999       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
24000         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24001                            DAG.getConstant(CC, MVT::i8), Cond);
24002
24003         // Zero extend the condition if needed.
24004         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
24005
24006         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
24007         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
24008                            DAG.getConstant(ShAmt, MVT::i8));
24009         if (N->getNumValues() == 2)  // Dead flag value?
24010           return DCI.CombineTo(N, Cond, SDValue());
24011         return Cond;
24012       }
24013
24014       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
24015       // for any integer data type, including i8/i16.
24016       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
24017         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24018                            DAG.getConstant(CC, MVT::i8), Cond);
24019
24020         // Zero extend the condition if needed.
24021         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
24022                            FalseC->getValueType(0), Cond);
24023         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24024                            SDValue(FalseC, 0));
24025
24026         if (N->getNumValues() == 2)  // Dead flag value?
24027           return DCI.CombineTo(N, Cond, SDValue());
24028         return Cond;
24029       }
24030
24031       // Optimize cases that will turn into an LEA instruction.  This requires
24032       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
24033       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
24034         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
24035         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
24036
24037         bool isFastMultiplier = false;
24038         if (Diff < 10) {
24039           switch ((unsigned char)Diff) {
24040           default: break;
24041           case 1:  // result = add base, cond
24042           case 2:  // result = lea base(    , cond*2)
24043           case 3:  // result = lea base(cond, cond*2)
24044           case 4:  // result = lea base(    , cond*4)
24045           case 5:  // result = lea base(cond, cond*4)
24046           case 8:  // result = lea base(    , cond*8)
24047           case 9:  // result = lea base(cond, cond*8)
24048             isFastMultiplier = true;
24049             break;
24050           }
24051         }
24052
24053         if (isFastMultiplier) {
24054           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
24055           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
24056                              DAG.getConstant(CC, MVT::i8), Cond);
24057           // Zero extend the condition if needed.
24058           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
24059                              Cond);
24060           // Scale the condition by the difference.
24061           if (Diff != 1)
24062             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
24063                                DAG.getConstant(Diff, Cond.getValueType()));
24064
24065           // Add the base if non-zero.
24066           if (FalseC->getAPIntValue() != 0)
24067             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
24068                                SDValue(FalseC, 0));
24069           if (N->getNumValues() == 2)  // Dead flag value?
24070             return DCI.CombineTo(N, Cond, SDValue());
24071           return Cond;
24072         }
24073       }
24074     }
24075   }
24076
24077   // Handle these cases:
24078   //   (select (x != c), e, c) -> select (x != c), e, x),
24079   //   (select (x == c), c, e) -> select (x == c), x, e)
24080   // where the c is an integer constant, and the "select" is the combination
24081   // of CMOV and CMP.
24082   //
24083   // The rationale for this change is that the conditional-move from a constant
24084   // needs two instructions, however, conditional-move from a register needs
24085   // only one instruction.
24086   //
24087   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
24088   //  some instruction-combining opportunities. This opt needs to be
24089   //  postponed as late as possible.
24090   //
24091   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
24092     // the DCI.xxxx conditions are provided to postpone the optimization as
24093     // late as possible.
24094
24095     ConstantSDNode *CmpAgainst = nullptr;
24096     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
24097         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
24098         !isa<ConstantSDNode>(Cond.getOperand(0))) {
24099
24100       if (CC == X86::COND_NE &&
24101           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
24102         CC = X86::GetOppositeBranchCondition(CC);
24103         std::swap(TrueOp, FalseOp);
24104       }
24105
24106       if (CC == X86::COND_E &&
24107           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
24108         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
24109                           DAG.getConstant(CC, MVT::i8), Cond };
24110         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
24111       }
24112     }
24113   }
24114
24115   return SDValue();
24116 }
24117
24118 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
24119                                                 const X86Subtarget *Subtarget) {
24120   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
24121   switch (IntNo) {
24122   default: return SDValue();
24123   // SSE/AVX/AVX2 blend intrinsics.
24124   case Intrinsic::x86_avx2_pblendvb:
24125   case Intrinsic::x86_avx2_pblendw:
24126   case Intrinsic::x86_avx2_pblendd_128:
24127   case Intrinsic::x86_avx2_pblendd_256:
24128     // Don't try to simplify this intrinsic if we don't have AVX2.
24129     if (!Subtarget->hasAVX2())
24130       return SDValue();
24131     // FALL-THROUGH
24132   case Intrinsic::x86_avx_blend_pd_256:
24133   case Intrinsic::x86_avx_blend_ps_256:
24134   case Intrinsic::x86_avx_blendv_pd_256:
24135   case Intrinsic::x86_avx_blendv_ps_256:
24136     // Don't try to simplify this intrinsic if we don't have AVX.
24137     if (!Subtarget->hasAVX())
24138       return SDValue();
24139     // FALL-THROUGH
24140   case Intrinsic::x86_sse41_pblendw:
24141   case Intrinsic::x86_sse41_blendpd:
24142   case Intrinsic::x86_sse41_blendps:
24143   case Intrinsic::x86_sse41_blendvps:
24144   case Intrinsic::x86_sse41_blendvpd:
24145   case Intrinsic::x86_sse41_pblendvb: {
24146     SDValue Op0 = N->getOperand(1);
24147     SDValue Op1 = N->getOperand(2);
24148     SDValue Mask = N->getOperand(3);
24149
24150     // Don't try to simplify this intrinsic if we don't have SSE4.1.
24151     if (!Subtarget->hasSSE41())
24152       return SDValue();
24153
24154     // fold (blend A, A, Mask) -> A
24155     if (Op0 == Op1)
24156       return Op0;
24157     // fold (blend A, B, allZeros) -> A
24158     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
24159       return Op0;
24160     // fold (blend A, B, allOnes) -> B
24161     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24162       return Op1;
24163
24164     // Simplify the case where the mask is a constant i32 value.
24165     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
24166       if (C->isNullValue())
24167         return Op0;
24168       if (C->isAllOnesValue())
24169         return Op1;
24170     }
24171
24172     return SDValue();
24173   }
24174
24175   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24176   case Intrinsic::x86_sse2_psrai_w:
24177   case Intrinsic::x86_sse2_psrai_d:
24178   case Intrinsic::x86_avx2_psrai_w:
24179   case Intrinsic::x86_avx2_psrai_d:
24180   case Intrinsic::x86_sse2_psra_w:
24181   case Intrinsic::x86_sse2_psra_d:
24182   case Intrinsic::x86_avx2_psra_w:
24183   case Intrinsic::x86_avx2_psra_d: {
24184     SDValue Op0 = N->getOperand(1);
24185     SDValue Op1 = N->getOperand(2);
24186     EVT VT = Op0.getValueType();
24187     assert(VT.isVector() && "Expected a vector type!");
24188
24189     if (isa<BuildVectorSDNode>(Op1))
24190       Op1 = Op1.getOperand(0);
24191
24192     if (!isa<ConstantSDNode>(Op1))
24193       return SDValue();
24194
24195     EVT SVT = VT.getVectorElementType();
24196     unsigned SVTBits = SVT.getSizeInBits();
24197
24198     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24199     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24200     uint64_t ShAmt = C.getZExtValue();
24201
24202     // Don't try to convert this shift into a ISD::SRA if the shift
24203     // count is bigger than or equal to the element size.
24204     if (ShAmt >= SVTBits)
24205       return SDValue();
24206
24207     // Trivial case: if the shift count is zero, then fold this
24208     // into the first operand.
24209     if (ShAmt == 0)
24210       return Op0;
24211
24212     // Replace this packed shift intrinsic with a target independent
24213     // shift dag node.
24214     SDValue Splat = DAG.getConstant(C, VT);
24215     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24216   }
24217   }
24218 }
24219
24220 /// PerformMulCombine - Optimize a single multiply with constant into two
24221 /// in order to implement it with two cheaper instructions, e.g.
24222 /// LEA + SHL, LEA + LEA.
24223 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24224                                  TargetLowering::DAGCombinerInfo &DCI) {
24225   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24226     return SDValue();
24227
24228   EVT VT = N->getValueType(0);
24229   if (VT != MVT::i64 && VT != MVT::i32)
24230     return SDValue();
24231
24232   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24233   if (!C)
24234     return SDValue();
24235   uint64_t MulAmt = C->getZExtValue();
24236   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24237     return SDValue();
24238
24239   uint64_t MulAmt1 = 0;
24240   uint64_t MulAmt2 = 0;
24241   if ((MulAmt % 9) == 0) {
24242     MulAmt1 = 9;
24243     MulAmt2 = MulAmt / 9;
24244   } else if ((MulAmt % 5) == 0) {
24245     MulAmt1 = 5;
24246     MulAmt2 = MulAmt / 5;
24247   } else if ((MulAmt % 3) == 0) {
24248     MulAmt1 = 3;
24249     MulAmt2 = MulAmt / 3;
24250   }
24251   if (MulAmt2 &&
24252       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24253     SDLoc DL(N);
24254
24255     if (isPowerOf2_64(MulAmt2) &&
24256         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24257       // If second multiplifer is pow2, issue it first. We want the multiply by
24258       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24259       // is an add.
24260       std::swap(MulAmt1, MulAmt2);
24261
24262     SDValue NewMul;
24263     if (isPowerOf2_64(MulAmt1))
24264       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24265                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24266     else
24267       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24268                            DAG.getConstant(MulAmt1, VT));
24269
24270     if (isPowerOf2_64(MulAmt2))
24271       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24272                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24273     else
24274       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24275                            DAG.getConstant(MulAmt2, VT));
24276
24277     // Do not add new nodes to DAG combiner worklist.
24278     DCI.CombineTo(N, NewMul, false);
24279   }
24280   return SDValue();
24281 }
24282
24283 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24284   SDValue N0 = N->getOperand(0);
24285   SDValue N1 = N->getOperand(1);
24286   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24287   EVT VT = N0.getValueType();
24288
24289   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24290   // since the result of setcc_c is all zero's or all ones.
24291   if (VT.isInteger() && !VT.isVector() &&
24292       N1C && N0.getOpcode() == ISD::AND &&
24293       N0.getOperand(1).getOpcode() == ISD::Constant) {
24294     SDValue N00 = N0.getOperand(0);
24295     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24296         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24297           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24298          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24299       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24300       APInt ShAmt = N1C->getAPIntValue();
24301       Mask = Mask.shl(ShAmt);
24302       if (Mask != 0)
24303         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24304                            N00, DAG.getConstant(Mask, VT));
24305     }
24306   }
24307
24308   // Hardware support for vector shifts is sparse which makes us scalarize the
24309   // vector operations in many cases. Also, on sandybridge ADD is faster than
24310   // shl.
24311   // (shl V, 1) -> add V,V
24312   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24313     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24314       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24315       // We shift all of the values by one. In many cases we do not have
24316       // hardware support for this operation. This is better expressed as an ADD
24317       // of two values.
24318       if (N1SplatC->getZExtValue() == 1)
24319         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24320     }
24321
24322   return SDValue();
24323 }
24324
24325 /// \brief Returns a vector of 0s if the node in input is a vector logical
24326 /// shift by a constant amount which is known to be bigger than or equal
24327 /// to the vector element size in bits.
24328 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24329                                       const X86Subtarget *Subtarget) {
24330   EVT VT = N->getValueType(0);
24331
24332   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24333       (!Subtarget->hasInt256() ||
24334        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24335     return SDValue();
24336
24337   SDValue Amt = N->getOperand(1);
24338   SDLoc DL(N);
24339   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24340     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24341       APInt ShiftAmt = AmtSplat->getAPIntValue();
24342       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24343
24344       // SSE2/AVX2 logical shifts always return a vector of 0s
24345       // if the shift amount is bigger than or equal to
24346       // the element size. The constant shift amount will be
24347       // encoded as a 8-bit immediate.
24348       if (ShiftAmt.trunc(8).uge(MaxAmount))
24349         return getZeroVector(VT, Subtarget, DAG, DL);
24350     }
24351
24352   return SDValue();
24353 }
24354
24355 /// PerformShiftCombine - Combine shifts.
24356 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24357                                    TargetLowering::DAGCombinerInfo &DCI,
24358                                    const X86Subtarget *Subtarget) {
24359   if (N->getOpcode() == ISD::SHL) {
24360     SDValue V = PerformSHLCombine(N, DAG);
24361     if (V.getNode()) return V;
24362   }
24363
24364   if (N->getOpcode() != ISD::SRA) {
24365     // Try to fold this logical shift into a zero vector.
24366     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24367     if (V.getNode()) return V;
24368   }
24369
24370   return SDValue();
24371 }
24372
24373 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24374 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24375 // and friends.  Likewise for OR -> CMPNEQSS.
24376 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24377                             TargetLowering::DAGCombinerInfo &DCI,
24378                             const X86Subtarget *Subtarget) {
24379   unsigned opcode;
24380
24381   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24382   // we're requiring SSE2 for both.
24383   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24384     SDValue N0 = N->getOperand(0);
24385     SDValue N1 = N->getOperand(1);
24386     SDValue CMP0 = N0->getOperand(1);
24387     SDValue CMP1 = N1->getOperand(1);
24388     SDLoc DL(N);
24389
24390     // The SETCCs should both refer to the same CMP.
24391     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24392       return SDValue();
24393
24394     SDValue CMP00 = CMP0->getOperand(0);
24395     SDValue CMP01 = CMP0->getOperand(1);
24396     EVT     VT    = CMP00.getValueType();
24397
24398     if (VT == MVT::f32 || VT == MVT::f64) {
24399       bool ExpectingFlags = false;
24400       // Check for any users that want flags:
24401       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24402            !ExpectingFlags && UI != UE; ++UI)
24403         switch (UI->getOpcode()) {
24404         default:
24405         case ISD::BR_CC:
24406         case ISD::BRCOND:
24407         case ISD::SELECT:
24408           ExpectingFlags = true;
24409           break;
24410         case ISD::CopyToReg:
24411         case ISD::SIGN_EXTEND:
24412         case ISD::ZERO_EXTEND:
24413         case ISD::ANY_EXTEND:
24414           break;
24415         }
24416
24417       if (!ExpectingFlags) {
24418         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24419         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24420
24421         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24422           X86::CondCode tmp = cc0;
24423           cc0 = cc1;
24424           cc1 = tmp;
24425         }
24426
24427         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24428             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24429           // FIXME: need symbolic constants for these magic numbers.
24430           // See X86ATTInstPrinter.cpp:printSSECC().
24431           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24432           if (Subtarget->hasAVX512()) {
24433             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24434                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24435             if (N->getValueType(0) != MVT::i1)
24436               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24437                                  FSetCC);
24438             return FSetCC;
24439           }
24440           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24441                                               CMP00.getValueType(), CMP00, CMP01,
24442                                               DAG.getConstant(x86cc, MVT::i8));
24443
24444           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24445           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24446
24447           if (is64BitFP && !Subtarget->is64Bit()) {
24448             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24449             // 64-bit integer, since that's not a legal type. Since
24450             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24451             // bits, but can do this little dance to extract the lowest 32 bits
24452             // and work with those going forward.
24453             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24454                                            OnesOrZeroesF);
24455             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24456                                            Vector64);
24457             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24458                                         Vector32, DAG.getIntPtrConstant(0));
24459             IntVT = MVT::i32;
24460           }
24461
24462           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24463           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24464                                       DAG.getConstant(1, IntVT));
24465           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24466           return OneBitOfTruth;
24467         }
24468       }
24469     }
24470   }
24471   return SDValue();
24472 }
24473
24474 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24475 /// so it can be folded inside ANDNP.
24476 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24477   EVT VT = N->getValueType(0);
24478
24479   // Match direct AllOnes for 128 and 256-bit vectors
24480   if (ISD::isBuildVectorAllOnes(N))
24481     return true;
24482
24483   // Look through a bit convert.
24484   if (N->getOpcode() == ISD::BITCAST)
24485     N = N->getOperand(0).getNode();
24486
24487   // Sometimes the operand may come from a insert_subvector building a 256-bit
24488   // allones vector
24489   if (VT.is256BitVector() &&
24490       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24491     SDValue V1 = N->getOperand(0);
24492     SDValue V2 = N->getOperand(1);
24493
24494     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24495         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24496         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24497         ISD::isBuildVectorAllOnes(V2.getNode()))
24498       return true;
24499   }
24500
24501   return false;
24502 }
24503
24504 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24505 // register. In most cases we actually compare or select YMM-sized registers
24506 // and mixing the two types creates horrible code. This method optimizes
24507 // some of the transition sequences.
24508 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24509                                  TargetLowering::DAGCombinerInfo &DCI,
24510                                  const X86Subtarget *Subtarget) {
24511   EVT VT = N->getValueType(0);
24512   if (!VT.is256BitVector())
24513     return SDValue();
24514
24515   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24516           N->getOpcode() == ISD::ZERO_EXTEND ||
24517           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24518
24519   SDValue Narrow = N->getOperand(0);
24520   EVT NarrowVT = Narrow->getValueType(0);
24521   if (!NarrowVT.is128BitVector())
24522     return SDValue();
24523
24524   if (Narrow->getOpcode() != ISD::XOR &&
24525       Narrow->getOpcode() != ISD::AND &&
24526       Narrow->getOpcode() != ISD::OR)
24527     return SDValue();
24528
24529   SDValue N0  = Narrow->getOperand(0);
24530   SDValue N1  = Narrow->getOperand(1);
24531   SDLoc DL(Narrow);
24532
24533   // The Left side has to be a trunc.
24534   if (N0.getOpcode() != ISD::TRUNCATE)
24535     return SDValue();
24536
24537   // The type of the truncated inputs.
24538   EVT WideVT = N0->getOperand(0)->getValueType(0);
24539   if (WideVT != VT)
24540     return SDValue();
24541
24542   // The right side has to be a 'trunc' or a constant vector.
24543   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24544   ConstantSDNode *RHSConstSplat = nullptr;
24545   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24546     RHSConstSplat = RHSBV->getConstantSplatNode();
24547   if (!RHSTrunc && !RHSConstSplat)
24548     return SDValue();
24549
24550   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24551
24552   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24553     return SDValue();
24554
24555   // Set N0 and N1 to hold the inputs to the new wide operation.
24556   N0 = N0->getOperand(0);
24557   if (RHSConstSplat) {
24558     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24559                      SDValue(RHSConstSplat, 0));
24560     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24561     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24562   } else if (RHSTrunc) {
24563     N1 = N1->getOperand(0);
24564   }
24565
24566   // Generate the wide operation.
24567   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24568   unsigned Opcode = N->getOpcode();
24569   switch (Opcode) {
24570   case ISD::ANY_EXTEND:
24571     return Op;
24572   case ISD::ZERO_EXTEND: {
24573     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24574     APInt Mask = APInt::getAllOnesValue(InBits);
24575     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24576     return DAG.getNode(ISD::AND, DL, VT,
24577                        Op, DAG.getConstant(Mask, VT));
24578   }
24579   case ISD::SIGN_EXTEND:
24580     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24581                        Op, DAG.getValueType(NarrowVT));
24582   default:
24583     llvm_unreachable("Unexpected opcode");
24584   }
24585 }
24586
24587 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24588                                  TargetLowering::DAGCombinerInfo &DCI,
24589                                  const X86Subtarget *Subtarget) {
24590   EVT VT = N->getValueType(0);
24591   if (DCI.isBeforeLegalizeOps())
24592     return SDValue();
24593
24594   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24595   if (R.getNode())
24596     return R;
24597
24598   // Create BEXTR instructions
24599   // BEXTR is ((X >> imm) & (2**size-1))
24600   if (VT == MVT::i32 || VT == MVT::i64) {
24601     SDValue N0 = N->getOperand(0);
24602     SDValue N1 = N->getOperand(1);
24603     SDLoc DL(N);
24604
24605     // Check for BEXTR.
24606     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24607         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24608       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24609       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24610       if (MaskNode && ShiftNode) {
24611         uint64_t Mask = MaskNode->getZExtValue();
24612         uint64_t Shift = ShiftNode->getZExtValue();
24613         if (isMask_64(Mask)) {
24614           uint64_t MaskSize = CountPopulation_64(Mask);
24615           if (Shift + MaskSize <= VT.getSizeInBits())
24616             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24617                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24618         }
24619       }
24620     } // BEXTR
24621
24622     return SDValue();
24623   }
24624
24625   // Want to form ANDNP nodes:
24626   // 1) In the hopes of then easily combining them with OR and AND nodes
24627   //    to form PBLEND/PSIGN.
24628   // 2) To match ANDN packed intrinsics
24629   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24630     return SDValue();
24631
24632   SDValue N0 = N->getOperand(0);
24633   SDValue N1 = N->getOperand(1);
24634   SDLoc DL(N);
24635
24636   // Check LHS for vnot
24637   if (N0.getOpcode() == ISD::XOR &&
24638       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24639       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24640     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24641
24642   // Check RHS for vnot
24643   if (N1.getOpcode() == ISD::XOR &&
24644       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24645       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24646     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24647
24648   return SDValue();
24649 }
24650
24651 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24652                                 TargetLowering::DAGCombinerInfo &DCI,
24653                                 const X86Subtarget *Subtarget) {
24654   if (DCI.isBeforeLegalizeOps())
24655     return SDValue();
24656
24657   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24658   if (R.getNode())
24659     return R;
24660
24661   SDValue N0 = N->getOperand(0);
24662   SDValue N1 = N->getOperand(1);
24663   EVT VT = N->getValueType(0);
24664
24665   // look for psign/blend
24666   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24667     if (!Subtarget->hasSSSE3() ||
24668         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24669       return SDValue();
24670
24671     // Canonicalize pandn to RHS
24672     if (N0.getOpcode() == X86ISD::ANDNP)
24673       std::swap(N0, N1);
24674     // or (and (m, y), (pandn m, x))
24675     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24676       SDValue Mask = N1.getOperand(0);
24677       SDValue X    = N1.getOperand(1);
24678       SDValue Y;
24679       if (N0.getOperand(0) == Mask)
24680         Y = N0.getOperand(1);
24681       if (N0.getOperand(1) == Mask)
24682         Y = N0.getOperand(0);
24683
24684       // Check to see if the mask appeared in both the AND and ANDNP and
24685       if (!Y.getNode())
24686         return SDValue();
24687
24688       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24689       // Look through mask bitcast.
24690       if (Mask.getOpcode() == ISD::BITCAST)
24691         Mask = Mask.getOperand(0);
24692       if (X.getOpcode() == ISD::BITCAST)
24693         X = X.getOperand(0);
24694       if (Y.getOpcode() == ISD::BITCAST)
24695         Y = Y.getOperand(0);
24696
24697       EVT MaskVT = Mask.getValueType();
24698
24699       // Validate that the Mask operand is a vector sra node.
24700       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24701       // there is no psrai.b
24702       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24703       unsigned SraAmt = ~0;
24704       if (Mask.getOpcode() == ISD::SRA) {
24705         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24706           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24707             SraAmt = AmtConst->getZExtValue();
24708       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24709         SDValue SraC = Mask.getOperand(1);
24710         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24711       }
24712       if ((SraAmt + 1) != EltBits)
24713         return SDValue();
24714
24715       SDLoc DL(N);
24716
24717       // Now we know we at least have a plendvb with the mask val.  See if
24718       // we can form a psignb/w/d.
24719       // psign = x.type == y.type == mask.type && y = sub(0, x);
24720       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24721           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24722           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24723         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24724                "Unsupported VT for PSIGN");
24725         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24726         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24727       }
24728       // PBLENDVB only available on SSE 4.1
24729       if (!Subtarget->hasSSE41())
24730         return SDValue();
24731
24732       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24733
24734       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24735       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24736       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24737       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24738       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24739     }
24740   }
24741
24742   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24743     return SDValue();
24744
24745   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24746   MachineFunction &MF = DAG.getMachineFunction();
24747   bool OptForSize = MF.getFunction()->getAttributes().
24748     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24749
24750   // SHLD/SHRD instructions have lower register pressure, but on some
24751   // platforms they have higher latency than the equivalent
24752   // series of shifts/or that would otherwise be generated.
24753   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24754   // have higher latencies and we are not optimizing for size.
24755   if (!OptForSize && Subtarget->isSHLDSlow())
24756     return SDValue();
24757
24758   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24759     std::swap(N0, N1);
24760   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24761     return SDValue();
24762   if (!N0.hasOneUse() || !N1.hasOneUse())
24763     return SDValue();
24764
24765   SDValue ShAmt0 = N0.getOperand(1);
24766   if (ShAmt0.getValueType() != MVT::i8)
24767     return SDValue();
24768   SDValue ShAmt1 = N1.getOperand(1);
24769   if (ShAmt1.getValueType() != MVT::i8)
24770     return SDValue();
24771   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24772     ShAmt0 = ShAmt0.getOperand(0);
24773   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24774     ShAmt1 = ShAmt1.getOperand(0);
24775
24776   SDLoc DL(N);
24777   unsigned Opc = X86ISD::SHLD;
24778   SDValue Op0 = N0.getOperand(0);
24779   SDValue Op1 = N1.getOperand(0);
24780   if (ShAmt0.getOpcode() == ISD::SUB) {
24781     Opc = X86ISD::SHRD;
24782     std::swap(Op0, Op1);
24783     std::swap(ShAmt0, ShAmt1);
24784   }
24785
24786   unsigned Bits = VT.getSizeInBits();
24787   if (ShAmt1.getOpcode() == ISD::SUB) {
24788     SDValue Sum = ShAmt1.getOperand(0);
24789     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24790       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24791       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24792         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24793       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24794         return DAG.getNode(Opc, DL, VT,
24795                            Op0, Op1,
24796                            DAG.getNode(ISD::TRUNCATE, DL,
24797                                        MVT::i8, ShAmt0));
24798     }
24799   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24800     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24801     if (ShAmt0C &&
24802         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24803       return DAG.getNode(Opc, DL, VT,
24804                          N0.getOperand(0), N1.getOperand(0),
24805                          DAG.getNode(ISD::TRUNCATE, DL,
24806                                        MVT::i8, ShAmt0));
24807   }
24808
24809   return SDValue();
24810 }
24811
24812 // Generate NEG and CMOV for integer abs.
24813 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24814   EVT VT = N->getValueType(0);
24815
24816   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24817   // 8-bit integer abs to NEG and CMOV.
24818   if (VT.isInteger() && VT.getSizeInBits() == 8)
24819     return SDValue();
24820
24821   SDValue N0 = N->getOperand(0);
24822   SDValue N1 = N->getOperand(1);
24823   SDLoc DL(N);
24824
24825   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24826   // and change it to SUB and CMOV.
24827   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24828       N0.getOpcode() == ISD::ADD &&
24829       N0.getOperand(1) == N1 &&
24830       N1.getOpcode() == ISD::SRA &&
24831       N1.getOperand(0) == N0.getOperand(0))
24832     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24833       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24834         // Generate SUB & CMOV.
24835         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24836                                   DAG.getConstant(0, VT), N0.getOperand(0));
24837
24838         SDValue Ops[] = { N0.getOperand(0), Neg,
24839                           DAG.getConstant(X86::COND_GE, MVT::i8),
24840                           SDValue(Neg.getNode(), 1) };
24841         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24842       }
24843   return SDValue();
24844 }
24845
24846 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24847 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24848                                  TargetLowering::DAGCombinerInfo &DCI,
24849                                  const X86Subtarget *Subtarget) {
24850   if (DCI.isBeforeLegalizeOps())
24851     return SDValue();
24852
24853   if (Subtarget->hasCMov()) {
24854     SDValue RV = performIntegerAbsCombine(N, DAG);
24855     if (RV.getNode())
24856       return RV;
24857   }
24858
24859   return SDValue();
24860 }
24861
24862 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24863 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24864                                   TargetLowering::DAGCombinerInfo &DCI,
24865                                   const X86Subtarget *Subtarget) {
24866   LoadSDNode *Ld = cast<LoadSDNode>(N);
24867   EVT RegVT = Ld->getValueType(0);
24868   EVT MemVT = Ld->getMemoryVT();
24869   SDLoc dl(Ld);
24870   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24871
24872   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24873   // into two 16-byte operations.
24874   ISD::LoadExtType Ext = Ld->getExtensionType();
24875   unsigned Alignment = Ld->getAlignment();
24876   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24877   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24878       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24879     unsigned NumElems = RegVT.getVectorNumElements();
24880     if (NumElems < 2)
24881       return SDValue();
24882
24883     SDValue Ptr = Ld->getBasePtr();
24884     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24885
24886     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24887                                   NumElems/2);
24888     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24889                                 Ld->getPointerInfo(), Ld->isVolatile(),
24890                                 Ld->isNonTemporal(), Ld->isInvariant(),
24891                                 Alignment);
24892     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24893     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24894                                 Ld->getPointerInfo(), Ld->isVolatile(),
24895                                 Ld->isNonTemporal(), Ld->isInvariant(),
24896                                 std::min(16U, Alignment));
24897     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24898                              Load1.getValue(1),
24899                              Load2.getValue(1));
24900
24901     SDValue NewVec = DAG.getUNDEF(RegVT);
24902     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24903     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24904     return DCI.CombineTo(N, NewVec, TF, true);
24905   }
24906
24907   return SDValue();
24908 }
24909
24910 /// PerformMLOADCombine - Resolve extending loads
24911 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24912                                    TargetLowering::DAGCombinerInfo &DCI,
24913                                    const X86Subtarget *Subtarget) {
24914   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24915   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24916     return SDValue();
24917
24918   EVT VT = Mld->getValueType(0);
24919   unsigned NumElems = VT.getVectorNumElements();
24920   EVT LdVT = Mld->getMemoryVT();
24921   SDLoc dl(Mld);
24922
24923   assert(LdVT != VT && "Cannot extend to the same type");
24924   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24925   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24926   // From, To sizes and ElemCount must be pow of two
24927   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24928     "Unexpected size for extending masked load");
24929
24930   unsigned SizeRatio  = ToSz / FromSz;
24931   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24932
24933   // Create a type on which we perform the shuffle
24934   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24935           LdVT.getScalarType(), NumElems*SizeRatio);
24936   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24937
24938   // Convert Src0 value
24939   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24940   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24941     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24942     for (unsigned i = 0; i != NumElems; ++i)
24943       ShuffleVec[i] = i * SizeRatio;
24944
24945     // Can't shuffle using an illegal type.
24946     assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
24947             && "WideVecVT should be legal");
24948     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24949                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24950   }
24951   // Prepare the new mask
24952   SDValue NewMask;
24953   SDValue Mask = Mld->getMask();
24954   if (Mask.getValueType() == VT) {
24955     // Mask and original value have the same type
24956     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24957     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24958     for (unsigned i = 0; i != NumElems; ++i)
24959       ShuffleVec[i] = i * SizeRatio;
24960     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24961       ShuffleVec[i] = NumElems*SizeRatio;
24962     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24963                                    DAG.getConstant(0, WideVecVT),
24964                                    &ShuffleVec[0]);
24965   }
24966   else {
24967     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24968     unsigned WidenNumElts = NumElems*SizeRatio;
24969     unsigned MaskNumElts = VT.getVectorNumElements();
24970     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24971                                      WidenNumElts);
24972
24973     unsigned NumConcat = WidenNumElts / MaskNumElts;
24974     SmallVector<SDValue, 16> Ops(NumConcat);
24975     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24976     Ops[0] = Mask;
24977     for (unsigned i = 1; i != NumConcat; ++i)
24978       Ops[i] = ZeroVal;
24979
24980     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24981   }
24982
24983   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24984                                      Mld->getBasePtr(), NewMask, WideSrc0,
24985                                      Mld->getMemoryVT(), Mld->getMemOperand(),
24986                                      ISD::NON_EXTLOAD);
24987   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24988   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24989
24990 }
24991 /// PerformMSTORECombine - Resolve truncating stores
24992 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24993                                     const X86Subtarget *Subtarget) {
24994   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24995   if (!Mst->isTruncatingStore())
24996     return SDValue();
24997
24998   EVT VT = Mst->getValue().getValueType();
24999   unsigned NumElems = VT.getVectorNumElements();
25000   EVT StVT = Mst->getMemoryVT();
25001   SDLoc dl(Mst);
25002
25003   assert(StVT != VT && "Cannot truncate to the same type");
25004   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25005   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25006
25007   // From, To sizes and ElemCount must be pow of two
25008   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
25009     "Unexpected size for truncating masked store");
25010   // We are going to use the original vector elt for storing.
25011   // Accumulated smaller vector elements must be a multiple of the store size.
25012   assert (((NumElems * FromSz) % ToSz) == 0 &&
25013           "Unexpected ratio for truncating masked store");
25014
25015   unsigned SizeRatio  = FromSz / ToSz;
25016   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25017
25018   // Create a type on which we perform the shuffle
25019   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25020           StVT.getScalarType(), NumElems*SizeRatio);
25021
25022   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25023
25024   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
25025   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
25026   for (unsigned i = 0; i != NumElems; ++i)
25027     ShuffleVec[i] = i * SizeRatio;
25028
25029   // Can't shuffle using an illegal type.
25030   assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
25031           && "WideVecVT should be legal");
25032
25033   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25034                                         DAG.getUNDEF(WideVecVT),
25035                                         &ShuffleVec[0]);
25036
25037   SDValue NewMask;
25038   SDValue Mask = Mst->getMask();
25039   if (Mask.getValueType() == VT) {
25040     // Mask and original value have the same type
25041     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
25042     for (unsigned i = 0; i != NumElems; ++i)
25043       ShuffleVec[i] = i * SizeRatio;
25044     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
25045       ShuffleVec[i] = NumElems*SizeRatio;
25046     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
25047                                    DAG.getConstant(0, WideVecVT),
25048                                    &ShuffleVec[0]);
25049   }
25050   else {
25051     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
25052     unsigned WidenNumElts = NumElems*SizeRatio;
25053     unsigned MaskNumElts = VT.getVectorNumElements();
25054     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
25055                                      WidenNumElts);
25056
25057     unsigned NumConcat = WidenNumElts / MaskNumElts;
25058     SmallVector<SDValue, 16> Ops(NumConcat);
25059     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
25060     Ops[0] = Mask;
25061     for (unsigned i = 1; i != NumConcat; ++i)
25062       Ops[i] = ZeroVal;
25063
25064     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
25065   }
25066
25067   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
25068                             NewMask, StVT, Mst->getMemOperand(), false);
25069 }
25070 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
25071 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
25072                                    const X86Subtarget *Subtarget) {
25073   StoreSDNode *St = cast<StoreSDNode>(N);
25074   EVT VT = St->getValue().getValueType();
25075   EVT StVT = St->getMemoryVT();
25076   SDLoc dl(St);
25077   SDValue StoredVal = St->getOperand(1);
25078   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25079
25080   // If we are saving a concatenation of two XMM registers and 32-byte stores
25081   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
25082   unsigned Alignment = St->getAlignment();
25083   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
25084   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
25085       StVT == VT && !IsAligned) {
25086     unsigned NumElems = VT.getVectorNumElements();
25087     if (NumElems < 2)
25088       return SDValue();
25089
25090     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
25091     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
25092
25093     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
25094     SDValue Ptr0 = St->getBasePtr();
25095     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
25096
25097     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
25098                                 St->getPointerInfo(), St->isVolatile(),
25099                                 St->isNonTemporal(), Alignment);
25100     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
25101                                 St->getPointerInfo(), St->isVolatile(),
25102                                 St->isNonTemporal(),
25103                                 std::min(16U, Alignment));
25104     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
25105   }
25106
25107   // Optimize trunc store (of multiple scalars) to shuffle and store.
25108   // First, pack all of the elements in one place. Next, store to memory
25109   // in fewer chunks.
25110   if (St->isTruncatingStore() && VT.isVector()) {
25111     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25112     unsigned NumElems = VT.getVectorNumElements();
25113     assert(StVT != VT && "Cannot truncate to the same type");
25114     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
25115     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
25116
25117     // From, To sizes and ElemCount must be pow of two
25118     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
25119     // We are going to use the original vector elt for storing.
25120     // Accumulated smaller vector elements must be a multiple of the store size.
25121     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
25122
25123     unsigned SizeRatio  = FromSz / ToSz;
25124
25125     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
25126
25127     // Create a type on which we perform the shuffle
25128     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
25129             StVT.getScalarType(), NumElems*SizeRatio);
25130
25131     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
25132
25133     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
25134     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
25135     for (unsigned i = 0; i != NumElems; ++i)
25136       ShuffleVec[i] = i * SizeRatio;
25137
25138     // Can't shuffle using an illegal type.
25139     if (!TLI.isTypeLegal(WideVecVT))
25140       return SDValue();
25141
25142     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
25143                                          DAG.getUNDEF(WideVecVT),
25144                                          &ShuffleVec[0]);
25145     // At this point all of the data is stored at the bottom of the
25146     // register. We now need to save it to mem.
25147
25148     // Find the largest store unit
25149     MVT StoreType = MVT::i8;
25150     for (MVT Tp : MVT::integer_valuetypes()) {
25151       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
25152         StoreType = Tp;
25153     }
25154
25155     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
25156     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
25157         (64 <= NumElems * ToSz))
25158       StoreType = MVT::f64;
25159
25160     // Bitcast the original vector into a vector of store-size units
25161     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
25162             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
25163     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
25164     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
25165     SmallVector<SDValue, 8> Chains;
25166     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
25167                                         TLI.getPointerTy());
25168     SDValue Ptr = St->getBasePtr();
25169
25170     // Perform one or more big stores into memory.
25171     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
25172       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
25173                                    StoreType, ShuffWide,
25174                                    DAG.getIntPtrConstant(i));
25175       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
25176                                 St->getPointerInfo(), St->isVolatile(),
25177                                 St->isNonTemporal(), St->getAlignment());
25178       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
25179       Chains.push_back(Ch);
25180     }
25181
25182     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
25183   }
25184
25185   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
25186   // the FP state in cases where an emms may be missing.
25187   // A preferable solution to the general problem is to figure out the right
25188   // places to insert EMMS.  This qualifies as a quick hack.
25189
25190   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25191   if (VT.getSizeInBits() != 64)
25192     return SDValue();
25193
25194   const Function *F = DAG.getMachineFunction().getFunction();
25195   bool NoImplicitFloatOps = F->getAttributes().
25196     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25197   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25198                      && Subtarget->hasSSE2();
25199   if ((VT.isVector() ||
25200        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25201       isa<LoadSDNode>(St->getValue()) &&
25202       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25203       St->getChain().hasOneUse() && !St->isVolatile()) {
25204     SDNode* LdVal = St->getValue().getNode();
25205     LoadSDNode *Ld = nullptr;
25206     int TokenFactorIndex = -1;
25207     SmallVector<SDValue, 8> Ops;
25208     SDNode* ChainVal = St->getChain().getNode();
25209     // Must be a store of a load.  We currently handle two cases:  the load
25210     // is a direct child, and it's under an intervening TokenFactor.  It is
25211     // possible to dig deeper under nested TokenFactors.
25212     if (ChainVal == LdVal)
25213       Ld = cast<LoadSDNode>(St->getChain());
25214     else if (St->getValue().hasOneUse() &&
25215              ChainVal->getOpcode() == ISD::TokenFactor) {
25216       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25217         if (ChainVal->getOperand(i).getNode() == LdVal) {
25218           TokenFactorIndex = i;
25219           Ld = cast<LoadSDNode>(St->getValue());
25220         } else
25221           Ops.push_back(ChainVal->getOperand(i));
25222       }
25223     }
25224
25225     if (!Ld || !ISD::isNormalLoad(Ld))
25226       return SDValue();
25227
25228     // If this is not the MMX case, i.e. we are just turning i64 load/store
25229     // into f64 load/store, avoid the transformation if there are multiple
25230     // uses of the loaded value.
25231     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25232       return SDValue();
25233
25234     SDLoc LdDL(Ld);
25235     SDLoc StDL(N);
25236     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25237     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25238     // pair instead.
25239     if (Subtarget->is64Bit() || F64IsLegal) {
25240       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25241       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25242                                   Ld->getPointerInfo(), Ld->isVolatile(),
25243                                   Ld->isNonTemporal(), Ld->isInvariant(),
25244                                   Ld->getAlignment());
25245       SDValue NewChain = NewLd.getValue(1);
25246       if (TokenFactorIndex != -1) {
25247         Ops.push_back(NewChain);
25248         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25249       }
25250       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25251                           St->getPointerInfo(),
25252                           St->isVolatile(), St->isNonTemporal(),
25253                           St->getAlignment());
25254     }
25255
25256     // Otherwise, lower to two pairs of 32-bit loads / stores.
25257     SDValue LoAddr = Ld->getBasePtr();
25258     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25259                                  DAG.getConstant(4, MVT::i32));
25260
25261     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25262                                Ld->getPointerInfo(),
25263                                Ld->isVolatile(), Ld->isNonTemporal(),
25264                                Ld->isInvariant(), Ld->getAlignment());
25265     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25266                                Ld->getPointerInfo().getWithOffset(4),
25267                                Ld->isVolatile(), Ld->isNonTemporal(),
25268                                Ld->isInvariant(),
25269                                MinAlign(Ld->getAlignment(), 4));
25270
25271     SDValue NewChain = LoLd.getValue(1);
25272     if (TokenFactorIndex != -1) {
25273       Ops.push_back(LoLd);
25274       Ops.push_back(HiLd);
25275       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25276     }
25277
25278     LoAddr = St->getBasePtr();
25279     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25280                          DAG.getConstant(4, MVT::i32));
25281
25282     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25283                                 St->getPointerInfo(),
25284                                 St->isVolatile(), St->isNonTemporal(),
25285                                 St->getAlignment());
25286     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25287                                 St->getPointerInfo().getWithOffset(4),
25288                                 St->isVolatile(),
25289                                 St->isNonTemporal(),
25290                                 MinAlign(St->getAlignment(), 4));
25291     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25292   }
25293   return SDValue();
25294 }
25295
25296 /// Return 'true' if this vector operation is "horizontal"
25297 /// and return the operands for the horizontal operation in LHS and RHS.  A
25298 /// horizontal operation performs the binary operation on successive elements
25299 /// of its first operand, then on successive elements of its second operand,
25300 /// returning the resulting values in a vector.  For example, if
25301 ///   A = < float a0, float a1, float a2, float a3 >
25302 /// and
25303 ///   B = < float b0, float b1, float b2, float b3 >
25304 /// then the result of doing a horizontal operation on A and B is
25305 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25306 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25307 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25308 /// set to A, RHS to B, and the routine returns 'true'.
25309 /// Note that the binary operation should have the property that if one of the
25310 /// operands is UNDEF then the result is UNDEF.
25311 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25312   // Look for the following pattern: if
25313   //   A = < float a0, float a1, float a2, float a3 >
25314   //   B = < float b0, float b1, float b2, float b3 >
25315   // and
25316   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25317   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25318   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25319   // which is A horizontal-op B.
25320
25321   // At least one of the operands should be a vector shuffle.
25322   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25323       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25324     return false;
25325
25326   MVT VT = LHS.getSimpleValueType();
25327
25328   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25329          "Unsupported vector type for horizontal add/sub");
25330
25331   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25332   // operate independently on 128-bit lanes.
25333   unsigned NumElts = VT.getVectorNumElements();
25334   unsigned NumLanes = VT.getSizeInBits()/128;
25335   unsigned NumLaneElts = NumElts / NumLanes;
25336   assert((NumLaneElts % 2 == 0) &&
25337          "Vector type should have an even number of elements in each lane");
25338   unsigned HalfLaneElts = NumLaneElts/2;
25339
25340   // View LHS in the form
25341   //   LHS = VECTOR_SHUFFLE A, B, LMask
25342   // If LHS is not a shuffle then pretend it is the shuffle
25343   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25344   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25345   // type VT.
25346   SDValue A, B;
25347   SmallVector<int, 16> LMask(NumElts);
25348   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25349     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25350       A = LHS.getOperand(0);
25351     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25352       B = LHS.getOperand(1);
25353     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25354     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25355   } else {
25356     if (LHS.getOpcode() != ISD::UNDEF)
25357       A = LHS;
25358     for (unsigned i = 0; i != NumElts; ++i)
25359       LMask[i] = i;
25360   }
25361
25362   // Likewise, view RHS in the form
25363   //   RHS = VECTOR_SHUFFLE C, D, RMask
25364   SDValue C, D;
25365   SmallVector<int, 16> RMask(NumElts);
25366   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25367     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25368       C = RHS.getOperand(0);
25369     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25370       D = RHS.getOperand(1);
25371     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25372     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25373   } else {
25374     if (RHS.getOpcode() != ISD::UNDEF)
25375       C = RHS;
25376     for (unsigned i = 0; i != NumElts; ++i)
25377       RMask[i] = i;
25378   }
25379
25380   // Check that the shuffles are both shuffling the same vectors.
25381   if (!(A == C && B == D) && !(A == D && B == C))
25382     return false;
25383
25384   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25385   if (!A.getNode() && !B.getNode())
25386     return false;
25387
25388   // If A and B occur in reverse order in RHS, then "swap" them (which means
25389   // rewriting the mask).
25390   if (A != C)
25391     CommuteVectorShuffleMask(RMask, NumElts);
25392
25393   // At this point LHS and RHS are equivalent to
25394   //   LHS = VECTOR_SHUFFLE A, B, LMask
25395   //   RHS = VECTOR_SHUFFLE A, B, RMask
25396   // Check that the masks correspond to performing a horizontal operation.
25397   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25398     for (unsigned i = 0; i != NumLaneElts; ++i) {
25399       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25400
25401       // Ignore any UNDEF components.
25402       if (LIdx < 0 || RIdx < 0 ||
25403           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25404           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25405         continue;
25406
25407       // Check that successive elements are being operated on.  If not, this is
25408       // not a horizontal operation.
25409       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25410       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25411       if (!(LIdx == Index && RIdx == Index + 1) &&
25412           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25413         return false;
25414     }
25415   }
25416
25417   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25418   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25419   return true;
25420 }
25421
25422 /// Do target-specific dag combines on floating point adds.
25423 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25424                                   const X86Subtarget *Subtarget) {
25425   EVT VT = N->getValueType(0);
25426   SDValue LHS = N->getOperand(0);
25427   SDValue RHS = N->getOperand(1);
25428
25429   // Try to synthesize horizontal adds from adds of shuffles.
25430   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25431        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25432       isHorizontalBinOp(LHS, RHS, true))
25433     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25434   return SDValue();
25435 }
25436
25437 /// Do target-specific dag combines on floating point subs.
25438 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25439                                   const X86Subtarget *Subtarget) {
25440   EVT VT = N->getValueType(0);
25441   SDValue LHS = N->getOperand(0);
25442   SDValue RHS = N->getOperand(1);
25443
25444   // Try to synthesize horizontal subs from subs of shuffles.
25445   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25446        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25447       isHorizontalBinOp(LHS, RHS, false))
25448     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25449   return SDValue();
25450 }
25451
25452 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25453 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25454   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25455
25456   // F[X]OR(0.0, x) -> x
25457   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25458     if (C->getValueAPF().isPosZero())
25459       return N->getOperand(1);
25460
25461   // F[X]OR(x, 0.0) -> x
25462   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25463     if (C->getValueAPF().isPosZero())
25464       return N->getOperand(0);
25465   return SDValue();
25466 }
25467
25468 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25469 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25470   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25471
25472   // Only perform optimizations if UnsafeMath is used.
25473   if (!DAG.getTarget().Options.UnsafeFPMath)
25474     return SDValue();
25475
25476   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25477   // into FMINC and FMAXC, which are Commutative operations.
25478   unsigned NewOp = 0;
25479   switch (N->getOpcode()) {
25480     default: llvm_unreachable("unknown opcode");
25481     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25482     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25483   }
25484
25485   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25486                      N->getOperand(0), N->getOperand(1));
25487 }
25488
25489 /// Do target-specific dag combines on X86ISD::FAND nodes.
25490 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25491   // FAND(0.0, x) -> 0.0
25492   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25493     if (C->getValueAPF().isPosZero())
25494       return N->getOperand(0);
25495
25496   // FAND(x, 0.0) -> 0.0
25497   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25498     if (C->getValueAPF().isPosZero())
25499       return N->getOperand(1);
25500
25501   return SDValue();
25502 }
25503
25504 /// Do target-specific dag combines on X86ISD::FANDN nodes
25505 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25506   // FANDN(0.0, x) -> x
25507   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25508     if (C->getValueAPF().isPosZero())
25509       return N->getOperand(1);
25510
25511   // FANDN(x, 0.0) -> 0.0
25512   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25513     if (C->getValueAPF().isPosZero())
25514       return N->getOperand(1);
25515
25516   return SDValue();
25517 }
25518
25519 static SDValue PerformBTCombine(SDNode *N,
25520                                 SelectionDAG &DAG,
25521                                 TargetLowering::DAGCombinerInfo &DCI) {
25522   // BT ignores high bits in the bit index operand.
25523   SDValue Op1 = N->getOperand(1);
25524   if (Op1.hasOneUse()) {
25525     unsigned BitWidth = Op1.getValueSizeInBits();
25526     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25527     APInt KnownZero, KnownOne;
25528     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25529                                           !DCI.isBeforeLegalizeOps());
25530     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25531     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25532         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25533       DCI.CommitTargetLoweringOpt(TLO);
25534   }
25535   return SDValue();
25536 }
25537
25538 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25539   SDValue Op = N->getOperand(0);
25540   if (Op.getOpcode() == ISD::BITCAST)
25541     Op = Op.getOperand(0);
25542   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25543   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25544       VT.getVectorElementType().getSizeInBits() ==
25545       OpVT.getVectorElementType().getSizeInBits()) {
25546     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25547   }
25548   return SDValue();
25549 }
25550
25551 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25552                                                const X86Subtarget *Subtarget) {
25553   EVT VT = N->getValueType(0);
25554   if (!VT.isVector())
25555     return SDValue();
25556
25557   SDValue N0 = N->getOperand(0);
25558   SDValue N1 = N->getOperand(1);
25559   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25560   SDLoc dl(N);
25561
25562   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25563   // both SSE and AVX2 since there is no sign-extended shift right
25564   // operation on a vector with 64-bit elements.
25565   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25566   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25567   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25568       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25569     SDValue N00 = N0.getOperand(0);
25570
25571     // EXTLOAD has a better solution on AVX2,
25572     // it may be replaced with X86ISD::VSEXT node.
25573     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25574       if (!ISD::isNormalLoad(N00.getNode()))
25575         return SDValue();
25576
25577     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25578         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25579                                   N00, N1);
25580       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25581     }
25582   }
25583   return SDValue();
25584 }
25585
25586 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25587                                   TargetLowering::DAGCombinerInfo &DCI,
25588                                   const X86Subtarget *Subtarget) {
25589   SDValue N0 = N->getOperand(0);
25590   EVT VT = N->getValueType(0);
25591
25592   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25593   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25594   // This exposes the sext to the sdivrem lowering, so that it directly extends
25595   // from AH (which we otherwise need to do contortions to access).
25596   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25597       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25598     SDLoc dl(N);
25599     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25600     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25601                             N0.getOperand(0), N0.getOperand(1));
25602     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25603     return R.getValue(1);
25604   }
25605
25606   if (!DCI.isBeforeLegalizeOps())
25607     return SDValue();
25608
25609   if (!Subtarget->hasFp256())
25610     return SDValue();
25611
25612   if (VT.isVector() && VT.getSizeInBits() == 256) {
25613     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25614     if (R.getNode())
25615       return R;
25616   }
25617
25618   return SDValue();
25619 }
25620
25621 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25622                                  const X86Subtarget* Subtarget) {
25623   SDLoc dl(N);
25624   EVT VT = N->getValueType(0);
25625
25626   // Let legalize expand this if it isn't a legal type yet.
25627   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25628     return SDValue();
25629
25630   EVT ScalarVT = VT.getScalarType();
25631   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25632       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25633     return SDValue();
25634
25635   SDValue A = N->getOperand(0);
25636   SDValue B = N->getOperand(1);
25637   SDValue C = N->getOperand(2);
25638
25639   bool NegA = (A.getOpcode() == ISD::FNEG);
25640   bool NegB = (B.getOpcode() == ISD::FNEG);
25641   bool NegC = (C.getOpcode() == ISD::FNEG);
25642
25643   // Negative multiplication when NegA xor NegB
25644   bool NegMul = (NegA != NegB);
25645   if (NegA)
25646     A = A.getOperand(0);
25647   if (NegB)
25648     B = B.getOperand(0);
25649   if (NegC)
25650     C = C.getOperand(0);
25651
25652   unsigned Opcode;
25653   if (!NegMul)
25654     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25655   else
25656     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25657
25658   return DAG.getNode(Opcode, dl, VT, A, B, C);
25659 }
25660
25661 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25662                                   TargetLowering::DAGCombinerInfo &DCI,
25663                                   const X86Subtarget *Subtarget) {
25664   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25665   //           (and (i32 x86isd::setcc_carry), 1)
25666   // This eliminates the zext. This transformation is necessary because
25667   // ISD::SETCC is always legalized to i8.
25668   SDLoc dl(N);
25669   SDValue N0 = N->getOperand(0);
25670   EVT VT = N->getValueType(0);
25671
25672   if (N0.getOpcode() == ISD::AND &&
25673       N0.hasOneUse() &&
25674       N0.getOperand(0).hasOneUse()) {
25675     SDValue N00 = N0.getOperand(0);
25676     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25677       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25678       if (!C || C->getZExtValue() != 1)
25679         return SDValue();
25680       return DAG.getNode(ISD::AND, dl, VT,
25681                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25682                                      N00.getOperand(0), N00.getOperand(1)),
25683                          DAG.getConstant(1, VT));
25684     }
25685   }
25686
25687   if (N0.getOpcode() == ISD::TRUNCATE &&
25688       N0.hasOneUse() &&
25689       N0.getOperand(0).hasOneUse()) {
25690     SDValue N00 = N0.getOperand(0);
25691     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25692       return DAG.getNode(ISD::AND, dl, VT,
25693                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25694                                      N00.getOperand(0), N00.getOperand(1)),
25695                          DAG.getConstant(1, VT));
25696     }
25697   }
25698   if (VT.is256BitVector()) {
25699     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25700     if (R.getNode())
25701       return R;
25702   }
25703
25704   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25705   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25706   // This exposes the zext to the udivrem lowering, so that it directly extends
25707   // from AH (which we otherwise need to do contortions to access).
25708   if (N0.getOpcode() == ISD::UDIVREM &&
25709       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25710       (VT == MVT::i32 || VT == MVT::i64)) {
25711     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25712     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25713                             N0.getOperand(0), N0.getOperand(1));
25714     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25715     return R.getValue(1);
25716   }
25717
25718   return SDValue();
25719 }
25720
25721 // Optimize x == -y --> x+y == 0
25722 //          x != -y --> x+y != 0
25723 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25724                                       const X86Subtarget* Subtarget) {
25725   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25726   SDValue LHS = N->getOperand(0);
25727   SDValue RHS = N->getOperand(1);
25728   EVT VT = N->getValueType(0);
25729   SDLoc DL(N);
25730
25731   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25732     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25733       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25734         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25735                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25736         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25737                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25738       }
25739   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25740     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25741       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25742         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25743                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25744         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25745                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25746       }
25747
25748   if (VT.getScalarType() == MVT::i1) {
25749     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25750       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25751     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25752     if (!IsSEXT0 && !IsVZero0)
25753       return SDValue();
25754     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25755       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25756     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25757
25758     if (!IsSEXT1 && !IsVZero1)
25759       return SDValue();
25760
25761     if (IsSEXT0 && IsVZero1) {
25762       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25763       if (CC == ISD::SETEQ)
25764         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25765       return LHS.getOperand(0);
25766     }
25767     if (IsSEXT1 && IsVZero0) {
25768       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25769       if (CC == ISD::SETEQ)
25770         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25771       return RHS.getOperand(0);
25772     }
25773   }
25774
25775   return SDValue();
25776 }
25777
25778 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25779                                       const X86Subtarget *Subtarget) {
25780   SDLoc dl(N);
25781   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25782   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25783          "X86insertps is only defined for v4x32");
25784
25785   SDValue Ld = N->getOperand(1);
25786   if (MayFoldLoad(Ld)) {
25787     // Extract the countS bits from the immediate so we can get the proper
25788     // address when narrowing the vector load to a specific element.
25789     // When the second source op is a memory address, interps doesn't use
25790     // countS and just gets an f32 from that address.
25791     unsigned DestIndex =
25792         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25793     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25794   } else
25795     return SDValue();
25796
25797   // Create this as a scalar to vector to match the instruction pattern.
25798   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25799   // countS bits are ignored when loading from memory on insertps, which
25800   // means we don't need to explicitly set them to 0.
25801   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25802                      LoadScalarToVector, N->getOperand(2));
25803 }
25804
25805 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25806 // as "sbb reg,reg", since it can be extended without zext and produces
25807 // an all-ones bit which is more useful than 0/1 in some cases.
25808 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25809                                MVT VT) {
25810   if (VT == MVT::i8)
25811     return DAG.getNode(ISD::AND, DL, VT,
25812                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25813                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25814                        DAG.getConstant(1, VT));
25815   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25816   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25817                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25818                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25819 }
25820
25821 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25822 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25823                                    TargetLowering::DAGCombinerInfo &DCI,
25824                                    const X86Subtarget *Subtarget) {
25825   SDLoc DL(N);
25826   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25827   SDValue EFLAGS = N->getOperand(1);
25828
25829   if (CC == X86::COND_A) {
25830     // Try to convert COND_A into COND_B in an attempt to facilitate
25831     // materializing "setb reg".
25832     //
25833     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25834     // cannot take an immediate as its first operand.
25835     //
25836     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25837         EFLAGS.getValueType().isInteger() &&
25838         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25839       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25840                                    EFLAGS.getNode()->getVTList(),
25841                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25842       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25843       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25844     }
25845   }
25846
25847   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25848   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25849   // cases.
25850   if (CC == X86::COND_B)
25851     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25852
25853   SDValue Flags;
25854
25855   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25856   if (Flags.getNode()) {
25857     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25858     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25859   }
25860
25861   return SDValue();
25862 }
25863
25864 // Optimize branch condition evaluation.
25865 //
25866 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25867                                     TargetLowering::DAGCombinerInfo &DCI,
25868                                     const X86Subtarget *Subtarget) {
25869   SDLoc DL(N);
25870   SDValue Chain = N->getOperand(0);
25871   SDValue Dest = N->getOperand(1);
25872   SDValue EFLAGS = N->getOperand(3);
25873   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25874
25875   SDValue Flags;
25876
25877   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25878   if (Flags.getNode()) {
25879     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25880     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25881                        Flags);
25882   }
25883
25884   return SDValue();
25885 }
25886
25887 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25888                                                          SelectionDAG &DAG) {
25889   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25890   // optimize away operation when it's from a constant.
25891   //
25892   // The general transformation is:
25893   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25894   //       AND(VECTOR_CMP(x,y), constant2)
25895   //    constant2 = UNARYOP(constant)
25896
25897   // Early exit if this isn't a vector operation, the operand of the
25898   // unary operation isn't a bitwise AND, or if the sizes of the operations
25899   // aren't the same.
25900   EVT VT = N->getValueType(0);
25901   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25902       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25903       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25904     return SDValue();
25905
25906   // Now check that the other operand of the AND is a constant. We could
25907   // make the transformation for non-constant splats as well, but it's unclear
25908   // that would be a benefit as it would not eliminate any operations, just
25909   // perform one more step in scalar code before moving to the vector unit.
25910   if (BuildVectorSDNode *BV =
25911           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25912     // Bail out if the vector isn't a constant.
25913     if (!BV->isConstant())
25914       return SDValue();
25915
25916     // Everything checks out. Build up the new and improved node.
25917     SDLoc DL(N);
25918     EVT IntVT = BV->getValueType(0);
25919     // Create a new constant of the appropriate type for the transformed
25920     // DAG.
25921     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25922     // The AND node needs bitcasts to/from an integer vector type around it.
25923     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25924     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25925                                  N->getOperand(0)->getOperand(0), MaskConst);
25926     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25927     return Res;
25928   }
25929
25930   return SDValue();
25931 }
25932
25933 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25934                                         const X86Subtarget *Subtarget) {
25935   // First try to optimize away the conversion entirely when it's
25936   // conditionally from a constant. Vectors only.
25937   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25938   if (Res != SDValue())
25939     return Res;
25940
25941   // Now move on to more general possibilities.
25942   SDValue Op0 = N->getOperand(0);
25943   EVT InVT = Op0->getValueType(0);
25944
25945   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25946   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25947     SDLoc dl(N);
25948     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25949     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25950     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25951   }
25952
25953   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25954   // a 32-bit target where SSE doesn't support i64->FP operations.
25955   if (Op0.getOpcode() == ISD::LOAD) {
25956     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25957     EVT VT = Ld->getValueType(0);
25958     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25959         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25960         !Subtarget->is64Bit() && VT == MVT::i64) {
25961       SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
25962           SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
25963       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25964       return FILDChain;
25965     }
25966   }
25967   return SDValue();
25968 }
25969
25970 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25971 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25972                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25973   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25974   // the result is either zero or one (depending on the input carry bit).
25975   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25976   if (X86::isZeroNode(N->getOperand(0)) &&
25977       X86::isZeroNode(N->getOperand(1)) &&
25978       // We don't have a good way to replace an EFLAGS use, so only do this when
25979       // dead right now.
25980       SDValue(N, 1).use_empty()) {
25981     SDLoc DL(N);
25982     EVT VT = N->getValueType(0);
25983     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25984     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25985                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25986                                            DAG.getConstant(X86::COND_B,MVT::i8),
25987                                            N->getOperand(2)),
25988                                DAG.getConstant(1, VT));
25989     return DCI.CombineTo(N, Res1, CarryOut);
25990   }
25991
25992   return SDValue();
25993 }
25994
25995 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25996 //      (add Y, (setne X, 0)) -> sbb -1, Y
25997 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25998 //      (sub (setne X, 0), Y) -> adc -1, Y
25999 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
26000   SDLoc DL(N);
26001
26002   // Look through ZExts.
26003   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
26004   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
26005     return SDValue();
26006
26007   SDValue SetCC = Ext.getOperand(0);
26008   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
26009     return SDValue();
26010
26011   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
26012   if (CC != X86::COND_E && CC != X86::COND_NE)
26013     return SDValue();
26014
26015   SDValue Cmp = SetCC.getOperand(1);
26016   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
26017       !X86::isZeroNode(Cmp.getOperand(1)) ||
26018       !Cmp.getOperand(0).getValueType().isInteger())
26019     return SDValue();
26020
26021   SDValue CmpOp0 = Cmp.getOperand(0);
26022   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
26023                                DAG.getConstant(1, CmpOp0.getValueType()));
26024
26025   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
26026   if (CC == X86::COND_NE)
26027     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
26028                        DL, OtherVal.getValueType(), OtherVal,
26029                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
26030   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
26031                      DL, OtherVal.getValueType(), OtherVal,
26032                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
26033 }
26034
26035 /// PerformADDCombine - Do target-specific dag combines on integer adds.
26036 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
26037                                  const X86Subtarget *Subtarget) {
26038   EVT VT = N->getValueType(0);
26039   SDValue Op0 = N->getOperand(0);
26040   SDValue Op1 = N->getOperand(1);
26041
26042   // Try to synthesize horizontal adds from adds of shuffles.
26043   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26044        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26045       isHorizontalBinOp(Op0, Op1, true))
26046     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
26047
26048   return OptimizeConditionalInDecrement(N, DAG);
26049 }
26050
26051 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
26052                                  const X86Subtarget *Subtarget) {
26053   SDValue Op0 = N->getOperand(0);
26054   SDValue Op1 = N->getOperand(1);
26055
26056   // X86 can't encode an immediate LHS of a sub. See if we can push the
26057   // negation into a preceding instruction.
26058   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
26059     // If the RHS of the sub is a XOR with one use and a constant, invert the
26060     // immediate. Then add one to the LHS of the sub so we can turn
26061     // X-Y -> X+~Y+1, saving one register.
26062     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
26063         isa<ConstantSDNode>(Op1.getOperand(1))) {
26064       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
26065       EVT VT = Op0.getValueType();
26066       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
26067                                    Op1.getOperand(0),
26068                                    DAG.getConstant(~XorC, VT));
26069       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
26070                          DAG.getConstant(C->getAPIntValue()+1, VT));
26071     }
26072   }
26073
26074   // Try to synthesize horizontal adds from adds of shuffles.
26075   EVT VT = N->getValueType(0);
26076   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
26077        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
26078       isHorizontalBinOp(Op0, Op1, true))
26079     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
26080
26081   return OptimizeConditionalInDecrement(N, DAG);
26082 }
26083
26084 /// performVZEXTCombine - Performs build vector combines
26085 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
26086                                    TargetLowering::DAGCombinerInfo &DCI,
26087                                    const X86Subtarget *Subtarget) {
26088   SDLoc DL(N);
26089   MVT VT = N->getSimpleValueType(0);
26090   SDValue Op = N->getOperand(0);
26091   MVT OpVT = Op.getSimpleValueType();
26092   MVT OpEltVT = OpVT.getVectorElementType();
26093   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
26094
26095   // (vzext (bitcast (vzext (x)) -> (vzext x)
26096   SDValue V = Op;
26097   while (V.getOpcode() == ISD::BITCAST)
26098     V = V.getOperand(0);
26099
26100   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
26101     MVT InnerVT = V.getSimpleValueType();
26102     MVT InnerEltVT = InnerVT.getVectorElementType();
26103
26104     // If the element sizes match exactly, we can just do one larger vzext. This
26105     // is always an exact type match as vzext operates on integer types.
26106     if (OpEltVT == InnerEltVT) {
26107       assert(OpVT == InnerVT && "Types must match for vzext!");
26108       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
26109     }
26110
26111     // The only other way we can combine them is if only a single element of the
26112     // inner vzext is used in the input to the outer vzext.
26113     if (InnerEltVT.getSizeInBits() < InputBits)
26114       return SDValue();
26115
26116     // In this case, the inner vzext is completely dead because we're going to
26117     // only look at bits inside of the low element. Just do the outer vzext on
26118     // a bitcast of the input to the inner.
26119     return DAG.getNode(X86ISD::VZEXT, DL, VT,
26120                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
26121   }
26122
26123   // Check if we can bypass extracting and re-inserting an element of an input
26124   // vector. Essentialy:
26125   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
26126   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
26127       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26128       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
26129     SDValue ExtractedV = V.getOperand(0);
26130     SDValue OrigV = ExtractedV.getOperand(0);
26131     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
26132       if (ExtractIdx->getZExtValue() == 0) {
26133         MVT OrigVT = OrigV.getSimpleValueType();
26134         // Extract a subvector if necessary...
26135         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
26136           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
26137           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
26138                                     OrigVT.getVectorNumElements() / Ratio);
26139           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
26140                               DAG.getIntPtrConstant(0));
26141         }
26142         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
26143         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
26144       }
26145   }
26146
26147   return SDValue();
26148 }
26149
26150 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
26151                                              DAGCombinerInfo &DCI) const {
26152   SelectionDAG &DAG = DCI.DAG;
26153   switch (N->getOpcode()) {
26154   default: break;
26155   case ISD::EXTRACT_VECTOR_ELT:
26156     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
26157   case ISD::VSELECT:
26158   case ISD::SELECT:
26159   case X86ISD::SHRUNKBLEND:
26160     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
26161   case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
26162   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
26163   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
26164   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
26165   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
26166   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
26167   case ISD::SHL:
26168   case ISD::SRA:
26169   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
26170   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
26171   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
26172   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
26173   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
26174   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
26175   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
26176   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
26177   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
26178   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
26179   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
26180   case X86ISD::FXOR:
26181   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
26182   case X86ISD::FMIN:
26183   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
26184   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
26185   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
26186   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
26187   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
26188   case ISD::ANY_EXTEND:
26189   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
26190   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26191   case ISD::SIGN_EXTEND_INREG:
26192     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26193   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26194   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26195   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26196   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26197   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26198   case X86ISD::SHUFP:       // Handle all target specific shuffles
26199   case X86ISD::PALIGNR:
26200   case X86ISD::UNPCKH:
26201   case X86ISD::UNPCKL:
26202   case X86ISD::MOVHLPS:
26203   case X86ISD::MOVLHPS:
26204   case X86ISD::PSHUFB:
26205   case X86ISD::PSHUFD:
26206   case X86ISD::PSHUFHW:
26207   case X86ISD::PSHUFLW:
26208   case X86ISD::MOVSS:
26209   case X86ISD::MOVSD:
26210   case X86ISD::VPERMILPI:
26211   case X86ISD::VPERM2X128:
26212   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26213   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26214   case ISD::INTRINSIC_WO_CHAIN:
26215     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26216   case X86ISD::INSERTPS: {
26217     if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
26218       return PerformINSERTPSCombine(N, DAG, Subtarget);
26219     break;
26220   }
26221   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26222   }
26223
26224   return SDValue();
26225 }
26226
26227 /// isTypeDesirableForOp - Return true if the target has native support for
26228 /// the specified value type and it is 'desirable' to use the type for the
26229 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26230 /// instruction encodings are longer and some i16 instructions are slow.
26231 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26232   if (!isTypeLegal(VT))
26233     return false;
26234   if (VT != MVT::i16)
26235     return true;
26236
26237   switch (Opc) {
26238   default:
26239     return true;
26240   case ISD::LOAD:
26241   case ISD::SIGN_EXTEND:
26242   case ISD::ZERO_EXTEND:
26243   case ISD::ANY_EXTEND:
26244   case ISD::SHL:
26245   case ISD::SRL:
26246   case ISD::SUB:
26247   case ISD::ADD:
26248   case ISD::MUL:
26249   case ISD::AND:
26250   case ISD::OR:
26251   case ISD::XOR:
26252     return false;
26253   }
26254 }
26255
26256 /// IsDesirableToPromoteOp - This method query the target whether it is
26257 /// beneficial for dag combiner to promote the specified node. If true, it
26258 /// should return the desired promotion type by reference.
26259 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26260   EVT VT = Op.getValueType();
26261   if (VT != MVT::i16)
26262     return false;
26263
26264   bool Promote = false;
26265   bool Commute = false;
26266   switch (Op.getOpcode()) {
26267   default: break;
26268   case ISD::LOAD: {
26269     LoadSDNode *LD = cast<LoadSDNode>(Op);
26270     // If the non-extending load has a single use and it's not live out, then it
26271     // might be folded.
26272     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26273                                                      Op.hasOneUse()*/) {
26274       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26275              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26276         // The only case where we'd want to promote LOAD (rather then it being
26277         // promoted as an operand is when it's only use is liveout.
26278         if (UI->getOpcode() != ISD::CopyToReg)
26279           return false;
26280       }
26281     }
26282     Promote = true;
26283     break;
26284   }
26285   case ISD::SIGN_EXTEND:
26286   case ISD::ZERO_EXTEND:
26287   case ISD::ANY_EXTEND:
26288     Promote = true;
26289     break;
26290   case ISD::SHL:
26291   case ISD::SRL: {
26292     SDValue N0 = Op.getOperand(0);
26293     // Look out for (store (shl (load), x)).
26294     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26295       return false;
26296     Promote = true;
26297     break;
26298   }
26299   case ISD::ADD:
26300   case ISD::MUL:
26301   case ISD::AND:
26302   case ISD::OR:
26303   case ISD::XOR:
26304     Commute = true;
26305     // fallthrough
26306   case ISD::SUB: {
26307     SDValue N0 = Op.getOperand(0);
26308     SDValue N1 = Op.getOperand(1);
26309     if (!Commute && MayFoldLoad(N1))
26310       return false;
26311     // Avoid disabling potential load folding opportunities.
26312     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26313       return false;
26314     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26315       return false;
26316     Promote = true;
26317   }
26318   }
26319
26320   PVT = MVT::i32;
26321   return Promote;
26322 }
26323
26324 //===----------------------------------------------------------------------===//
26325 //                           X86 Inline Assembly Support
26326 //===----------------------------------------------------------------------===//
26327
26328 namespace {
26329   // Helper to match a string separated by whitespace.
26330   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26331     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26332
26333     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26334       StringRef piece(*args[i]);
26335       if (!s.startswith(piece)) // Check if the piece matches.
26336         return false;
26337
26338       s = s.substr(piece.size());
26339       StringRef::size_type pos = s.find_first_not_of(" \t");
26340       if (pos == 0) // We matched a prefix.
26341         return false;
26342
26343       s = s.substr(pos);
26344     }
26345
26346     return s.empty();
26347   }
26348   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26349 }
26350
26351 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26352
26353   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26354     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26355         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26356         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26357
26358       if (AsmPieces.size() == 3)
26359         return true;
26360       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26361         return true;
26362     }
26363   }
26364   return false;
26365 }
26366
26367 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26368   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26369
26370   std::string AsmStr = IA->getAsmString();
26371
26372   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26373   if (!Ty || Ty->getBitWidth() % 16 != 0)
26374     return false;
26375
26376   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26377   SmallVector<StringRef, 4> AsmPieces;
26378   SplitString(AsmStr, AsmPieces, ";\n");
26379
26380   switch (AsmPieces.size()) {
26381   default: return false;
26382   case 1:
26383     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26384     // we will turn this bswap into something that will be lowered to logical
26385     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26386     // lower so don't worry about this.
26387     // bswap $0
26388     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26389         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26390         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26391         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26392         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26393         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26394       // No need to check constraints, nothing other than the equivalent of
26395       // "=r,0" would be valid here.
26396       return IntrinsicLowering::LowerToByteSwap(CI);
26397     }
26398
26399     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26400     if (CI->getType()->isIntegerTy(16) &&
26401         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26402         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26403          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26404       AsmPieces.clear();
26405       const std::string &ConstraintsStr = IA->getConstraintString();
26406       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26407       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26408       if (clobbersFlagRegisters(AsmPieces))
26409         return IntrinsicLowering::LowerToByteSwap(CI);
26410     }
26411     break;
26412   case 3:
26413     if (CI->getType()->isIntegerTy(32) &&
26414         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26415         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26416         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26417         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26418       AsmPieces.clear();
26419       const std::string &ConstraintsStr = IA->getConstraintString();
26420       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26421       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26422       if (clobbersFlagRegisters(AsmPieces))
26423         return IntrinsicLowering::LowerToByteSwap(CI);
26424     }
26425
26426     if (CI->getType()->isIntegerTy(64)) {
26427       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26428       if (Constraints.size() >= 2 &&
26429           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26430           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26431         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26432         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26433             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26434             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26435           return IntrinsicLowering::LowerToByteSwap(CI);
26436       }
26437     }
26438     break;
26439   }
26440   return false;
26441 }
26442
26443 /// getConstraintType - Given a constraint letter, return the type of
26444 /// constraint it is for this target.
26445 X86TargetLowering::ConstraintType
26446 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26447   if (Constraint.size() == 1) {
26448     switch (Constraint[0]) {
26449     case 'R':
26450     case 'q':
26451     case 'Q':
26452     case 'f':
26453     case 't':
26454     case 'u':
26455     case 'y':
26456     case 'x':
26457     case 'Y':
26458     case 'l':
26459       return C_RegisterClass;
26460     case 'a':
26461     case 'b':
26462     case 'c':
26463     case 'd':
26464     case 'S':
26465     case 'D':
26466     case 'A':
26467       return C_Register;
26468     case 'I':
26469     case 'J':
26470     case 'K':
26471     case 'L':
26472     case 'M':
26473     case 'N':
26474     case 'G':
26475     case 'C':
26476     case 'e':
26477     case 'Z':
26478       return C_Other;
26479     default:
26480       break;
26481     }
26482   }
26483   return TargetLowering::getConstraintType(Constraint);
26484 }
26485
26486 /// Examine constraint type and operand type and determine a weight value.
26487 /// This object must already have been set up with the operand type
26488 /// and the current alternative constraint selected.
26489 TargetLowering::ConstraintWeight
26490   X86TargetLowering::getSingleConstraintMatchWeight(
26491     AsmOperandInfo &info, const char *constraint) const {
26492   ConstraintWeight weight = CW_Invalid;
26493   Value *CallOperandVal = info.CallOperandVal;
26494     // If we don't have a value, we can't do a match,
26495     // but allow it at the lowest weight.
26496   if (!CallOperandVal)
26497     return CW_Default;
26498   Type *type = CallOperandVal->getType();
26499   // Look at the constraint type.
26500   switch (*constraint) {
26501   default:
26502     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26503   case 'R':
26504   case 'q':
26505   case 'Q':
26506   case 'a':
26507   case 'b':
26508   case 'c':
26509   case 'd':
26510   case 'S':
26511   case 'D':
26512   case 'A':
26513     if (CallOperandVal->getType()->isIntegerTy())
26514       weight = CW_SpecificReg;
26515     break;
26516   case 'f':
26517   case 't':
26518   case 'u':
26519     if (type->isFloatingPointTy())
26520       weight = CW_SpecificReg;
26521     break;
26522   case 'y':
26523     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26524       weight = CW_SpecificReg;
26525     break;
26526   case 'x':
26527   case 'Y':
26528     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26529         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26530       weight = CW_Register;
26531     break;
26532   case 'I':
26533     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26534       if (C->getZExtValue() <= 31)
26535         weight = CW_Constant;
26536     }
26537     break;
26538   case 'J':
26539     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26540       if (C->getZExtValue() <= 63)
26541         weight = CW_Constant;
26542     }
26543     break;
26544   case 'K':
26545     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26546       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26547         weight = CW_Constant;
26548     }
26549     break;
26550   case 'L':
26551     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26552       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26553         weight = CW_Constant;
26554     }
26555     break;
26556   case 'M':
26557     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26558       if (C->getZExtValue() <= 3)
26559         weight = CW_Constant;
26560     }
26561     break;
26562   case 'N':
26563     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26564       if (C->getZExtValue() <= 0xff)
26565         weight = CW_Constant;
26566     }
26567     break;
26568   case 'G':
26569   case 'C':
26570     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26571       weight = CW_Constant;
26572     }
26573     break;
26574   case 'e':
26575     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26576       if ((C->getSExtValue() >= -0x80000000LL) &&
26577           (C->getSExtValue() <= 0x7fffffffLL))
26578         weight = CW_Constant;
26579     }
26580     break;
26581   case 'Z':
26582     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26583       if (C->getZExtValue() <= 0xffffffff)
26584         weight = CW_Constant;
26585     }
26586     break;
26587   }
26588   return weight;
26589 }
26590
26591 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26592 /// with another that has more specific requirements based on the type of the
26593 /// corresponding operand.
26594 const char *X86TargetLowering::
26595 LowerXConstraint(EVT ConstraintVT) const {
26596   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26597   // 'f' like normal targets.
26598   if (ConstraintVT.isFloatingPoint()) {
26599     if (Subtarget->hasSSE2())
26600       return "Y";
26601     if (Subtarget->hasSSE1())
26602       return "x";
26603   }
26604
26605   return TargetLowering::LowerXConstraint(ConstraintVT);
26606 }
26607
26608 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26609 /// vector.  If it is invalid, don't add anything to Ops.
26610 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26611                                                      std::string &Constraint,
26612                                                      std::vector<SDValue>&Ops,
26613                                                      SelectionDAG &DAG) const {
26614   SDValue Result;
26615
26616   // Only support length 1 constraints for now.
26617   if (Constraint.length() > 1) return;
26618
26619   char ConstraintLetter = Constraint[0];
26620   switch (ConstraintLetter) {
26621   default: break;
26622   case 'I':
26623     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26624       if (C->getZExtValue() <= 31) {
26625         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26626         break;
26627       }
26628     }
26629     return;
26630   case 'J':
26631     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26632       if (C->getZExtValue() <= 63) {
26633         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26634         break;
26635       }
26636     }
26637     return;
26638   case 'K':
26639     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26640       if (isInt<8>(C->getSExtValue())) {
26641         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26642         break;
26643       }
26644     }
26645     return;
26646   case 'L':
26647     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26648       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26649           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26650         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26651         break;
26652       }
26653     }
26654     return;
26655   case 'M':
26656     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26657       if (C->getZExtValue() <= 3) {
26658         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26659         break;
26660       }
26661     }
26662     return;
26663   case 'N':
26664     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26665       if (C->getZExtValue() <= 255) {
26666         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26667         break;
26668       }
26669     }
26670     return;
26671   case 'O':
26672     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26673       if (C->getZExtValue() <= 127) {
26674         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26675         break;
26676       }
26677     }
26678     return;
26679   case 'e': {
26680     // 32-bit signed value
26681     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26682       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26683                                            C->getSExtValue())) {
26684         // Widen to 64 bits here to get it sign extended.
26685         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26686         break;
26687       }
26688     // FIXME gcc accepts some relocatable values here too, but only in certain
26689     // memory models; it's complicated.
26690     }
26691     return;
26692   }
26693   case 'Z': {
26694     // 32-bit unsigned value
26695     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26696       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26697                                            C->getZExtValue())) {
26698         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26699         break;
26700       }
26701     }
26702     // FIXME gcc accepts some relocatable values here too, but only in certain
26703     // memory models; it's complicated.
26704     return;
26705   }
26706   case 'i': {
26707     // Literal immediates are always ok.
26708     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26709       // Widen to 64 bits here to get it sign extended.
26710       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26711       break;
26712     }
26713
26714     // In any sort of PIC mode addresses need to be computed at runtime by
26715     // adding in a register or some sort of table lookup.  These can't
26716     // be used as immediates.
26717     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26718       return;
26719
26720     // If we are in non-pic codegen mode, we allow the address of a global (with
26721     // an optional displacement) to be used with 'i'.
26722     GlobalAddressSDNode *GA = nullptr;
26723     int64_t Offset = 0;
26724
26725     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26726     while (1) {
26727       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26728         Offset += GA->getOffset();
26729         break;
26730       } else if (Op.getOpcode() == ISD::ADD) {
26731         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26732           Offset += C->getZExtValue();
26733           Op = Op.getOperand(0);
26734           continue;
26735         }
26736       } else if (Op.getOpcode() == ISD::SUB) {
26737         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26738           Offset += -C->getZExtValue();
26739           Op = Op.getOperand(0);
26740           continue;
26741         }
26742       }
26743
26744       // Otherwise, this isn't something we can handle, reject it.
26745       return;
26746     }
26747
26748     const GlobalValue *GV = GA->getGlobal();
26749     // If we require an extra load to get this address, as in PIC mode, we
26750     // can't accept it.
26751     if (isGlobalStubReference(
26752             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26753       return;
26754
26755     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26756                                         GA->getValueType(0), Offset);
26757     break;
26758   }
26759   }
26760
26761   if (Result.getNode()) {
26762     Ops.push_back(Result);
26763     return;
26764   }
26765   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26766 }
26767
26768 std::pair<unsigned, const TargetRegisterClass*>
26769 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26770                                                 MVT VT) const {
26771   // First, see if this is a constraint that directly corresponds to an LLVM
26772   // register class.
26773   if (Constraint.size() == 1) {
26774     // GCC Constraint Letters
26775     switch (Constraint[0]) {
26776     default: break;
26777       // TODO: Slight differences here in allocation order and leaving
26778       // RIP in the class. Do they matter any more here than they do
26779       // in the normal allocation?
26780     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26781       if (Subtarget->is64Bit()) {
26782         if (VT == MVT::i32 || VT == MVT::f32)
26783           return std::make_pair(0U, &X86::GR32RegClass);
26784         if (VT == MVT::i16)
26785           return std::make_pair(0U, &X86::GR16RegClass);
26786         if (VT == MVT::i8 || VT == MVT::i1)
26787           return std::make_pair(0U, &X86::GR8RegClass);
26788         if (VT == MVT::i64 || VT == MVT::f64)
26789           return std::make_pair(0U, &X86::GR64RegClass);
26790         break;
26791       }
26792       // 32-bit fallthrough
26793     case 'Q':   // Q_REGS
26794       if (VT == MVT::i32 || VT == MVT::f32)
26795         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26796       if (VT == MVT::i16)
26797         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26798       if (VT == MVT::i8 || VT == MVT::i1)
26799         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26800       if (VT == MVT::i64)
26801         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26802       break;
26803     case 'r':   // GENERAL_REGS
26804     case 'l':   // INDEX_REGS
26805       if (VT == MVT::i8 || VT == MVT::i1)
26806         return std::make_pair(0U, &X86::GR8RegClass);
26807       if (VT == MVT::i16)
26808         return std::make_pair(0U, &X86::GR16RegClass);
26809       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26810         return std::make_pair(0U, &X86::GR32RegClass);
26811       return std::make_pair(0U, &X86::GR64RegClass);
26812     case 'R':   // LEGACY_REGS
26813       if (VT == MVT::i8 || VT == MVT::i1)
26814         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26815       if (VT == MVT::i16)
26816         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26817       if (VT == MVT::i32 || !Subtarget->is64Bit())
26818         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26819       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26820     case 'f':  // FP Stack registers.
26821       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26822       // value to the correct fpstack register class.
26823       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26824         return std::make_pair(0U, &X86::RFP32RegClass);
26825       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26826         return std::make_pair(0U, &X86::RFP64RegClass);
26827       return std::make_pair(0U, &X86::RFP80RegClass);
26828     case 'y':   // MMX_REGS if MMX allowed.
26829       if (!Subtarget->hasMMX()) break;
26830       return std::make_pair(0U, &X86::VR64RegClass);
26831     case 'Y':   // SSE_REGS if SSE2 allowed
26832       if (!Subtarget->hasSSE2()) break;
26833       // FALL THROUGH.
26834     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26835       if (!Subtarget->hasSSE1()) break;
26836
26837       switch (VT.SimpleTy) {
26838       default: break;
26839       // Scalar SSE types.
26840       case MVT::f32:
26841       case MVT::i32:
26842         return std::make_pair(0U, &X86::FR32RegClass);
26843       case MVT::f64:
26844       case MVT::i64:
26845         return std::make_pair(0U, &X86::FR64RegClass);
26846       // Vector types.
26847       case MVT::v16i8:
26848       case MVT::v8i16:
26849       case MVT::v4i32:
26850       case MVT::v2i64:
26851       case MVT::v4f32:
26852       case MVT::v2f64:
26853         return std::make_pair(0U, &X86::VR128RegClass);
26854       // AVX types.
26855       case MVT::v32i8:
26856       case MVT::v16i16:
26857       case MVT::v8i32:
26858       case MVT::v4i64:
26859       case MVT::v8f32:
26860       case MVT::v4f64:
26861         return std::make_pair(0U, &X86::VR256RegClass);
26862       case MVT::v8f64:
26863       case MVT::v16f32:
26864       case MVT::v16i32:
26865       case MVT::v8i64:
26866         return std::make_pair(0U, &X86::VR512RegClass);
26867       }
26868       break;
26869     }
26870   }
26871
26872   // Use the default implementation in TargetLowering to convert the register
26873   // constraint into a member of a register class.
26874   std::pair<unsigned, const TargetRegisterClass*> Res;
26875   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26876
26877   // Not found as a standard register?
26878   if (!Res.second) {
26879     // Map st(0) -> st(7) -> ST0
26880     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26881         tolower(Constraint[1]) == 's' &&
26882         tolower(Constraint[2]) == 't' &&
26883         Constraint[3] == '(' &&
26884         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26885         Constraint[5] == ')' &&
26886         Constraint[6] == '}') {
26887
26888       Res.first = X86::FP0+Constraint[4]-'0';
26889       Res.second = &X86::RFP80RegClass;
26890       return Res;
26891     }
26892
26893     // GCC allows "st(0)" to be called just plain "st".
26894     if (StringRef("{st}").equals_lower(Constraint)) {
26895       Res.first = X86::FP0;
26896       Res.second = &X86::RFP80RegClass;
26897       return Res;
26898     }
26899
26900     // flags -> EFLAGS
26901     if (StringRef("{flags}").equals_lower(Constraint)) {
26902       Res.first = X86::EFLAGS;
26903       Res.second = &X86::CCRRegClass;
26904       return Res;
26905     }
26906
26907     // 'A' means EAX + EDX.
26908     if (Constraint == "A") {
26909       Res.first = X86::EAX;
26910       Res.second = &X86::GR32_ADRegClass;
26911       return Res;
26912     }
26913     return Res;
26914   }
26915
26916   // Otherwise, check to see if this is a register class of the wrong value
26917   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26918   // turn into {ax},{dx}.
26919   if (Res.second->hasType(VT))
26920     return Res;   // Correct type already, nothing to do.
26921
26922   // All of the single-register GCC register classes map their values onto
26923   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26924   // really want an 8-bit or 32-bit register, map to the appropriate register
26925   // class and return the appropriate register.
26926   if (Res.second == &X86::GR16RegClass) {
26927     if (VT == MVT::i8 || VT == MVT::i1) {
26928       unsigned DestReg = 0;
26929       switch (Res.first) {
26930       default: break;
26931       case X86::AX: DestReg = X86::AL; break;
26932       case X86::DX: DestReg = X86::DL; break;
26933       case X86::CX: DestReg = X86::CL; break;
26934       case X86::BX: DestReg = X86::BL; break;
26935       }
26936       if (DestReg) {
26937         Res.first = DestReg;
26938         Res.second = &X86::GR8RegClass;
26939       }
26940     } else if (VT == MVT::i32 || VT == MVT::f32) {
26941       unsigned DestReg = 0;
26942       switch (Res.first) {
26943       default: break;
26944       case X86::AX: DestReg = X86::EAX; break;
26945       case X86::DX: DestReg = X86::EDX; break;
26946       case X86::CX: DestReg = X86::ECX; break;
26947       case X86::BX: DestReg = X86::EBX; break;
26948       case X86::SI: DestReg = X86::ESI; break;
26949       case X86::DI: DestReg = X86::EDI; break;
26950       case X86::BP: DestReg = X86::EBP; break;
26951       case X86::SP: DestReg = X86::ESP; break;
26952       }
26953       if (DestReg) {
26954         Res.first = DestReg;
26955         Res.second = &X86::GR32RegClass;
26956       }
26957     } else if (VT == MVT::i64 || VT == MVT::f64) {
26958       unsigned DestReg = 0;
26959       switch (Res.first) {
26960       default: break;
26961       case X86::AX: DestReg = X86::RAX; break;
26962       case X86::DX: DestReg = X86::RDX; break;
26963       case X86::CX: DestReg = X86::RCX; break;
26964       case X86::BX: DestReg = X86::RBX; break;
26965       case X86::SI: DestReg = X86::RSI; break;
26966       case X86::DI: DestReg = X86::RDI; break;
26967       case X86::BP: DestReg = X86::RBP; break;
26968       case X86::SP: DestReg = X86::RSP; break;
26969       }
26970       if (DestReg) {
26971         Res.first = DestReg;
26972         Res.second = &X86::GR64RegClass;
26973       }
26974     }
26975   } else if (Res.second == &X86::FR32RegClass ||
26976              Res.second == &X86::FR64RegClass ||
26977              Res.second == &X86::VR128RegClass ||
26978              Res.second == &X86::VR256RegClass ||
26979              Res.second == &X86::FR32XRegClass ||
26980              Res.second == &X86::FR64XRegClass ||
26981              Res.second == &X86::VR128XRegClass ||
26982              Res.second == &X86::VR256XRegClass ||
26983              Res.second == &X86::VR512RegClass) {
26984     // Handle references to XMM physical registers that got mapped into the
26985     // wrong class.  This can happen with constraints like {xmm0} where the
26986     // target independent register mapper will just pick the first match it can
26987     // find, ignoring the required type.
26988
26989     if (VT == MVT::f32 || VT == MVT::i32)
26990       Res.second = &X86::FR32RegClass;
26991     else if (VT == MVT::f64 || VT == MVT::i64)
26992       Res.second = &X86::FR64RegClass;
26993     else if (X86::VR128RegClass.hasType(VT))
26994       Res.second = &X86::VR128RegClass;
26995     else if (X86::VR256RegClass.hasType(VT))
26996       Res.second = &X86::VR256RegClass;
26997     else if (X86::VR512RegClass.hasType(VT))
26998       Res.second = &X86::VR512RegClass;
26999   }
27000
27001   return Res;
27002 }
27003
27004 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
27005                                             Type *Ty) const {
27006   // Scaling factors are not free at all.
27007   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
27008   // will take 2 allocations in the out of order engine instead of 1
27009   // for plain addressing mode, i.e. inst (reg1).
27010   // E.g.,
27011   // vaddps (%rsi,%drx), %ymm0, %ymm1
27012   // Requires two allocations (one for the load, one for the computation)
27013   // whereas:
27014   // vaddps (%rsi), %ymm0, %ymm1
27015   // Requires just 1 allocation, i.e., freeing allocations for other operations
27016   // and having less micro operations to execute.
27017   //
27018   // For some X86 architectures, this is even worse because for instance for
27019   // stores, the complex addressing mode forces the instruction to use the
27020   // "load" ports instead of the dedicated "store" port.
27021   // E.g., on Haswell:
27022   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
27023   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
27024   if (isLegalAddressingMode(AM, Ty))
27025     // Scale represents reg2 * scale, thus account for 1
27026     // as soon as we use a second register.
27027     return AM.Scale != 0;
27028   return -1;
27029 }
27030
27031 bool X86TargetLowering::isTargetFTOL() const {
27032   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
27033 }